Diffstat (limited to 'tools/llvm-mca')
-rw-r--r-- | tools/llvm-mca/DispatchStage.cpp   |  12
-rw-r--r-- | tools/llvm-mca/DispatchStage.h     |   2
-rw-r--r-- | tools/llvm-mca/InstrBuilder.cpp    |   4
-rw-r--r-- | tools/llvm-mca/Instruction.h       |  13
-rw-r--r-- | tools/llvm-mca/README.txt          | 865
-rw-r--r-- | tools/llvm-mca/RetireControlUnit.h |   2
-rw-r--r-- | tools/llvm-mca/RetireStage.cpp     |   8
-rw-r--r-- | tools/llvm-mca/llvm-mca.cpp        |   2
8 files changed, 30 insertions, 878 deletions
diff --git a/tools/llvm-mca/DispatchStage.cpp b/tools/llvm-mca/DispatchStage.cpp
index be6f1f89be5c..1f508886c298 100644
--- a/tools/llvm-mca/DispatchStage.cpp
+++ b/tools/llvm-mca/DispatchStage.cpp
@@ -107,17 +107,21 @@ void DispatchStage::dispatch(InstRef IR) {
   // instruction. A dependency-breaking instruction is a zero-latency
   // instruction that doesn't consume hardware resources.
   // An example of dependency-breaking instruction on X86 is a zero-idiom XOR.
-  if (!Desc.isZeroLatency())
-    for (std::unique_ptr<ReadState> &RS : IS.getUses())
+  bool IsDependencyBreaking = IS.isDependencyBreaking();
+  for (std::unique_ptr<ReadState> &RS : IS.getUses())
+    if (RS->isImplicitRead() || !IsDependencyBreaking)
       updateRAWDependencies(*RS, STI);
 
   // By default, a dependency-breaking zero-latency instruction is expected to
   // be optimized at register renaming stage. That means, no physical register
   // is allocated to the instruction.
+  bool ShouldAllocateRegisters =
+      !(Desc.isZeroLatency() && IsDependencyBreaking);
   SmallVector<unsigned, 4> RegisterFiles(PRF.getNumRegisterFiles());
-  for (std::unique_ptr<WriteState> &WS : IS.getDefs())
+  for (std::unique_ptr<WriteState> &WS : IS.getDefs()) {
     PRF.addRegisterWrite(WriteRef(IR.first, WS.get()), RegisterFiles,
-                         !Desc.isZeroLatency());
+                         ShouldAllocateRegisters);
+  }
 
   // Reserve slots in the RCU, and notify the instruction that it has been
   // dispatched to the schedulers for execution.
diff --git a/tools/llvm-mca/DispatchStage.h b/tools/llvm-mca/DispatchStage.h
index f21789a29c50..4262a241c08c 100644
--- a/tools/llvm-mca/DispatchStage.h
+++ b/tools/llvm-mca/DispatchStage.h
@@ -38,7 +38,7 @@ class Scheduler;
 // the following conditions are met:
 //  1) There are enough entries in the reorder buffer (see class
 //     RetireControlUnit) to write the opcodes associated with the instruction.
-//  2) There are enough temporaries to rename output register operands.
+//  2) There are enough physical registers to rename output register operands.
 //  3) There are enough entries available in the used buffered resource(s).
 //
 // The number of micro opcodes that can be dispatched in one cycle is limited by
diff --git a/tools/llvm-mca/InstrBuilder.cpp b/tools/llvm-mca/InstrBuilder.cpp
index dbd457196f9d..053b7b4e8175 100644
--- a/tools/llvm-mca/InstrBuilder.cpp
+++ b/tools/llvm-mca/InstrBuilder.cpp
@@ -443,6 +443,10 @@ InstrBuilder::createInstruction(const MCInst &MCI) {
   // register writes implicitly clear the upper portion of a super-register.
   MCIA.clearsSuperRegisters(MRI, MCI, WriteMask);
 
+  // Check if this is a dependency breaking instruction.
+  if (MCIA.isDependencyBreaking(STI, MCI))
+    NewIS->setDependencyBreaking();
+
   // Initialize writes.
   unsigned WriteIndex = 0;
   for (const WriteDescriptor &WD : D.Writes) {
diff --git a/tools/llvm-mca/Instruction.h b/tools/llvm-mca/Instruction.h
index ddf5c3a5e33f..3b2f90528f2e 100644
--- a/tools/llvm-mca/Instruction.h
+++ b/tools/llvm-mca/Instruction.h
@@ -170,8 +170,6 @@ class ReadState {
   bool IsReady;
 
 public:
-  bool isReady() const { return IsReady; }
-
   ReadState(const ReadDescriptor &Desc, unsigned RegID)
       : RD(Desc), RegisterID(RegID), DependentWrites(0),
         CyclesLeft(UNKNOWN_CYCLES), TotalCycles(0), IsReady(true) {}
@@ -182,6 +180,9 @@ public:
   unsigned getSchedClass() const { return RD.SchedClassID; }
   unsigned getRegisterID() const { return RegisterID; }
 
+  bool isReady() const { return IsReady; }
+  bool isImplicitRead() const { return RD.isImplicitRead(); }
+
   void cycleEvent();
   void writeStartEvent(unsigned Cycles);
   void setDependentWrites(unsigned Writes) {
@@ -299,6 +300,8 @@ class Instruction {
   // Retire Unit token ID for this instruction.
   unsigned RCUTokenID;
 
+  bool IsDepBreaking;
+
   using UniqueDef = std::unique_ptr<WriteState>;
   using UniqueUse = std::unique_ptr<ReadState>;
   using VecDefs = std::vector<UniqueDef>;
@@ -314,7 +317,8 @@ class Instruction {
 
 public:
   Instruction(const InstrDesc &D)
-      : Desc(D), Stage(IS_INVALID), CyclesLeft(UNKNOWN_CYCLES) {}
+      : Desc(D), Stage(IS_INVALID), CyclesLeft(UNKNOWN_CYCLES), RCUTokenID(0),
+        IsDepBreaking(false) {}
   Instruction(const Instruction &Other) = delete;
   Instruction &operator=(const Instruction &Other) = delete;
 
@@ -326,6 +330,9 @@ public:
   unsigned getRCUTokenID() const { return RCUTokenID; }
   int getCyclesLeft() const { return CyclesLeft; }
 
+  bool isDependencyBreaking() const { return IsDepBreaking; }
+  void setDependencyBreaking() { IsDepBreaking = true; }
+
   unsigned getNumUsers() const {
     unsigned NumUsers = 0;
     for (const UniqueDef &Def : Defs)
diff --git a/tools/llvm-mca/README.txt b/tools/llvm-mca/README.txt
deleted file mode 100644
index 8b1670db0fca..000000000000
--- a/tools/llvm-mca/README.txt
+++ /dev/null
@@ -1,865 +0,0 @@
-llvm-mca - LLVM Machine Code Analyzer
--------------------------------------
-
-llvm-mca is a performance analysis tool that uses information which is already
-available in LLVM (e.g., scheduling models) to statically measure the
-performance of machine code in a specific cpu.
-
-Performance is measured in terms of throughput as well as processor resource
-consumption. The tool currently works for processors with an out-of-order
-backend, for which there is a scheduling model available in LLVM.
-
-The main goal of this tool is not just to predict the performance of the code
-when run on the target, but also help with diagnosing potential performance
-issues.
-
-Given an assembly code sequence, llvm-mca estimates the IPC (instructions Per
-cycle), as well as hardware resources pressure. The analysis and reporting style
-were inspired by the IACA tool from Intel.
-
-The presence of long data dependency chains, as well as poor usage of hardware
-resources may lead to bottlenecks in the backend. The tool is able to generate
-a detailed report which should help with identifying and analyzing sources of
-bottlenecks.
-
-Scheduling models are mostly used to compute instruction latencies, to obtain
-read-advance information, and understand how processor resources are used by
-instructions. By design, the quality of the performance analysis conducted by
-the tool is inevitably affected by the quality of the target scheduling models.
-However, scheduling models intentionally do not describe all processor details, -since the goal is just to enable the scheduling of machine instructions during -compilation. That means, there are processor details which are not important for -the purpose of scheduling instructions (and therefore not described by the -scheduling model), but are very important for this tool. - -A few examples of details that are missing in scheduling models are: - - Actual dispatch width (it often differs from the issue width). - - Number of read/write ports in the register file(s). - - Length of the load/store queue in the LSUnit. - -It is also very difficult to find a "good" abstract model to describe the -behavior of out-of-order processors. So, we have to keep in mind that all of -these aspects are going to affect the quality of the static analysis performed -by the tool. - -An extensive list of known limitations is reported in one of the last sections -of this document. There is also a section related to design problems which must -be addressed (hopefully with the help of the community). At the moment, the -tool has been mostly tested for x86 targets, but there are still several -limitations, some of which could be overcome by integrating extra information -into the scheduling models. - -How the tool works ------------------- - -The tool takes assembly code as input. Assembly code is parsed into a sequence -of MCInst with the help of the existing LLVM target assembly parsers. The parsed -sequence of MCInst is then analyzed by a 'Pipeline' module to generate a -performance report. - -The Pipeline module internally emulates the execution of the machine code -sequence in a loop of iterations (which by default is 100). At the end of this -process, the pipeline collects a number of statistics which are then printed out -in the form of a report. - -Here is an example of performance report generated by the tool for a dot-product -of two packed float vectors of four elements. The analysis is conducted for -target x86, cpu btver2: - -/////////////////// - -Iterations: 300 -Instructions: 900 -Total Cycles: 610 -Dispatch Width: 2 -IPC: 1.48 - - -Resources: -[0] - JALU0 -[1] - JALU1 -[2] - JDiv -[3] - JFPM -[4] - JFPU0 -[5] - JFPU1 -[6] - JLAGU -[7] - JSAGU -[8] - JSTC -[9] - JVIMUL - - -Resource pressure per iteration: -[0] [1] [2] [3] [4] [5] [6] [7] [8] [9] - - - - - 2.00 1.00 - - - - - -Resource pressure by instruction: -[0] [1] [2] [3] [4] [5] [6] [7] [8] [9] Instructions: - - - - - - 1.00 - - - - vmulps %xmm0, %xmm1, %xmm2 - - - - - 1.00 - - - - - vhaddps %xmm2, %xmm2, %xmm3 - - - - - 1.00 - - - - - vhaddps %xmm3, %xmm3, %xmm4 - - -Instruction Info: -[1]: #uOps -[2]: Latency -[3]: RThroughput -[4]: MayLoad -[5]: MayStore -[6]: HasSideEffects - -[1] [2] [3] [4] [5] [6] Instructions: - 1 2 1.00 vmulps %xmm0, %xmm1, %xmm2 - 1 3 1.00 vhaddps %xmm2, %xmm2, %xmm3 - 1 3 1.00 vhaddps %xmm3, %xmm3, %xmm4 - -/////////////////// - -According to this report, the dot-product kernel has been executed 300 times, -for a total of 900 instructions dynamically executed. - -The report is structured in three main sections. A first section collects a few -performance numbers; the goal of this section is to give a very quick overview -of the performance throughput. In this example, the two important performance -indicators are a) the predicted total number of cycles, and b) the IPC. -IPC is probably the most important throughput indicator. 
A big delta between the -Dispatch Width and the computed IPC is an indicator of potential performance -issues. - -The second section is the so-called "resource pressure view". This view reports -the average number of resource cycles consumed every iteration by instructions -for every processor resource unit available on the target. Information is -structured in two tables. The first table reports the number of resource cycles -spent on average every iteration. The second table correlates the resource -cycles to the machine instruction in the sequence. For example, every iteration -of the dot-product, instruction 'vmulps' always executes on resource unit [5] -(JFPU1 - floating point pipeline #1), consuming an average of 1 resource cycle -per iteration. Note that on Jaguar, vector FP multiply can only be issued to -pipeline JFPU1, while horizontal FP adds can only be issued to pipeline JFPU0. - -The third (and last) section of the report shows the latency and reciprocal -throughput of every instruction in the sequence. That section also reports extra -information related to the number of micro opcodes, and opcode properties (i.e., -'MayLoad', 'MayStore', and 'UnmodeledSideEffects'). - -The resource pressure view helps with identifying bottlenecks caused by high -usage of specific hardware resources. Situations with resource pressure mainly -concentrated on a few resources should, in general, be avoided. Ideally, -pressure should be uniformly distributed between multiple resources. - -Timeline View -------------- - -A detailed report of each instruction's state transitions over time can be -enabled using the command line flag '-timeline'. This prints an extra section -in the report which contains the so-called "timeline view". Below is the -timeline view for the dot-product example from the previous section. - -/////////////// -Timeline view: - 012345 -Index 0123456789 - -[0,0] DeeER. . . vmulps %xmm0, %xmm1, %xmm2 -[0,1] D==eeeER . . vhaddps %xmm2, %xmm2, %xmm3 -[0,2] .D====eeeER . vhaddps %xmm3, %xmm3, %xmm4 - -[1,0] .DeeE-----R . vmulps %xmm0, %xmm1, %xmm2 -[1,1] . D=eeeE---R . vhaddps %xmm2, %xmm2, %xmm3 -[1,2] . D====eeeER . vhaddps %xmm3, %xmm3, %xmm4 - -[2,0] . DeeE-----R . vmulps %xmm0, %xmm1, %xmm2 -[2,1] . D====eeeER . vhaddps %xmm2, %xmm2, %xmm3 -[2,2] . D======eeeER vhaddps %xmm3, %xmm3, %xmm4 - - -Average Wait times (based on the timeline view): -[0]: Executions -[1]: Average time spent waiting in a scheduler's queue -[2]: Average time spent waiting in a scheduler's queue while ready -[3]: Average time elapsed from WB until retire stage - - [0] [1] [2] [3] -0. 3 1.0 1.0 3.3 vmulps %xmm0, %xmm1, %xmm2 -1. 3 3.3 0.7 1.0 vhaddps %xmm2, %xmm2, %xmm3 -2. 3 5.7 0.0 0.0 vhaddps %xmm3, %xmm3, %xmm4 -/////////////// - -The timeline view is very interesting because it shows how instructions changed -in state during execution. It also gives an idea of how the tool "sees" -instructions executed on the target. - -The timeline view is structured in two tables. The first table shows how -instructions change in state over time (measured in cycles); the second table -(named "Average Wait times") reports useful timing statistics which should help -diagnose performance bottlenecks caused by long data dependencies and -sub-optimal usage of hardware resources. - -An instruction in the timeline view is identified by a pair of indices, where -the 'first' index identifies an iteration, and the 'second' index is the actual -instruction index (i.e., where it appears in the code sequence). 
- -Excluding the first and last column, the remaining columns are in cycles. -Cycles are numbered sequentially starting from 0. The following characters are -used to describe the state of an instruction: - - D : Instruction dispatched. - e : Instruction executing. - E : Instruction executed. - R : Instruction retired. - = : Instruction already dispatched, waiting to be executed. - - : Instruction executed, waiting to be retired. - -Based on the timeline view from the example, we know that: - - Instruction [1, 0] was dispatched at cycle 1. - - Instruction [1, 0] started executing at cycle 2. - - Instruction [1, 0] reached the write back stage at cycle 4. - - Instruction [1, 0] was retired at cycle 10. - -Instruction [1, 0] (i.e., the vmulps from iteration #1) doesn't have to wait in -the scheduler's queue for the operands to become available. By the time the -vmulps is dispatched, operands are already available, and pipeline JFPU1 is -ready to serve another instruction. So the instruction can be immediately -issued on the JFPU1 pipeline. That is demonstrated by the fact that the -instruction only spent 1cy in the scheduler's queue. - -There is a gap of 5 cycles between the write-back stage and the retire event. -That is because instructions must retire in program order, so [1,0] has to wait -for [0, 2] to be retired first (i.e., it has to wait until cycle 10). - -In the dot-product example, all instructions are in a RAW (Read After Write) -dependency chain. Register %xmm2 written by the vmulps is immediately used by -the first vhaddps, and register %xmm3 written by the first vhaddps is used by -the second vhaddps. Long data dependencies negatively affect the ILP -(Instruction Level Parallelism). - -In the dot-product example, there are anti-dependencies introduced by -instructions from different iterations. However, those dependencies can be -removed at register renaming stage (at the cost of allocating register aliases, -and therefore consuming temporary registers). - -Table "Average Wait times" helps diagnose performance issues that are caused by -the presence of long latency instructions and potentially long data dependencies -which may limit the ILP. Note that the tool by default assumes at least 1cy -between the dispatch event and the issue event. - -When the performance is limited by data dependencies and/or long latency -instructions, the number of cycles spent while in the "ready" state is expected -to be very small when compared with the total number of cycles spent in the -scheduler's queue. So the difference between the two counters is a good -indicator of how big of an impact data dependencies had on the execution of -instructions. When performance is mostly limited by the lack of hardware -resources, the delta between the two counters is small. However, the number of -cycles spent in the queue tends to be bigger (i.e., more than 1-3cy) especially -when compared with other low latency instructions. - -Extra statistics to further diagnose performance issues. --------------------------------------------------------- - -Flag '-verbose' enables extra statistics and performance counters for the -dispatch logic, the reorder buffer, the retire control unit and the register -file. - -Below is an example of verbose output generated by the tool for the dot-product -example discussed in the previous sections. 
- -/////////////////// -Iterations: 300 -Instructions: 900 -Total Cycles: 610 -Dispatch Width: 2 -IPC: 1.48 - - -Dynamic Dispatch Stall Cycles: -RAT - Register unavailable: 0 -RCU - Retire tokens unavailable: 0 -SCHEDQ - Scheduler full: 272 -LQ - Load queue full: 0 -SQ - Store queue full: 0 -GROUP - Static restrictions on the dispatch group: 0 - - -Register Alias Table: -Total number of mappings created: 900 -Max number of mappings used: 35 - - -Dispatch Logic - number of cycles where we saw N instructions dispatched: -[# dispatched], [# cycles] - 0, 24 (3.9%) - 1, 272 (44.6%) - 2, 314 (51.5%) - - -Schedulers - number of cycles where we saw N instructions issued: -[# issued], [# cycles] - 0, 7 (1.1%) - 1, 306 (50.2%) - 2, 297 (48.7%) - - -Retire Control Unit - number of cycles where we saw N instructions retired: -[# retired], [# cycles] - 0, 109 (17.9%) - 1, 102 (16.7%) - 2, 399 (65.4%) - - -Scheduler's queue usage: -JALU01, 0/20 -JFPU01, 18/18 -JLSAGU, 0/12 -/////////////////// - -Based on the verbose report, the pipeline was only able to dispatch two -instructions 51.5% of the time. The dispatch group was limited to one -instruction 44.6% of the cycles, which corresponds to 272 cycles. - -If we look at section "Dynamic Dispatch Stall Cycles", we can see how counter -SCHEDQ reports 272 cycles. Counter SCHEDQ is incremented every time the -dispatch logic is unable to dispatch a full group of two instructions because -the scheduler's queue is full. - -Section "Scheduler's queue usage" shows how the maximum number of buffer entries -(i.e., scheduler's queue entries) used at runtime for resource JFPU01 reached -its maximum. Note that AMD Jaguar implements three schedulers: - * JALU01 - A scheduler for ALU instructions - * JLSAGU - A scheduler for address generation - * JFPU01 - A scheduler floating point operations. - -The dot-product is a kernel of three floating point instructions (a vector -multiply followed by two horizontal adds). That explains why only the floating -point scheduler appears to be used according to section "Scheduler's queue -usage". - -A full scheduler's queue is either caused by data dependency chains, or by a -sub-optimal usage of hardware resources. Sometimes, resource pressure can be -mitigated by rewriting the kernel using different instructions that consume -different scheduler resources. Schedulers with a small queue are less resilient -to bottlenecks caused by the presence of long data dependencies. - -In this example, we can conclude that the IPC is mostly limited by data -dependencies, and not by resource pressure. - -LLVM-MCA instruction flow -------------------------- - -This section describes the instruction flow through the out-of-order backend, -as well as the functional units involved in the process. - -An instruction goes through a default sequence of stages: - - Dispatch (Instruction is dispatched to the schedulers). - - Issue (Instruction is issued to the processor pipelines). - - Write Back (Instruction is executed, and results are written back). - - Retire (Instruction is retired; writes are architecturally committed). - -The tool only models the out-of-order portion of a processor. Therefore, the -instruction fetch and decode stages are not modeled. Performance bottlenecks in -the frontend are not diagnosed by this tool. The tool assumes that instructions -have all been decoded and placed in a queue. Also, the tool doesn't know -anything about branch prediction. 
- -The long term plan is to make the process customizable, so that processors can -define their own. This is a future work. - -Instruction Dispatch --------------------- - -During the Dispatch stage, instructions are picked in program order from a queue -of already decoded instructions, and dispatched in groups to the hardware -schedulers. The dispatch logic is implemented by class DispatchStage in file -DispatchStage.h. - -The size of a dispatch group depends on the availability of hardware resources, -and it cannot exceed the value of field 'DispatchWidth' in class DispatchStage. -Note that field DispatchWidth defaults to the value of field 'IssueWidth' from -the scheduling model. - -Users can override the DispatchWidth value with flag "-dispatch=<N>" (where 'N' -is an unsigned quantity). - -An instruction can be dispatched if: - - The size of the dispatch group is smaller than DispatchWidth - - There are enough entries in the reorder buffer - - There are enough temporary registers to do register renaming - - Schedulers are not full. - -Since r329067, scheduling models can now optionally specify which register -files are available on the processor. Class DispatchStage(see DispatchStage.h) -would use that information to initialize register file descriptors. - -By default, if the model doesn't describe register files, the tool -(optimistically) assumes a single register file with an unbounded number of -temporary registers. Users can limit the number of temporary registers that -are globally available for register renaming using flag -`-register-file-size=<N>`, where N is the number of temporaries. A value of -zero for N means 'unbounded'. Knowing how many temporaries are available for -register renaming, the tool can predict dispatch stalls caused by the lack of -temporaries. - -The number of reorder buffer entries consumed by an instruction depends on the -number of micro-opcodes it specifies in the target scheduling model (see field -'NumMicroOpcodes' of TableGen class ProcWriteResources and its derived classes; -TargetSchedule.td). - -The reorder buffer is implemented by class RetireControlUnit (see -DispatchStage.h). Its goal is to track the progress of instructions that are -"in-flight", and retire instructions in program order. The number of entries -in the reorder buffer defaults to the value of field 'MicroOpBufferSize' from -the target scheduling model. - -Instructions that are dispatched to the schedulers consume scheduler buffer -entries. The tool queries the scheduling model to figure out the set of -buffered resources consumed by an instruction. Buffered resources are treated -like "scheduler" resources, and the field 'BufferSize' (from the processor -resource TableGen definition) defines the size of the scheduler's queue. - -Zero latency instructions (for example NOP instructions) don't consume scheduler -resources. However, those instructions still reserve a number of slots in the -reorder buffer. - -Instruction Issue ------------------ - -As mentioned in the previous section, each scheduler resource implements a queue -of instructions. An instruction has to wait in the scheduler's queue until -input register operands become available. Only at that point, does the -instruction becomes eligible for execution and may be issued (potentially -out-of-order) to a pipeline for execution. - -Instruction latencies can be computed by the tool with the help of the -scheduling model; latency values are defined by the scheduling model through -ProcWriteResources objects. 
- -Class Scheduler (see file Scheduler.h) knows how to emulate multiple processor -schedulers. A Scheduler is responsible for tracking data dependencies, and -dynamically select which processor resources are consumed/used by instructions. - -Internally, the Scheduler class delegates the management of processor resource -units and resource groups to the ResourceManager class. ResourceManager is also -responsible for selecting resource units that are effectively consumed by -instructions. For example, if an instruction consumes 1cy of a resource group, -the ResourceManager object selects one of the available units from the group; by -default, it uses a round-robin selector to guarantee that resource usage is -uniformly distributed between all units of a group. - -Internally, class Scheduler implements three instruction queues: - - WaitQueue: a queue of instructions whose operands are not ready yet. - - ReadyQueue: a queue of instructions ready to execute. - - IssuedQueue: a queue of instructions executing. - -Depending on the operands availability, instructions that are dispatched to the -Scheduler are either placed into the WaitQueue or into the ReadyQueue. - -Every cycle, class Scheduler checks if instructions can be moved from the -WaitQueue to the ReadyQueue, and if instructions from the ReadyQueue can be -issued to the underlying pipelines. The algorithm prioritizes older -instructions over younger instructions. - -Objects of class ResourceState (see Scheduler.h) describe processor resources. -There is an instance of class ResourceState for each single processor resource -specified by the scheduling model. A ResourceState object for a processor -resource with multiple units dynamically tracks the availability of every single -unit. For example, the ResourceState of a resource group tracks the -availability of every resource in that group. Internally, ResourceState -implements a round-robin selector to dynamically pick the next unit to use from -the group. - -Write-Back and Retire Stage ---------------------------- - -Issued instructions are moved from the ReadyQueue to the IssuedQueue. There, -instructions wait until they reach the write-back stage. At that point, they -get removed from the queue and the retire control unit is notified. - -On the event of "instruction executed", the retire control unit flags the -instruction as "ready to retire". - -Instruction are retired in program order; an "instruction retired" event is sent -to the register file which frees the temporary registers allocated for the -instruction at register renaming stage. - -Load/Store Unit and Memory Consistency Model --------------------------------------------- - -The tool attempts to emulate out-of-order execution of memory operations. Class -LSUnit (see file LSUnit.h) emulates a load/store unit implementing queues for -speculative execution of loads and stores. - -Each load (or store) consumes an entry in the load (or store) queue. The number -of slots in the load/store queues is unknown by the tool, since there is no -mention of it in the scheduling model. In practice, users can specify flag -`-lqueue=N` (vic. `-squeue=N`) to limit the number of entries in the queue to be -equal to exactly N (an unsigned value). If N is zero, then the tool assumes an -unbounded queue (this is the default). - -LSUnit implements a relaxed consistency model for memory loads and stores. The -rules are: -1) A younger load is allowed to pass an older load only if there is no - intervening store in between the two loads. 
-2) An younger store is not allowed to pass an older store. -3) A younger store is not allowed to pass an older load. -4) A younger load is allowed to pass an older store provided that the load does - not alias with the store. - -By default, this class conservatively (i.e., pessimistically) assumes that loads -always may-alias store operations. Essentially, this LSUnit doesn't perform -any sort of alias analysis to rule out cases where loads and stores don't -overlap with each other. The downside of this approach however is that younger -loads are never allowed to pass older stores. To make it possible for a -younger load to pass an older store, users can use the command line flag --noalias. Under 'noalias', a younger load is always allowed to pass an older -store. - -Note that, in the case of write-combining memory, rule 2. could be relaxed a bit -to allow reordering of non-aliasing store operations. That being said, at the -moment, there is no way to further relax the memory model (flag -noalias is the -only option). Essentially, there is no option to specify a different memory -type (for example: write-back, write-combining, write-through; etc.) and -consequently to weaken or strengthen the memory model. - -Other limitations are: - * LSUnit doesn't know when store-to-load forwarding may occur. - * LSUnit doesn't know anything about the cache hierarchy and memory types. - * LSUnit doesn't know how to identify serializing operations and memory fences. - -No assumption is made on the store buffer size. As mentioned before, LSUnit -conservatively assumes a may-alias relation between loads and stores, and it -doesn't attempt to identify cases where store-to-load forwarding would occur in -practice. - -LSUnit doesn't attempt to predict whether a load or store hits or misses the L1 -cache. It only knows if an instruction "MayLoad" and/or "MayStore". For loads, -the scheduling model provides an "optimistic" load-to-use latency (which usually -matches the load-to-use latency for when there is a hit in the L1D). - -Class MCInstrDesc in LLVM doesn't know about serializing operations, nor -memory-barrier like instructions. LSUnit conservatively assumes that an -instruction which has both 'MayLoad' and 'UnmodeledSideEffects' behaves like a -"soft" load-barrier. That means, it serializes loads without forcing a flush of -the load queue. Similarly, instructions flagged with both 'MayStore' and -'UnmodeledSideEffects' are treated like store barriers. A full memory barrier -is a 'MayLoad' and 'MayStore' instruction with 'UnmodeledSideEffects'. This is -inaccurate, but it is the best that we can do at the moment with the current -information available in LLVM. - -A load/store barrier consumes one entry of the load/store queue. A load/store -barrier enforces ordering of loads/stores. A younger load cannot pass a load -barrier. Also, a younger store cannot pass a store barrier. A younger load has -to wait for the memory/load barrier to execute. A load/store barrier is -"executed" when it becomes the oldest entry in the load/store queue(s). That -also means, by construction, all the older loads/stores have been executed. - -In conclusion the full set of rules is: - 1. A store may not pass a previous store. - 2. A load may not pass a previous store unless flag 'NoAlias' is set. - 3. A load may pass a previous load. - 4. A store may not pass a previous load (regardless of flag 'NoAlias'). - 5. A load has to wait until an older load barrier is fully executed. - 6. 
A store has to wait until an older store barrier is fully executed. - -Known limitations ------------------ -Previous sections described cases where the tool is missing information to give -an accurate report. For example, the first sections of this document explained -how the lack of knowledge about the processor negatively affects the performance -analysis. The lack of knowledge is often a consequence of how scheduling models -are defined; as mentioned before, scheduling models intentionally don't describe -processors in fine details. That being said, the LLVM machine model can be -extended to expose more details, as long as they are opt-in for targets. - -The accuracy of the performance analysis is also affected by assumptions made by -the processor model used by the tool. - -Most recent Intel and AMD processors implement dedicated LoopBuffer/OpCache in -the hardware frontend to speedup the throughput in the presence of tight loops. -The presence of these buffers complicates the decoding logic, and requires -knowledge on the branch predictor too. Class 'SchedMachineModel' in TableGen -provides a field named 'LoopMicroOpBufferSize' which is used to describe loop -buffers. However, the purpose of that field is to enable loop unrolling of -tight loops; essentially, it affects the cost model used by pass loop-unroll. - -At the current state, the tool only describes the out-of-order portion of a -processor, and consequently doesn't try to predict the frontend throughput. That -being said, this tool could be definitely extended in future to also account for -the hardware frontend when doing performance analysis. This would inevitably -require extra (extensive) processor knowledge related to all the available -decoding paths in the hardware frontend, as well as branch prediction. - -Currently, the tool assumes a zero-latency "perfect" fetch&decode -stage; the full sequence of decoded instructions is immediately visible to the -dispatch logic from the start. - -The tool doesn't know about simultaneous mutithreading. According to the tool, -processor resources are not statically/dynamically partitioned. Processor -resources are fully available to the hardware thread executing the -microbenchmark. - -The execution model implemented by this tool assumes that instructions are -firstly dispatched in groups to hardware schedulers, and then issued to -pipelines for execution. The model assumes dynamic scheduling of instructions. -Instructions are placed in a queue and potentially executed out-of-order (based -on the operand availability). The dispatch stage is definitely distinct from the -issue stage. This will change in future; as mentioned in the first section, the -end goal is to let processors customize the process. - -This model doesn't correctly describe processors where the dispatch/issue is a -single stage. This is what happens for example in VLIW processors, where -instructions are packaged and statically scheduled at compile time; it is up to -the compiler to predict the latency of instructions and package issue groups -accordingly. For such targets, there is no dynamic scheduling done by the -hardware. - -Existing classes (DispatchStage, Scheduler, etc.) could be extended/adapted to -support processors with a single dispatch/issue stage. The execution flow would -require some changes in the way how existing components (i.e., DispatchStage, -Scheduler, etc.) interact. This can be a future development. - -The following sections describes other known limitations. 
The goal is not to -provide an extensive list of limitations; we want to report what we believe are -the most important limitations, and suggest possible methods to overcome them. - -Load/Store barrier instructions and serializing operations ----------------------------------------------------------- -Section "Load/Store Unit and Memory Consistency Model" already mentioned how -LLVM doesn't know about serializing operations and memory barriers. Most of it -boils down to the fact that class MCInstrDesc (intentionally) doesn't expose -those properties. Instead, both serializing operations and memory barriers -"have side-effects" according to MCInstrDesc. That is because, at least for -scheduling purposes, knowing that an instruction has unmodeled side effects is -often enough to treat the instruction like a compiler scheduling barrier. - -A performance analysis tool could use the extra knowledge on barriers and -serializing operations to generate a more accurate performance report. One way -to improve this is by reserving a couple of bits in field 'Flags' from class -MCInstrDesc: one bit for barrier operations, and another bit to mark -instructions as serializing operations. - -Lack of support for instruction itineraries -------------------------------------------- -The current version of the tool doesn't know how to process instruction -itineraries. This is probably one of the most important limitations, since it -affects a few out-of-order processors in LLVM. - -As mentioned in section 'Instruction Issue', class Scheduler delegates to an -instance of class ResourceManager the handling of processor resources. -ResourceManager is where most of the scheduling logic is implemented. - -Adding support for instruction itineraries requires that we teach -ResourceManager how to handle functional units and instruction stages. This -development can be a future extension, and it would probably require a few -changes to the ResourceManager interface. - -Instructions that affect control flow are not correctly modeled ---------------------------------------------------------------- -Examples of instructions that affect the control flow are: return, indirect -branches, calls, etc. The tool doesn't try to predict/evaluate branch targets. -In particular, the tool doesn't model any sort of branch prediction, nor does it -attempt to track changes to the program counter. The tool always assumes that -the input assembly sequence is the body of a microbenchmark (a simple loop -executed for a number of iterations). The "next" instruction in sequence is -always the next instruction to dispatch. - -Call instructions default to an arbitrary high latency of 100cy. A warning is -generated if the tool encounters a call instruction in the sequence. Return -instructions are not evaluated, and therefore control flow is not affected. -However, the tool still queries the processor scheduling model to obtain latency -information for instructions that affect the control flow. - -Known limitations on X86 processors ------------------------------------ - -1) Partial register updates versus full register updates. - -On x86-64, a 32-bit GPR write fully updates the super-register. Example: - add %edi %eax ## eax += edi - -Here, register %eax aliases the lower half of 64-bit register %rax. On x86-64, -register %rax is fully updated by the 'add' (the upper half of %rax is zeroed). -Essentially, it "kills" any previous definition of (the upper half of) register -%rax. 
- -On the other hand, 8/16 bit register writes only perform a so-called "partial -register update". Example: - add %di, %ax ## ax += di - -Here, register %eax is only partially updated. To be more specific, the lower -half of %eax is set, and the upper half is left unchanged. There is also no -change in the upper 48 bits of register %rax. - -To get accurate performance analysis, the tool has to know which instructions -perform a partial register update, and which instructions fully update the -destination's super-register. - -One way to expose this information is (again) via TableGen. For example, we -could add a flag in the TableGen instruction class to tag instructions that -perform partial register updates. Something like this: 'bit -hasPartialRegisterUpdate = 1'. However, this would force a `let -hasPartialRegisterUpdate = 0` on several instruction definitions. - -Another approach is to have a MCSubtargetInfo hook similar to this: - virtual bool updatesSuperRegisters(unsigned short opcode) { return false; } - -Targets will be able to override this method if needed. Again, this is just an -idea. But the plan is to have this fixed as a future development. - -2) Macro Op fusion. - -The tool doesn't know about macro-op fusion. On modern x86 processors, a -'cmp/test' followed by a 'jmp' is fused into a single macro operation. The -advantage is that the fused pair only consumes a single slot in the dispatch -group. - -As a future development, the tool should be extended to address macro-fusion. -Ideally, we could have LLVM generate a table enumerating all the opcode pairs -that can be fused together. That table could be exposed to the tool via the -MCSubtargetInfo interface. This is just an idea; there may be better ways to -implement this. - -3) Intel processors: mixing legacy SSE with AVX instructions. - -On modern Intel processors with AVX, mixing legacy SSE code with AVX code -negatively impacts the performance. The tool is not aware of this issue, and -the performance penalty is not accounted when doing the analysis. This is -something that we would like to improve in future. - -4) Zero-latency register moves and Zero-idioms. - -Most modern AMD/Intel processors know how to optimize out register-register -moves and zero idioms at register renaming stage. The tool doesn't know -about these patterns, and this may negatively impact the performance analysis. - -Known design problems ---------------------- -This section describes two design issues that are currently affecting the tool. -The long term plan is to "fix" these issues. -Both limitations would be easily fixed if we teach the tool how to directly -manipulate MachineInstr objects (instead of MCInst objects). - -1) Variant instructions not correctly modeled. - -The tool doesn't know how to analyze instructions with a "variant" scheduling -class descriptor. A variant scheduling class needs to be resolved dynamically. -The "actual" scheduling class often depends on the subtarget, as well as -properties of the specific MachineInstr object. - -Unfortunately, the tool manipulates MCInst, and it doesn't know anything about -MachineInstr. As a consequence, the tool cannot use the existing machine -subtarget hooks that are normally used to resolve the variant scheduling class. -This is a major design issue which mostly affects ARM/AArch64 targets. It -mostly boils down to the fact that the existing scheduling framework was meant -to work for MachineInstr. - -When the tool encounters a "variant" instruction, it assumes a generic 1cy -latency. 
However, the tool would not be able to tell which processor resources -are effectively consumed by the variant instruction. - -2) MCInst and MCInstrDesc. - -Performance analysis tools require data dependency information to correctly -predict the runtime performance of the code. This tool must always be able to -obtain the set of implicit/explicit register defs/uses for every instruction of -the input assembly sequence. - -In the first section of this document, it was mentioned how the tool takes as -input an assembly sequence. That sequence is parsed into a MCInst sequence with -the help of assembly parsers available from the targets. - -A MCInst is a very low-level instruction representation. The tool can inspect -the MCOperand sequence of an MCInst to identify register operands. However, -there is no way to tell register operands that are definitions from register -operands that are uses. - -In LLVM, class MCInstrDesc is used to fully describe target instructions and -their operands. The opcode of a machine instruction (a MachineInstr object) can -be used to query the instruction set through method `MCInstrInfo::get' to obtain -the associated MCInstrDesc object. - -However class MCInstrDesc describes properties and operands of MachineInstr -objects. Essentially, MCInstrDesc is not meant to be used to describe MCInst -objects. To be more specific, MCInstrDesc objects are automatically generated -via TableGen from the instruction set description in the target .td files. For -example, field `MCInstrDesc::NumDefs' is always equal to the cardinality of the -`(outs)` set from the TableGen instruction definition. - -By construction, register definitions always appear at the beginning of the -MachineOperands list in MachineInstr. Basically, the (outs) are the first -operands of a MachineInstr, and the (ins) will come after in the machine operand -list. Knowing the number of register definitions is enough to identify -all the register operands that are definitions. - -In a normal compilation process, MCInst objects are generated from MachineInstr -objects through a lowering step. By default the lowering logic simply iterates -over the machine operands of a MachineInstr, and converts/expands them into -equivalent MCOperand objects. - -The default lowering strategy has the advantage of preserving all of the above -mentioned assumptions on the machine operand sequence. That means, register -definitions would still be at the beginning of the MCOperand sequence, and -register uses would come after. - -Targets may still define custom lowering routines for specific opcodes. Some of -these routines may lower operands in a way that potentially breaks (some of) the -assumptions on the machine operand sequence which were valid for MachineInstr. -Luckily, this is not the most common form of lowering done by the targets, and -the vast majority of the MachineInstr are lowered based on the default strategy -which preserves the original machine operand sequence. This is especially true -for x86, where the custom lowering logic always preserves the original (i.e., -from the MachineInstr) operand sequence. - -This tool currently works under the strong (and potentially incorrect) -assumption that register def/uses in a MCInst can always be identified by -querying the machine instruction descriptor for the opcode. This assumption made -it possible to develop this tool and get good numbers at least for the -processors available in the x86 backend. 
-
-That being said, the analysis is still potentially incorrect for other targets.
-So we plan (with the help of the community) to find a proper mechanism to map
-when possible MCOperand indices back to MachineOperand indices of the equivalent
-MachineInstr. This would be equivalent to describing changes made by the
-lowering step which affected the operand sequence. For example, we could have an
-index for every register MCOperand (or -1, if the operand didn't exist in the
-original MachineInstr). The mapping could look like this <0,1,3,2>. Here,
-MCOperand #2 was obtained from the lowering of MachineOperand #3. etc.
-
-This information could be automatically generated via TableGen for all the
-instructions whose custom lowering step breaks assumptions made by the tool on
-the register operand sequence (In general, these instructions should be the
-minority of a target's instruction set). Unfortunately, we don't have that
-information now. As a consequence, we assume that the number of explicit
-register definitions is the same number specified in MCInstrDesc. We also
-assume that register definitions always come first in the operand sequence.
-
-In conclusion: these are for now the strong assumptions made by the tool:
- * The number of explicit and implicit register definitions in a MCInst
-   matches the number of explicit and implicit definitions specified by the
-   MCInstrDesc object.
- * Register uses always come after register definitions.
- * If an opcode specifies an optional definition, then the optional
-   definition is always the last register operand in the sequence.
-
-Note that some of the information accessible from the MCInstrDesc is always
-valid for MCInst. For example: implicit register defs, implicit register uses
-and 'MayLoad/MayStore/HasUnmodeledSideEffects' opcode properties still apply to
-MCInst. The tool knows about this, and uses that information during its
-analysis.
-
-Future work
------------
- * Address limitations (described in section "Known limitations").
- * Let processors specify the selection strategy for processor resource groups
-   and resources with multiple units. The tool currently uses a round-robin
-   selector to pick the next resource to use.
- * Address limitations specifically described in section "Known limitations on
-   X86 processors".
- * Address design issues identified in section "Known design problems".
- * Define a standard interface for "Views". This would let users customize the
-   performance report generated by the tool.
-
-When interfaces are mature/stable:
- * Move the logic into a library. This will enable a number of other
-   interesting use cases.
-
-Work is currently tracked on https://bugs.llvm.org. llvm-mca bugs are tagged
-with prefix [llvm-mca]. You can easily find the full list of open bugs if you
-search for that tag.
diff --git a/tools/llvm-mca/RetireControlUnit.h b/tools/llvm-mca/RetireControlUnit.h
index 3530ff21ba0d..8acc8bcc98fe 100644
--- a/tools/llvm-mca/RetireControlUnit.h
+++ b/tools/llvm-mca/RetireControlUnit.h
@@ -31,7 +31,7 @@ namespace mca {
 /// this RetireControlUnit (RCU) gets notified.
 ///
 /// On instruction retired, register updates are all architecturally
-/// committed, and any temporary registers originally allocated for the
+/// committed, and any physical registers previously allocated for the
 /// retired instruction are freed.
 struct RetireControlUnit : public HardwareUnit {
   // A RUToken is created by the RCU for every instruction dispatched to the
diff --git a/tools/llvm-mca/RetireStage.cpp b/tools/llvm-mca/RetireStage.cpp
index 386ec54d7ba3..55c3b887e478 100644
--- a/tools/llvm-mca/RetireStage.cpp
+++ b/tools/llvm-mca/RetireStage.cpp
@@ -45,10 +45,12 @@ void RetireStage::cycleStart() {
 
 void RetireStage::notifyInstructionRetired(const InstRef &IR) {
   LLVM_DEBUG(dbgs() << "[E] Instruction Retired: #" << IR << '\n');
   SmallVector<unsigned, 4> FreedRegs(PRF.getNumRegisterFiles());
-  const InstrDesc &Desc = IR.getInstruction()->getDesc();
+  const Instruction &Inst = *IR.getInstruction();
+  const InstrDesc &Desc = Inst.getDesc();
 
-  for (const std::unique_ptr<WriteState> &WS : IR.getInstruction()->getDefs())
-    PRF.removeRegisterWrite(*WS.get(), FreedRegs, !Desc.isZeroLatency());
+  bool ShouldFreeRegs = !(Desc.isZeroLatency() && Inst.isDependencyBreaking());
+  for (const std::unique_ptr<WriteState> &WS : Inst.getDefs())
+    PRF.removeRegisterWrite(*WS.get(), FreedRegs, ShouldFreeRegs);
   notifyEvent<HWInstructionEvent>(HWInstructionRetiredEvent(IR, FreedRegs));
 }
diff --git a/tools/llvm-mca/llvm-mca.cpp b/tools/llvm-mca/llvm-mca.cpp
index 2d292f375e6e..897ff232a36d 100644
--- a/tools/llvm-mca/llvm-mca.cpp
+++ b/tools/llvm-mca/llvm-mca.cpp
@@ -96,7 +96,7 @@ static cl::opt<unsigned>
 
 static cl::opt<unsigned>
     RegisterFileSize("register-file-size",
-                     cl::desc("Maximum number of temporary registers which can "
+                     cl::desc("Maximum number of physical registers which can "
                               "be used for register mappings"),
                      cl::cat(ToolOptions), cl::init(0));
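
For readers who want the gist of the patch without tracing the hunks above, the following is a minimal, self-contained C++ sketch of the policy it introduces. It is not taken from the llvm-mca sources; the Fake* types and the dispatch() function are hypothetical stand-ins. The idea shown: a dependency-breaking instruction skips RAW-dependency updates for its explicit reads (implicit reads still count), and if it is also zero-latency it is assumed to be optimized away at register renaming, so no physical registers are allocated for its writes.

#include <iostream>
#include <vector>

// Hypothetical, simplified stand-ins for the llvm-mca classes touched by this
// patch. They model only the two flags that the dispatch logic now consults.
struct FakeRead {
  bool IsImplicit; // implicit reads still create RAW dependencies
};

struct FakeInstr {
  bool IsZeroLatency;         // from the scheduling model (InstrDesc::isZeroLatency)
  bool IsDepBreaking;         // set when the target reports the opcode as dependency breaking
  std::vector<FakeRead> Uses; // explicit/implicit register reads
};

// Mirrors the new DispatchStage behavior: dependency-breaking instructions skip
// RAW updates for explicit reads, and zero-latency dependency-breaking
// instructions do not allocate physical registers for their writes.
void dispatch(const FakeInstr &I) {
  for (const FakeRead &R : I.Uses)
    if (R.IsImplicit || !I.IsDepBreaking)
      std::cout << "  update RAW dependency\n";

  bool ShouldAllocateRegisters = !(I.IsZeroLatency && I.IsDepBreaking);
  std::cout << (ShouldAllocateRegisters
                    ? "  allocate physical registers for writes\n"
                    : "  optimized at rename stage, no allocation\n");
}

int main() {
  // A zero-idiom such as `xorps %xmm0, %xmm0`: zero latency and dependency breaking.
  FakeInstr ZeroIdiom{true, true, {{false}}};
  // An ordinary instruction with one explicit read.
  FakeInstr Regular{false, false, {{false}}};

  std::cout << "zero-idiom:\n";
  dispatch(ZeroIdiom);
  std::cout << "regular:\n";
  dispatch(Regular);
  return 0;
}

The same ShouldFreeRegs condition appears symmetrically in RetireStage.cpp above: registers that were never allocated at dispatch are not freed at retirement.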