Diffstat (limited to 'tools/llvm-mca')
-rw-r--r-- | tools/llvm-mca/DispatchStage.cpp   |  12
-rw-r--r-- | tools/llvm-mca/DispatchStage.h     |   2
-rw-r--r-- | tools/llvm-mca/InstrBuilder.cpp    |   4
-rw-r--r-- | tools/llvm-mca/Instruction.h       |  13
-rw-r--r-- | tools/llvm-mca/README.txt          | 865
-rw-r--r-- | tools/llvm-mca/RetireControlUnit.h |   2
-rw-r--r-- | tools/llvm-mca/RetireStage.cpp     |   8
-rw-r--r-- | tools/llvm-mca/llvm-mca.cpp        |   2
8 files changed, 30 insertions, 878 deletions
diff --git a/tools/llvm-mca/DispatchStage.cpp b/tools/llvm-mca/DispatchStage.cpp
index be6f1f89be5c..1f508886c298 100644
--- a/tools/llvm-mca/DispatchStage.cpp
+++ b/tools/llvm-mca/DispatchStage.cpp
@@ -107,17 +107,21 @@ void DispatchStage::dispatch(InstRef IR) {
   // instruction. A dependency-breaking instruction is a zero-latency
   // instruction that doesn't consume hardware resources.
   // An example of dependency-breaking instruction on X86 is a zero-idiom XOR.
-  if (!Desc.isZeroLatency())
-    for (std::unique_ptr<ReadState> &RS : IS.getUses())
+  bool IsDependencyBreaking = IS.isDependencyBreaking();
+  for (std::unique_ptr<ReadState> &RS : IS.getUses())
+    if (RS->isImplicitRead() || !IsDependencyBreaking)
       updateRAWDependencies(*RS, STI);
 
   // By default, a dependency-breaking zero-latency instruction is expected to
   // be optimized at register renaming stage. That means, no physical register
   // is allocated to the instruction.
+  bool ShouldAllocateRegisters =
+      !(Desc.isZeroLatency() && IsDependencyBreaking);
   SmallVector<unsigned, 4> RegisterFiles(PRF.getNumRegisterFiles());
-  for (std::unique_ptr<WriteState> &WS : IS.getDefs())
+  for (std::unique_ptr<WriteState> &WS : IS.getDefs()) {
     PRF.addRegisterWrite(WriteRef(IR.first, WS.get()), RegisterFiles,
-                         !Desc.isZeroLatency());
+                         ShouldAllocateRegisters);
+  }
 
   // Reserve slots in the RCU, and notify the instruction that it has been
   // dispatched to the schedulers for execution.
diff --git a/tools/llvm-mca/DispatchStage.h b/tools/llvm-mca/DispatchStage.h
index f21789a29c50..4262a241c08c 100644
--- a/tools/llvm-mca/DispatchStage.h
+++ b/tools/llvm-mca/DispatchStage.h
@@ -38,7 +38,7 @@ class Scheduler;
 // the following conditions are met:
 //  1) There are enough entries in the reorder buffer (see class
 //     RetireControlUnit) to write the opcodes associated with the instruction.
-//  2) There are enough temporaries to rename output register operands.
+//  2) There are enough physical registers to rename output register operands.
 //  3) There are enough entries available in the used buffered resource(s).
 //
 // The number of micro opcodes that can be dispatched in one cycle is limited by
diff --git a/tools/llvm-mca/InstrBuilder.cpp b/tools/llvm-mca/InstrBuilder.cpp
index dbd457196f9d..053b7b4e8175 100644
--- a/tools/llvm-mca/InstrBuilder.cpp
+++ b/tools/llvm-mca/InstrBuilder.cpp
@@ -443,6 +443,10 @@ InstrBuilder::createInstruction(const MCInst &MCI) {
   // register writes implicitly clear the upper portion of a super-register.
   MCIA.clearsSuperRegisters(MRI, MCI, WriteMask);
 
+  // Check if this is a dependency breaking instruction.
+  if (MCIA.isDependencyBreaking(STI, MCI))
+    NewIS->setDependencyBreaking();
+
   // Initialize writes.
   unsigned WriteIndex = 0;
   for (const WriteDescriptor &WD : D.Writes) {
diff --git a/tools/llvm-mca/Instruction.h b/tools/llvm-mca/Instruction.h
index ddf5c3a5e33f..3b2f90528f2e 100644
--- a/tools/llvm-mca/Instruction.h
+++ b/tools/llvm-mca/Instruction.h
@@ -170,8 +170,6 @@ class ReadState {
   bool IsReady;
 
 public:
-  bool isReady() const { return IsReady; }
-
   ReadState(const ReadDescriptor &Desc, unsigned RegID)
       : RD(Desc), RegisterID(RegID), DependentWrites(0),
         CyclesLeft(UNKNOWN_CYCLES), TotalCycles(0), IsReady(true) {}
@@ -182,6 +180,9 @@ public:
   unsigned getSchedClass() const { return RD.SchedClassID; }
   unsigned getRegisterID() const { return RegisterID; }
 
+  bool isReady() const { return IsReady; }
+  bool isImplicitRead() const { return RD.isImplicitRead(); }
+
   void cycleEvent();
   void writeStartEvent(unsigned Cycles);
   void setDependentWrites(unsigned Writes) {
@@ -299,6 +300,8 @@ class Instruction {
   // Retire Unit token ID for this instruction.
   unsigned RCUTokenID;
 
+  bool IsDepBreaking;
+
   using UniqueDef = std::unique_ptr<WriteState>;
   using UniqueUse = std::unique_ptr<ReadState>;
   using VecDefs = std::vector<UniqueDef>;
@@ -314,7 +317,8 @@ class Instruction {
 
 public:
   Instruction(const InstrDesc &D)
-      : Desc(D), Stage(IS_INVALID), CyclesLeft(UNKNOWN_CYCLES) {}
+      : Desc(D), Stage(IS_INVALID), CyclesLeft(UNKNOWN_CYCLES), RCUTokenID(0),
+        IsDepBreaking(false) {}
   Instruction(const Instruction &Other) = delete;
   Instruction &operator=(const Instruction &Other) = delete;
 
@@ -326,6 +330,9 @@ public:
   unsigned getRCUTokenID() const { return RCUTokenID; }
   int getCyclesLeft() const { return CyclesLeft; }
 
+  bool isDependencyBreaking() const { return IsDepBreaking; }
+  void setDependencyBreaking() { IsDepBreaking = true; }
+
   unsigned getNumUsers() const {
     unsigned NumUsers = 0;
     for (const UniqueDef &Def : Defs)
diff --git a/tools/llvm-mca/README.txt b/tools/llvm-mca/README.txt
deleted file mode 100644
index 8b1670db0fca..000000000000
--- a/tools/llvm-mca/README.txt
+++ /dev/null
@@ -1,865 +0,0 @@
-llvm-mca - LLVM Machine Code Analyzer
--------------------------------------
-
-llvm-mca is a performance analysis tool that uses information which is already
-available in LLVM (e.g., scheduling models) to statically measure the
-performance of machine code in a specific cpu.
-
-Performance is measured in terms of throughput as well as processor resource
-consumption. The tool currently works for processors with an out-of-order
-backend, for which there is a scheduling model available in LLVM.
-
-The main goal of this tool is not just to predict the performance of the code
-when run on the target, but also help with diagnosing potential performance
-issues.
-
-Given an assembly code sequence, llvm-mca estimates the IPC (instructions Per
-cycle), as well as hardware resources pressure. The analysis and reporting style
-were inspired by the IACA tool from Intel.
-
-The presence of long data dependency chains, as well as poor usage of hardware
-resources may lead to bottlenecks in the backend. The tool is able to generate
-a detailed report which should help with identifying and analyzing sources of
-bottlenecks.
-
-Scheduling models are mostly used to compute instruction latencies, to obtain
-read-advance information, and understand how processor resources are used by
-instructions. By design, the quality of the performance analysis conducted by
-the tool is inevitably affected by the quality of the target scheduling models.
-However, scheduling models intentionally do not describe all processor details, -since the goal is just to enable the scheduling of machine instructions during -compilation. That means, there are processor details which are not important for -the purpose of scheduling instructions (and therefore not described by the -scheduling model), but are very important for this tool. - -A few examples of details that are missing in scheduling models are: - - Actual dispatch width (it often differs from the issue width). - - Number of read/write ports in the register file(s). - - Length of the load/store queue in the LSUnit. - -It is also very difficult to find a "good" abstract model to describe the -behavior of out-of-order processors. So, we have to keep in mind that all of -these aspects are going to affect the quality of the static analysis performed -by the tool. - -An extensive list of known limitations is reported in one of the last sections -of this document. There is also a section related to design problems which must -be addressed (hopefully with the help of the community). At the moment, the -tool has been mostly tested for x86 targets, but there are still several -limitations, some of which could be overcome by integrating extra information -into the scheduling models. - -How the tool works ------------------- - -The tool takes assembly code as input. Assembly code is parsed into a sequence -of MCInst with the help of the existing LLVM target assembly parsers. The parsed -sequence of MCInst is then analyzed by a 'Pipeline' module to generate a -performance report. - -The Pipeline module internally emulates the execution of the machine code -sequence in a loop of iterations (which by default is 100). At the end of this -process, the pipeline collects a number of statistics which are then printed out -in the form of a report. - -Here is an example of performance report generated by the tool for a dot-product -of two packed float vectors of four elements. The analysis is conducted for -target x86, cpu btver2: - -/////////////////// - -Iterations: 300 -Instructions: 900 -Total Cycles: 610 -Dispatch Width: 2 -IPC: 1.48 - - -Resources: -[0] - JALU0 -[1] - JALU1 -[2] - JDiv -[3] - JFPM -[4] - JFPU0 -[5] - JFPU1 -[6] - JLAGU -[7] - JSAGU -[8] - JSTC -[9] - JVIMUL - - -Resource pressure per iteration: -[0] [1] [2] [3] [4] [5] [6] [7] [8] [9] - - - - - 2.00 1.00 - - - - - -Resource pressure by instruction: -[0] [1] [2] [3] [4] [5] [6] [7] [8] [9] Instructions: - - - - - - 1.00 - - - - vmulps %xmm0, %xmm1, %xmm2 - - - - - 1.00 - - - - - vhaddps %xmm2, %xmm2, %xmm3 - - - - - 1.00 - - - - - vhaddps %xmm3, %xmm3, %xmm4 - - -Instruction Info: -[1]: #uOps -[2]: Latency -[3]: RThroughput -[4]: MayLoad -[5]: MayStore -[6]: HasSideEffects - -[1] [2] [3] [4] [5] [6] Instructions: - 1 2 1.00 vmulps %xmm0, %xmm1, %xmm2 - 1 3 1.00 vhaddps %xmm2, %xmm2, %xmm3 - 1 3 1.00 vhaddps %xmm3, %xmm3, %xmm4 - -/////////////////// - -According to this report, the dot-product kernel has been executed 300 times, -for a total of 900 instructions dynamically executed. - -The report is structured in three main sections. A first section collects a few -performance numbers; the goal of this section is to give a very quick overview -of the performance throughput. In this example, the two important performance -indicators are a) the predicted total number of cycles, and b) the IPC. -IPC is probably the most important throughput indicator. 
A big delta between the -Dispatch Width and the computed IPC is an indicator of potential performance -issues. - -The second section is the so-called "resource pressure view". This view reports -the average number of resource cycles consumed every iteration by instructions -for every processor resource unit available on the target. Information is -structured in two tables. The first table reports the number of resource cycles -spent on average every iteration. The second table correlates the resource -cycles to the machine instruction in the sequence. For example, every iteration -of the dot-product, instruction 'vmulps' always executes on resource unit [5] -(JFPU1 - floating point pipeline #1), consuming an average of 1 resource cycle -per iteration. Note that on Jaguar, vector FP multiply can only be issued to -pipeline JFPU1, while horizontal FP adds can only be issued to pipeline JFPU0. - -The third (and last) section of the report shows the latency and reciprocal -throughput of every instruction in the sequence. That section also reports extra -information related to the number of micro opcodes, and opcode properties (i.e., -'MayLoad', 'MayStore', and 'UnmodeledSideEffects'). - -The resource pressure view helps with identifying bottlenecks caused by high -usage of specific hardware resources. Situations with resource pressure mainly -concentrated on a few resources should, in general, be avoided. Ideally, -pressure should be uniformly distributed between multiple resources. - -Timeline View -------------- - -A detailed report of each instruction's state transitions over time can be -enabled using the command line flag '-timeline'. This prints an extra section -in the report which contains the so-called "timeline view". Below is the -timeline view for the dot-product example from the previous section. - -/////////////// -Timeline view: - 012345 -Index 0123456789 - -[0,0] DeeER. . . vmulps %xmm0, %xmm1, %xmm2 -[0,1] D==eeeER . . vhaddps %xmm2, %xmm2, %xmm3 -[0,2] .D====eeeER . vhaddps %xmm3, %xmm3, %xmm4 - -[1,0] .DeeE-----R . vmulps %xmm0, %xmm1, %xmm2 -[1,1] . D=eeeE---R . vhaddps %xmm2, %xmm2, %xmm3 -[1,2] . D====eeeER . vhaddps %xmm3, %xmm3, %xmm4 - -[2,0] . DeeE-----R . vmulps %xmm0, %xmm1, %xmm2 -[2,1] . D====eeeER . vhaddps %xmm2, %xmm2, %xmm3 -[2,2] . D======eeeER vhaddps %xmm3, %xmm3, %xmm4 - - -Average Wait times (based on the timeline view): -[0]: Executions -[1]: Average time spent waiting in a scheduler's queue -[2]: Average time spent waiting in a scheduler's queue while ready -[3]: Average time elapsed from WB until retire stage - - [0] [1] [2] [3] -0. 3 1.0 1.0 3.3 vmulps %xmm0, %xmm1, %xmm2 -1. 3 3.3 0.7 1.0 vhaddps %xmm2, %xmm2, %xmm3 -2. 3 5.7 0.0 0.0 vhaddps %xmm3, %xmm3, %xmm4 -/////////////// - -The timeline view is very interesting because it shows how instructions changed -in state during execution. It also gives an idea of how the tool "sees" -instructions executed on the target. - -The timeline view is structured in two tables. The first table shows how -instructions change in state over time (measured in cycles); the second table -(named "Average Wait times") reports useful timing statistics which should help -diagnose performance bottlenecks caused by long data dependencies and -sub-optimal usage of hardware resources. - -An instruction in the timeline view is identified by a pair of indices, where -the 'first' index identifies an iteration, and the 'second' index is the actual -instruction index (i.e., where it appears in the code sequence). 
- -Excluding the first and last column, the remaining columns are in cycles. -Cycles are numbered sequentially starting from 0. The following characters are -used to describe the state of an instruction: - - D : Instruction dispatched. - e : Instruction executing. - E : Instruction executed. - R : Instruction retired. - = : Instruction already dispatched, waiting to be executed. - - : Instruction executed, waiting to be retired. - -Based on the timeline view from the example, we know that: - - Instruction [1, 0] was dispatched at cycle 1. - - Instruction [1, 0] started executing at cycle 2. - - Instruction [1, 0] reached the write back stage at cycle 4. - - Instruction [1, 0] was retired at cycle 10. - -Instruction [1, 0] (i.e., the vmulps from iteration #1) doesn't have to wait in -the scheduler's queue for the operands to become available. By the time the -vmulps is dispatched, operands are already available, and pipeline JFPU1 is -ready to serve another instruction. So the instruction can be immediately -issued on the JFPU1 pipeline. That is demonstrated by the fact that the -instruction only spent 1cy in the scheduler's queue. - -There is a gap of 5 cycles between the write-back stage and the retire event. -That is because instructions must retire in program order, so [1,0] has to wait -for [0, 2] to be retired first (i.e., it has to wait until cycle 10). - -In the dot-product example, all instructions are in a RAW (Read After Write) -dependency chain. Register %xmm2 written by the vmulps is immediately used by -the first vhaddps, and register %xmm3 written by the first vhaddps is used by -the second vhaddps. Long data dependencies negatively affect the ILP -(Instruction Level Parallelism). - -In the dot-product example, there are anti-dependencies introduced by -instructions from different iterations. However, those dependencies can be -removed at register renaming stage (at the cost of allocating register aliases, -and therefore consuming temporary registers). - -Table "Average Wait times" helps diagnose performance issues that are caused by -the presence of long latency instructions and potentially long data dependencies -which may limit the ILP. Note that the tool by default assumes at least 1cy -between the dispatch event and the issue event. - -When the performance is limited by data dependencies and/or long latency -instructions, the number of cycles spent while in the "ready" state is expected -to be very small when compared with the total number of cycles spent in the -scheduler's queue. So the difference between the two counters is a good -indicator of how big of an impact data dependencies had on the execution of -instructions. When performance is mostly limited by the lack of hardware -resources, the delta between the two counters is small. However, the number of -cycles spent in the queue tends to be bigger (i.e., more than 1-3cy) especially -when compared with other low latency instructions. - -Extra statistics to further diagnose performance issues. --------------------------------------------------------- - -Flag '-verbose' enables extra statistics and performance counters for the -dispatch logic, the reorder buffer, the retire control unit and the register -file. - -Below is an example of verbose output generated by the tool for the dot-product -example discussed in the previous sections. 
- -/////////////////// -Iterations: 300 -Instructions: 900 -Total Cycles: 610 -Dispatch Width: 2 -IPC: 1.48 - - -Dynamic Dispatch Stall Cycles: -RAT - Register unavailable: 0 -RCU - Retire tokens unavailable: 0 -SCHEDQ - Scheduler full: 272 -LQ - Load queue full: 0 -SQ - Store queue full: 0 -GROUP - Static restrictions on the dispatch group: 0 - - -Register Alias Table: -Total number of mappings created: 900 -Max number of mappings used: 35 - - -Dispatch Logic - number of cycles where we saw N instructions dispatched: -[# dispatched], [# cycles] - 0, 24 (3.9%) - 1, 272 (44.6%) - 2, 314 (51.5%) - - -Schedulers - number of cycles where we saw N instructions issued: -[# issued], [# cycles] - 0, 7 (1.1%) - 1, 306 (50.2%) - 2, 297 (48.7%) - - -Retire Control Unit - number of cycles where we saw N instructions retired: -[# retired], [# cycles] - 0, 109 (17.9%) - 1, 102 (16.7%) - 2, 399 (65.4%) - - -Scheduler's queue usage: -JALU01, 0/20 -JFPU01, 18/18 -JLSAGU, 0/12 -/////////////////// - -Based on the verbose report, the pipeline was only able to dispatch two -instructions 51.5% of the time. The dispatch group was limited to one -instruction 44.6% of the cycles, which corresponds to 272 cycles. - -If we look at section "Dynamic Dispatch Stall Cycles", we can see how counter -SCHEDQ reports 272 cycles. Counter SCHEDQ is incremented every time the -dispatch logic is unable to dispatch a full group of two instructions because -the scheduler's queue is full. - -Section "Scheduler's queue usage" shows how the maximum number of buffer entries -(i.e., scheduler's queue entries) used at runtime for resource JFPU01 reached -its maximum. Note that AMD Jaguar implements three schedulers: - * JALU01 - A scheduler for ALU instructions - * JLSAGU - A scheduler for address generation - * JFPU01 - A scheduler floating point operations. - -The dot-product is a kernel of three floating point instructions (a vector -multiply followed by two horizontal adds). That explains why only the floating -point scheduler appears to be used according to section "Scheduler's queue -usage". - -A full scheduler's queue is either caused by data dependency chains, or by a -sub-optimal usage of hardware resources. Sometimes, resource pressure can be -mitigated by rewriting the kernel using different instructions that consume -different scheduler resources. Schedulers with a small queue are less resilient -to bottlenecks caused by the presence of long data dependencies. - -In this example, we can conclude that the IPC is mostly limited by data -dependencies, and not by resource pressure. - -LLVM-MCA instruction flow -------------------------- - -This section describes the instruction flow through the out-of-order backend, -as well as the functional units involved in the process. - -An instruction goes through a default sequence of stages: - - Dispatch (Instruction is dispatched to the schedulers). - - Issue (Instruction is issued to the processor pipelines). - - Write Back (Instruction is executed, and results are written back). - - Retire (Instruction is retired; writes are architecturally committed). - -The tool only models the out-of-order portion of a processor. Therefore, the -instruction fetch and decode stages are not modeled. Performance bottlenecks in -the frontend are not diagnosed by this tool. The tool assumes that instructions -have all been decoded and placed in a queue. Also, the tool doesn't know -anything about branch prediction. 
- -The long term plan is to make the process customizable, so that processors can -define their own. This is a future work. - -Instruction Dispatch --------------------- - -During the Dispatch stage, instructions are picked in program order from a queue -of already decoded instructions, and dispatched in groups to the hardware -schedulers. The dispatch logic is implemented by class DispatchStage in file -DispatchStage.h. - -The size of a dispatch group depends on the availability of hardware resources, -and it cannot exceed the value of field 'DispatchWidth' in class DispatchStage. -Note that field DispatchWidth defaults to the value of field 'IssueWidth' from -the scheduling model. - -Users can override the DispatchWidth value with flag "-dispatch=<N>" (where 'N' -is an unsigned quantity). - -An instruction can be dispatched if: - - The size of the dispatch group is smaller than DispatchWidth - - There are enough entries in the reorder buffer - - There are enough temporary registers to do register renaming - - Schedulers are not full. - -Since r329067, scheduling models can now optionally specify which register -files are available on the processor. Class DispatchStage(see DispatchStage.h) -would use that information to initialize register file descriptors. - -By default, if the model doesn't describe register files, the tool -(optimistically) assumes a single register file with an unbounded number of -temporary registers. Users can limit the number of temporary registers that -are globally available for register renaming using flag -`-register-file-size=<N>`, where N is the number of temporaries. A value of -zero for N means 'unbounded'. Knowing how many temporaries are available for -register renaming, the tool can predict dispatch stalls caused by the lack of -temporaries. - -The number of reorder buffer entries consumed by an instruction depends on the -number of micro-opcodes it specifies in the target scheduling model (see field -'NumMicroOpcodes' of TableGen class ProcWriteResources and its derived classes; -TargetSchedule.td). - -The reorder buffer is implemented by class RetireControlUnit (see -DispatchStage.h). Its goal is to track the progress of instructions that are -"in-flight", and retire instructions in program order. The number of entries -in the reorder buffer defaults to the value of field 'MicroOpBufferSize' from -the target scheduling model. - -Instructions that are dispatched to the schedulers consume scheduler buffer -entries. The tool queries the scheduling model to figure out the set of -buffered resources consumed by an instruction. Buffered resources are treated -like "scheduler" resources, and the field 'BufferSize' (from the processor -resource TableGen definition) defines the size of the scheduler's queue. - -Zero latency instructions (for example NOP instructions) don't consume scheduler -resources. However, those instructions still reserve a number of slots in the -reorder buffer. - -Instruction Issue ------------------ - -As mentioned in the previous section, each scheduler resource implements a queue -of instructions. An instruction has to wait in the scheduler's queue until -input register operands become available. Only at that point, does the -instruction becomes eligible for execution and may be issued (potentially -out-of-order) to a pipeline for execution. - -Instruction latencies can be computed by the tool with the help of the -scheduling model; latency values are defined by the scheduling model through -ProcWriteResources objects. 
- -Class Scheduler (see file Scheduler.h) knows how to emulate multiple processor -schedulers. A Scheduler is responsible for tracking data dependencies, and -dynamically select which processor resources are consumed/used by instructions. - -Internally, the Scheduler class delegates the management of processor resource -units and resource groups to the ResourceManager class. ResourceManager is also -responsible for selecting resource units that are effectively consumed by -instructions. For example, if an instruction consumes 1cy of a resource group, -the ResourceManager object selects one of the available units from the group; by -default, it uses a round-robin selector to guarantee that resource usage is -uniformly distributed between all units of a group. - -Internally, class Scheduler implements three instruction queues: - - WaitQueue: a queue of instructions whose operands are not ready yet. - - ReadyQueue: a queue of instructions ready to execute. - - IssuedQueue: a queue of instructions executing. - -Depending on the operands availability, instructions that are dispatched to the -Scheduler are either placed into the WaitQueue or into the ReadyQueue. - -Every cycle, class Scheduler checks if instructions can be moved from the -WaitQueue to the ReadyQueue, and if instructions from the ReadyQueue can be -issued to the underlying pipelines. The algorithm prioritizes older -instructions over younger instructions. - -Objects of class ResourceState (see Scheduler.h) describe processor resources. -There is an instance of class ResourceState for each single processor resource -specified by the scheduling model. A ResourceState object for a processor -resource with multiple units dynamically tracks the availability of every single -unit. For example, the ResourceState of a resource group tracks the -availability of every resource in that group. Internally, ResourceState -implements a round-robin selector to dynamically pick the next unit to use from -the group. - -Write-Back and Retire Stage ---------------------------- - -Issued instructions are moved from the ReadyQueue to the IssuedQueue. There, -instructions wait until they reach the write-back stage. At that point, they -get removed from the queue and the retire control unit is notified. - -On the event of "instruction executed", the retire control unit flags the -instruction as "ready to retire". - -Instruction are retired in program order; an "instruction retired" event is sent -to the register file which frees the temporary registers allocated for the -instruction at register renaming stage. - -Load/Store Unit and Memory Consistency Model --------------------------------------------- - -The tool attempts to emulate out-of-order execution of memory operations. Class -LSUnit (see file LSUnit.h) emulates a load/store unit implementing queues for -speculative execution of loads and stores. - -Each load (or store) consumes an entry in the load (or store) queue. The number -of slots in the load/store queues is unknown by the tool, since there is no -mention of it in the scheduling model. In practice, users can specify flag -`-lqueue=N` (vic. `-squeue=N`) to limit the number of entries in the queue to be -equal to exactly N (an unsigned value). If N is zero, then the tool assumes an -unbounded queue (this is the default). - -LSUnit implements a relaxed consistency model for memory loads and stores. The -rules are: -1) A younger load is allowed to pass an older load only if there is no - intervening store in between the two loads. 
-2) An younger store is not allowed to pass an older store. -3) A younger store is not allowed to pass an older load. -4) A younger load is allowed to pass an older store provided that the load does - not alias with the store. - -By default, this class conservatively (i.e., pessimistically) assumes that loads -always may-alias store operations. Essentially, this LSUnit doesn't perform -any sort of alias analysis to rule out cases where loads and stores don't -overlap with each other. The downside of this approach however is that younger -loads are never allowed to pass older stores. To make it possible for a -younger load to pass an older store, users can use the command line flag --noalias. Under 'noalias', a younger load is always allowed to pass an older -store. - -Note that, in the case of write-combining memory, rule 2. could be relaxed a bit -to allow reordering of non-aliasing store operations. That being said, at the -moment, there is no way to further relax the memory model (flag -noalias is the -only option). Essentially, there is no option to specify a different memory -type (for example: write-back, write-combining, write-through; etc.) and -consequently to weaken or strengthen the memory model. - -Other limitations are: - * LSUnit doesn't know when store-to-load forwarding may occur. - * LSUnit doesn't know anything about the cache hierarchy and memory types. - * LSUnit doesn't know how to identify serializing operations and memory fences. - -No assumption is made on the store buffer size. As mentioned before, LSUnit -conservatively assumes a may-alias relation between loads and stores, and it -doesn't attempt to identify cases where store-to-load forwarding would occur in -practice. - -LSUnit doesn't attempt to predict whether a load or store hits or misses the L1 -cache. It only knows if an instruction "MayLoad" and/or "MayStore". For loads, -the scheduling model provides an "optimistic" load-to-use latency (which usually -matches the load-to-use latency for when there is a hit in the L1D). - -Class MCInstrDesc in LLVM doesn't know about serializing operations, nor -memory-barrier like instructions. LSUnit conservatively assumes that an -instruction which has both 'MayLoad' and 'UnmodeledSideEffects' behaves like a -"soft" load-barrier. That means, it serializes loads without forcing a flush of -the load queue. Similarly, instructions flagged with both 'MayStore' and -'UnmodeledSideEffects' are treated like store barriers. A full memory barrier -is a 'MayLoad' and 'MayStore' instruction with 'UnmodeledSideEffects'. This is -inaccurate, but it is the best that we can do at the moment with the current -information available in LLVM. - -A load/store barrier consumes one entry of the load/store queue. A load/store -barrier enforces ordering of loads/stores. A younger load cannot pass a load -barrier. Also, a younger store cannot pass a store barrier. A younger load has -to wait for the memory/load barrier to execute. A load/store barrier is -"executed" when it becomes the oldest entry in the load/store queue(s). That -also means, by construction, all the older loads/stores have been executed. - -In conclusion the full set of rules is: - 1. A store may not pass a previous store. - 2. A load may not pass a previous store unless flag 'NoAlias' is set. - 3. A load may pass a previous load. - 4. A store may not pass a previous load (regardless of flag 'NoAlias'). - 5. A load has to wait until an older load barrier is fully executed. - 6. 
A store has to wait until an older store barrier is fully executed. - -Known limitations ------------------ -Previous sections described cases where the tool is missing information to give -an accurate report. For example, the first sections of this document explained -how the lack of knowledge about the processor negatively affects the performance -analysis. The lack of knowledge is often a consequence of how scheduling models -are defined; as mentioned before, scheduling models intentionally don't describe -processors in fine details. That being said, the LLVM machine model can be -extended to expose more details, as long as they are opt-in for targets. - -The accuracy of the performance analysis is also affected by assumptions made by -the processor model used by the tool. - -Most recent Intel and AMD processors implement dedicated LoopBuffer/OpCache in -the hardware frontend to speedup the throughput in the presence of tight loops. -The presence of these buffers complicates the decoding logic, and requires -knowledge on the branch predictor too. Class 'SchedMachineModel' in TableGen -provides a field named 'LoopMicroOpBufferSize' which is used to describe loop -buffers. However, the purpose of that field is to enable loop unrolling of -tight loops; essentially, it affects the cost model used by pass loop-unroll. - -At the current state, the tool only describes the out-of-order portion of a -processor, and consequently doesn't try to predict the frontend throughput. That -being said, this tool could be definitely extended in future to also account for -the hardware frontend when doing performance analysis. This would inevitably -require extra (extensive) processor knowledge related to all the available -decoding paths in the hardware frontend, as well as branch prediction. - -Currently, the tool assumes a zero-latency "perfect" fetch&decode -stage; the full sequence of decoded instructions is immediately visible to the -dispatch logic from the start. - -The tool doesn't know about simultaneous mutithreading. According to the tool, -processor resources are not statically/dynamically partitioned. Processor -resources are fully available to the hardware thread executing the -microbenchmark. - -The execution model implemented by this tool assumes that instructions are -firstly dispatched in groups to hardware schedulers, and then issued to -pipelines for execution. The model assumes dynamic scheduling of instructions. -Instructions are placed in a queue and potentially executed out-of-order (based -on the operand availability). The dispatch stage is definitely distinct from the -issue stage. This will change in future; as mentioned in the first section, the -end goal is to let processors customize the process. - -This model doesn't correctly describe processors where the dispatch/issue is a -single stage. This is what happens for example in VLIW processors, where -instructions are packaged and statically scheduled at compile time; it is up to -the compiler to predict the latency of instructions and package issue groups -accordingly. For such targets, there is no dynamic scheduling done by the -hardware. - -Existing classes (DispatchStage, Scheduler, etc.) could be extended/adapted to -support processors with a single dispatch/issue stage. The execution flow would -require some changes in the way how existing components (i.e., DispatchStage, -Scheduler, etc.) interact. This can be a future development. - -The following sections describes other known limitations. 
The goal is not to -provide an extensive list of limitations; we want to report what we believe are -the most important limitations, and suggest possible methods to overcome them. - -Load/Store barrier instructions and serializing operations ----------------------------------------------------------- -Section "Load/Store Unit and Memory Consistency Model" already mentioned how -LLVM doesn't know about serializing operations and memory barriers. Most of it -boils down to the fact that class MCInstrDesc (intentionally) doesn't expose -those properties. Instead, both serializing operations and memory barriers -"have side-effects" according to MCInstrDesc. That is because, at least for -scheduling purposes, knowing that an instruction has unmodeled side effects is -often enough to treat the instruction like a compiler scheduling barrier. - -A performance analysis tool could use the extra knowledge on barriers and -serializing operations to generate a more accurate performance report. One way -to improve this is by reserving a couple of bits in field 'Flags' from class -MCInstrDesc: one bit for barrier operations, and another bit to mark -instructions as serializing operations. - -Lack of support for instruction itineraries -------------------------------------------- -The current version of the tool doesn't know how to process instruction -itineraries. This is probably one of the most important limitations, since it -affects a few out-of-order processors in LLVM. - -As mentioned in section 'Instruction Issue', class Scheduler delegates to an -instance of class ResourceManager the handling of processor resources. -ResourceManager is where most of the scheduling logic is implemented. - -Adding support for instruction itineraries requires that we teach -ResourceManager how to handle functional units and instruction stages. This -development can be a future extension, and it would probably require a few -changes to the ResourceManager interface. - -Instructions that affect control flow are not correctly modeled ---------------------------------------------------------------- -Examples of instructions that affect the control flow are: return, indirect -branches, calls, etc. The tool doesn't try to predict/evaluate branch targets. -In particular, the tool doesn't model any sort of branch prediction, nor does it -attempt to track changes to the program counter. The tool always assumes that -the input assembly sequence is the body of a microbenchmark (a simple loop -executed for a number of iterations). The "next" instruction in sequence is -always the next instruction to dispatch. - -Call instructions default to an arbitrary high latency of 100cy. A warning is -generated if the tool encounters a call instruction in the sequence. Return -instructions are not evaluated, and therefore control flow is not affected. -However, the tool still queries the processor scheduling model to obtain latency -information for instructions that affect the control flow. - -Known limitations on X86 processors ------------------------------------ - -1) Partial register updates versus full register updates. - -On x86-64, a 32-bit GPR write fully updates the super-register. Example: - add %edi %eax ## eax += edi - -Here, register %eax aliases the lower half of 64-bit register %rax. On x86-64, -register %rax is fully updated by the 'add' (the upper half of %rax is zeroed). -Essentially, it "kills" any previous definition of (the upper half of) register -%rax. 
- -On the other hand, 8/16 bit register writes only perform a so-called "partial -register update". Example: - add %di, %ax ## ax += di - -Here, register %eax is only partially updated. To be more specific, the lower -half of %eax is set, and the upper half is left unchanged. There is also no -change in the upper 48 bits of register %rax. - -To get accurate performance analysis, the tool has to know which instructions -perform a partial register update, and which instructions fully update the -destination's super-register. - -One way to expose this information is (again) via TableGen. For example, we -could add a flag in the TableGen instruction class to tag instructions that -perform partial register updates. Something like this: 'bit -hasPartialRegisterUpdate = 1'. However, this would force a `let -hasPartialRegisterUpdate = 0` on several instruction definitions. - -Another approach is to have a MCSubtargetInfo hook similar to this: - virtual bool updatesSuperRegisters(unsigned short opcode) { return false; } - -Targets will be able to override this method if needed. Again, this is just an -idea. But the plan is to have this fixed as a future development. - -2) Macro Op fusion. - -The tool doesn't know about macro-op fusion. On modern x86 processors, a -'cmp/test' followed by a 'jmp' is fused into a single macro operation. The -advantage is that the fused pair only consumes a single slot in the dispatch -group. - -As a future development, the tool should be extended to address macro-fusion. -Ideally, we could have LLVM generate a table enumerating all the opcode pairs -that can be fused together. That table could be exposed to the tool via the -MCSubtargetInfo interface. This is just an idea; there may be better ways to -implement this. - -3) Intel processors: mixing legacy SSE with AVX instructions. - -On modern Intel processors with AVX, mixing legacy SSE code with AVX code -negatively impacts the performance. The tool is not aware of this issue, and -the performance penalty is not accounted when doing the analysis. This is -something that we would like to improve in future. - -4) Zero-latency register moves and Zero-idioms. - -Most modern AMD/Intel processors know how to optimize out register-register -moves and zero idioms at register renaming stage. The tool doesn't know -about these patterns, and this may negatively impact the performance analysis. - -Known design problems ---------------------- -This section describes two design issues that are currently affecting the tool. -The long term plan is to "fix" these issues. -Both limitations would be easily fixed if we teach the tool how to directly -manipulate MachineInstr objects (instead of MCInst objects). - -1) Variant instructions not correctly modeled. - -The tool doesn't know how to analyze instructions with a "variant" scheduling -class descriptor. A variant scheduling class needs to be resolved dynamically. -The "actual" scheduling class often depends on the subtarget, as well as -properties of the specific MachineInstr object. - -Unfortunately, the tool manipulates MCInst, and it doesn't know anything about -MachineInstr. As a consequence, the tool cannot use the existing machine -subtarget hooks that are normally used to resolve the variant scheduling class. -This is a major design issue which mostly affects ARM/AArch64 targets. It -mostly boils down to the fact that the existing scheduling framework was meant -to work for MachineInstr. - -When the tool encounters a "variant" instruction, it assumes a generic 1cy -latency. 
However, the tool would not be able to tell which processor resources -are effectively consumed by the variant instruction. - -2) MCInst and MCInstrDesc. - -Performance analysis tools require data dependency information to correctly -predict the runtime performance of the code. This tool must always be able to -obtain the set of implicit/explicit register defs/uses for every instruction of -the input assembly sequence. - -In the first section of this document, it was mentioned how the tool takes as -input an assembly sequence. That sequence is parsed into a MCInst sequence with -the help of assembly parsers available from the targets. - -A MCInst is a very low-level instruction representation. The tool can inspect -the MCOperand sequence of an MCInst to identify register operands. However, -there is no way to tell register operands that are definitions from register -operands that are uses. - -In LLVM, class MCInstrDesc is used to fully describe target instructions and -their operands. The opcode of a machine instruction (a MachineInstr object) can -be used to query the instruction set through method `MCInstrInfo::get' to obtain -the associated MCInstrDesc object. - -However class MCInstrDesc describes properties and operands of MachineInstr -objects. Essentially, MCInstrDesc is not meant to be used to describe MCInst -objects. To be more specific, MCInstrDesc objects are automatically generated -via TableGen from the instruction set description in the target .td files. For -example, field `MCInstrDesc::NumDefs' is always equal to the cardinality of the -`(outs)` set from the TableGen instruction definition. - -By construction, register definitions always appear at the beginning of the -MachineOperands list in MachineInstr. Basically, the (outs) are the first -operands of a MachineInstr, and the (ins) will come after in the machine operand -list. Knowing the number of register definitions is enough to identify -all the register operands that are definitions. - -In a normal compilation process, MCInst objects are generated from MachineInstr -objects through a lowering step. By default the lowering logic simply iterates -over the machine operands of a MachineInstr, and converts/expands them into -equivalent MCOperand objects. - -The default lowering strategy has the advantage of preserving all of the above -mentioned assumptions on the machine operand sequence. That means, register -definitions would still be at the beginning of the MCOperand sequence, and -register uses would come after. - -Targets may still define custom lowering routines for specific opcodes. Some of -these routines may lower operands in a way that potentially breaks (some of) the -assumptions on the machine operand sequence which were valid for MachineInstr. -Luckily, this is not the most common form of lowering done by the targets, and -the vast majority of the MachineInstr are lowered based on the default strategy -which preserves the original machine operand sequence. This is especially true -for x86, where the custom lowering logic always preserves the original (i.e., -from the MachineInstr) operand sequence. - -This tool currently works under the strong (and potentially incorrect) -assumption that register def/uses in a MCInst can always be identified by -querying the machine instruction descriptor for the opcode. This assumption made -it possible to develop this tool and get good numbers at least for the -processors available in the x86 backend. 
-
-That being said, the analysis is still potentially incorrect for other targets.
-So we plan (with the help of the community) to find a proper mechanism to map
-when possible MCOperand indices back to MachineOperand indices of the equivalent
-MachineInstr. This would be equivalent to describing changes made by the
-lowering step which affected the operand sequence. For example, we could have an
-index for every register MCOperand (or -1, if the operand didn't exist in the
-original MachineInstr). The mapping could look like this <0,1,3,2>. Here,
-MCOperand #2 was obtained from the lowering of MachineOperand #3. etc.
-
-This information could be automatically generated via TableGen for all the
-instructions whose custom lowering step breaks assumptions made by the tool on
-the register operand sequence (In general, these instructions should be the
-minority of a target's instruction set). Unfortunately, we don't have that
-information now. As a consequence, we assume that the number of explicit
-register definitions is the same number specified in MCInstrDesc. We also
-assume that register definitions always come first in the operand sequence.
-
-In conclusion: these are for now the strong assumptions made by the tool:
- * The number of explicit and implicit register definitions in a MCInst
-   matches the number of explicit and implicit definitions specified by the
-   MCInstrDesc object.
- * Register uses always come after register definitions.
- * If an opcode specifies an optional definition, then the optional
-   definition is always the last register operand in the sequence.
-
-Note that some of the information accessible from the MCInstrDesc is always
-valid for MCInst. For example: implicit register defs, implicit register uses
-and 'MayLoad/MayStore/HasUnmodeledSideEffects' opcode properties still apply to
-MCInst. The tool knows about this, and uses that information during its
-analysis.
-
-Future work
------------
- * Address limitations (described in section "Known limitations").
- * Let processors specify the selection strategy for processor resource groups
-   and resources with multiple units. The tool currently uses a round-robin
-   selector to pick the next resource to use.
- * Address limitations specifically described in section "Known limitations on
-   X86 processors".
- * Address design issues identified in section "Known design problems".
- * Define a standard interface for "Views". This would let users customize the
-   performance report generated by the tool.
-
-When interfaces are mature/stable:
- * Move the logic into a library. This will enable a number of other
-   interesting use cases.
-
-Work is currently tracked on https://bugs.llvm.org. llvm-mca bugs are tagged
-with prefix [llvm-mca]. You can easily find the full list of open bugs if you
-search for that tag.
diff --git a/tools/llvm-mca/RetireControlUnit.h b/tools/llvm-mca/RetireControlUnit.h
index 3530ff21ba0d..8acc8bcc98fe 100644
--- a/tools/llvm-mca/RetireControlUnit.h
+++ b/tools/llvm-mca/RetireControlUnit.h
@@ -31,7 +31,7 @@ namespace mca {
 /// this RetireControlUnit (RCU) gets notified.
 ///
 /// On instruction retired, register updates are all architecturally
-/// committed, and any temporary registers originally allocated for the
+/// committed, and any physical registers previously allocated for the
 /// retired instruction are freed.
 struct RetireControlUnit : public HardwareUnit {
   // A RUToken is created by the RCU for every instruction dispatched to the
diff --git a/tools/llvm-mca/RetireStage.cpp b/tools/llvm-mca/RetireStage.cpp
index 386ec54d7ba3..55c3b887e478 100644
--- a/tools/llvm-mca/RetireStage.cpp
+++ b/tools/llvm-mca/RetireStage.cpp
@@ -45,10 +45,12 @@ void RetireStage::cycleStart() {
 
 void RetireStage::notifyInstructionRetired(const InstRef &IR) {
   LLVM_DEBUG(dbgs() << "[E] Instruction Retired: #" << IR << '\n');
   SmallVector<unsigned, 4> FreedRegs(PRF.getNumRegisterFiles());
-  const InstrDesc &Desc = IR.getInstruction()->getDesc();
+  const Instruction &Inst = *IR.getInstruction();
+  const InstrDesc &Desc = Inst.getDesc();
 
-  for (const std::unique_ptr<WriteState> &WS : IR.getInstruction()->getDefs())
-    PRF.removeRegisterWrite(*WS.get(), FreedRegs, !Desc.isZeroLatency());
+  bool ShouldFreeRegs = !(Desc.isZeroLatency() && Inst.isDependencyBreaking());
+  for (const std::unique_ptr<WriteState> &WS : Inst.getDefs())
+    PRF.removeRegisterWrite(*WS.get(), FreedRegs, ShouldFreeRegs);
   notifyEvent<HWInstructionEvent>(HWInstructionRetiredEvent(IR, FreedRegs));
 }
diff --git a/tools/llvm-mca/llvm-mca.cpp b/tools/llvm-mca/llvm-mca.cpp
index 2d292f375e6e..897ff232a36d 100644
--- a/tools/llvm-mca/llvm-mca.cpp
+++ b/tools/llvm-mca/llvm-mca.cpp
@@ -96,7 +96,7 @@ static cl::opt<unsigned>
 
 static cl::opt<unsigned>
     RegisterFileSize("register-file-size",
-                     cl::desc("Maximum number of temporary registers which can "
+                     cl::desc("Maximum number of physical registers which can "
                               "be used for register mappings"),
                      cl::cat(ToolOptions), cl::init(0));
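
For readers who want the gist of the patch without tracing the hunks above, the following is a minimal, self-contained C++ sketch of the policy it introduces. It is not taken from the llvm-mca sources; the Fake* types and the dispatch() function are hypothetical stand-ins. The idea shown: a dependency-breaking instruction skips RAW-dependency updates for its explicit reads (implicit reads still count), and if it is also zero-latency it is assumed to be optimized away at register renaming, so no physical registers are allocated for its writes.

#include <iostream>
#include <vector>

// Hypothetical, simplified stand-ins for the llvm-mca classes touched by this
// patch. They model only the two flags that the dispatch logic now consults.
struct FakeRead {
  bool IsImplicit; // implicit reads still create RAW dependencies
};

struct FakeInstr {
  bool IsZeroLatency;         // from the scheduling model (InstrDesc::isZeroLatency)
  bool IsDepBreaking;         // set when the target reports the opcode as dependency breaking
  std::vector<FakeRead> Uses; // explicit/implicit register reads
};

// Mirrors the new DispatchStage behavior: dependency-breaking instructions skip
// RAW updates for explicit reads, and zero-latency dependency-breaking
// instructions do not allocate physical registers for their writes.
void dispatch(const FakeInstr &I) {
  for (const FakeRead &R : I.Uses)
    if (R.IsImplicit || !I.IsDepBreaking)
      std::cout << "  update RAW dependency\n";

  bool ShouldAllocateRegisters = !(I.IsZeroLatency && I.IsDepBreaking);
  std::cout << (ShouldAllocateRegisters
                    ? "  allocate physical registers for writes\n"
                    : "  optimized at rename stage, no allocation\n");
}

int main() {
  // A zero-idiom such as `xorps %xmm0, %xmm0`: zero latency and dependency breaking.
  FakeInstr ZeroIdiom{true, true, {{false}}};
  // An ordinary instruction with one explicit read.
  FakeInstr Regular{false, false, {{false}}};

  std::cout << "zero-idiom:\n";
  dispatch(ZeroIdiom);
  std::cout << "regular:\n";
  dispatch(Regular);
  return 0;
}

The same ShouldFreeRegs condition appears symmetrically in RetireStage.cpp above: registers that were never allocated at dispatch are not freed at retirement.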