aboutsummaryrefslogtreecommitdiff
path: root/contrib/llvm-project/llvm/lib/Transforms/Scalar/SpeculateAroundPHIs.cpp
blob: 8258b92a716d2b8ad2b2a5367eb23b5109c87861 (plain) (blame)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
//===- SpeculateAroundPHIs.cpp --------------------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//

#include "llvm/Transforms/Scalar/SpeculateAroundPHIs.h"
#include "llvm/ADT/PostOrderIterator.h"
#include "llvm/ADT/Sequence.h"
#include "llvm/ADT/SetVector.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/Analysis/ValueTracking.h"
#include "llvm/IR/BasicBlock.h"
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/IntrinsicInst.h"
#include "llvm/Support/Debug.h"
#include "llvm/Transforms/Utils/BasicBlockUtils.h"

using namespace llvm;

#define DEBUG_TYPE "spec-phis"

STATISTIC(NumPHIsSpeculated, "Number of PHI nodes we speculated around");
STATISTIC(NumEdgesSplit,
          "Number of critical edges which were split for speculation");
STATISTIC(NumSpeculatedInstructions,
          "Number of instructions we speculated around the PHI nodes");
STATISTIC(NumNewRedundantInstructions,
          "Number of new, redundant instructions inserted");

/// Check whether speculating the users of a PHI node around the PHI
/// will be safe.
///
/// This checks both that all of the users are safe and also that all of their
/// operands are either recursively safe or already available along an incoming
/// edge to the PHI.
///
/// This routine caches both all the safe nodes explored in `PotentialSpecSet`
/// and the chain of nodes that definitively reach any unsafe node in
/// `UnsafeSet`. By preserving these between repeated calls to this routine for
/// PHIs in the same basic block, the exploration here can be reused. However,
/// these caches must no be reused for PHIs in a different basic block as they
/// reflect what is available along incoming edges.
static bool
isSafeToSpeculatePHIUsers(PHINode &PN, DominatorTree &DT,
                          SmallPtrSetImpl<Instruction *> &PotentialSpecSet,
                          SmallPtrSetImpl<Instruction *> &UnsafeSet) {
  auto *PhiBB = PN.getParent();
  SmallPtrSet<Instruction *, 4> Visited;
  SmallVector<std::pair<Instruction *, User::value_op_iterator>, 16> DFSStack;

  // Walk each user of the PHI node.
  for (Use &U : PN.uses()) {
    auto *UI = cast<Instruction>(U.getUser());

    // Ensure the use post-dominates the PHI node. This ensures that, in the
    // absence of unwinding, the use will actually be reached.
    // FIXME: We use a blunt hammer of requiring them to be in the same basic
    // block. We should consider using actual post-dominance here in the
    // future.
    if (UI->getParent() != PhiBB) {
      LLVM_DEBUG(dbgs() << "  Unsafe: use in a different BB: " << *UI << "\n");
      return false;
    }

    if (const auto *CS = dyn_cast<CallBase>(UI)) {
      if (CS->isConvergent() || CS->cannotDuplicate()) {
        LLVM_DEBUG(dbgs() << "  Unsafe: convergent "
                   "callsite cannot de duplicated: " << *UI << '\n');
        return false;
      }
    }

    // FIXME: This check is much too conservative. We're not going to move these
    // instructions onto new dynamic paths through the program unless there is
    // a call instruction between the use and the PHI node. And memory isn't
    // changing unless there is a store in that same sequence. We should
    // probably change this to do at least a limited scan of the intervening
    // instructions and allow handling stores in easily proven safe cases.
    if (mayBeMemoryDependent(*UI)) {
      LLVM_DEBUG(dbgs() << "  Unsafe: can't speculate use: " << *UI << "\n");
      return false;
    }

    // Now do a depth-first search of everything these users depend on to make
    // sure they are transitively safe. This is a depth-first search, but we
    // check nodes in preorder to minimize the amount of checking.
    Visited.insert(UI);
    DFSStack.push_back({UI, UI->value_op_begin()});
    do {
      User::value_op_iterator OpIt;
      std::tie(UI, OpIt) = DFSStack.pop_back_val();

      while (OpIt != UI->value_op_end()) {
        auto *OpI = dyn_cast<Instruction>(*OpIt);
        // Increment to the next operand for whenever we continue.
        ++OpIt;
        // No need to visit non-instructions, which can't form dependencies.
        if (!OpI)
          continue;

        // Now do the main pre-order checks that this operand is a viable
        // dependency of something we want to speculate.

        // First do a few checks for instructions that won't require
        // speculation at all because they are trivially available on the
        // incoming edge (either through dominance or through an incoming value
        // to a PHI).
        //
        // The cases in the current block will be trivially dominated by the
        // edge.
        auto *ParentBB = OpI->getParent();
        if (ParentBB == PhiBB) {
          if (isa<PHINode>(OpI)) {
            // We can trivially map through phi nodes in the same block.
            continue;
          }
        } else if (DT.dominates(ParentBB, PhiBB)) {
          // Instructions from dominating blocks are already available.
          continue;
        }

        // Once we know that we're considering speculating the operand, check
        // if we've already explored this subgraph and found it to be safe.
        if (PotentialSpecSet.count(OpI))
          continue;

        // If we've already explored this subgraph and found it unsafe, bail.
        // If when we directly test whether this is safe it fails, bail.
        if (UnsafeSet.count(OpI) || ParentBB != PhiBB ||
            mayBeMemoryDependent(*OpI)) {
          LLVM_DEBUG(dbgs() << "  Unsafe: can't speculate transitive use: "
                            << *OpI << "\n");
          // Record the stack of instructions which reach this node as unsafe
          // so we prune subsequent searches.
          UnsafeSet.insert(OpI);
          for (auto &StackPair : DFSStack) {
            Instruction *I = StackPair.first;
            UnsafeSet.insert(I);
          }
          return false;
        }

        // Skip any operands we're already recursively checking.
        if (!Visited.insert(OpI).second)
          continue;

        // Push onto the stack and descend. We can directly continue this
        // loop when ascending.
        DFSStack.push_back({UI, OpIt});
        UI = OpI;
        OpIt = OpI->value_op_begin();
      }

      // This node and all its operands are safe. Go ahead and cache that for
      // reuse later.
      PotentialSpecSet.insert(UI);

      // Continue with the next node on the stack.
    } while (!DFSStack.empty());
  }

#ifndef NDEBUG
  // Every visited operand should have been marked as safe for speculation at
  // this point. Verify this and return success.
  for (auto *I : Visited)
    assert(PotentialSpecSet.count(I) &&
           "Failed to mark a visited instruction as safe!");
#endif
  return true;
}

/// Check whether, in isolation, a given PHI node is both safe and profitable
/// to speculate users around.
///
/// This handles checking whether there are any constant operands to a PHI
/// which could represent a useful speculation candidate, whether the users of
/// the PHI are safe to speculate including all their transitive dependencies,
/// and whether after speculation there will be some cost savings (profit) to
/// folding the operands into the users of the PHI node. Returns true if both
/// safe and profitable with relevant cost savings updated in the map and with
/// an update to the `PotentialSpecSet`. Returns false if either safety or
/// profitability are absent. Some new entries may be made to the
/// `PotentialSpecSet` even when this routine returns false, but they remain
/// conservatively correct.
///
/// The profitability check here is a local one, but it checks this in an
/// interesting way. Beyond checking that the total cost of materializing the
/// constants will be less than the cost of folding them into their users, it
/// also checks that no one incoming constant will have a higher cost when
/// folded into its users rather than materialized. This higher cost could
/// result in a dynamic *path* that is more expensive even when the total cost
/// is lower. Currently, all of the interesting cases where this optimization
/// should fire are ones where it is a no-loss operation in this sense. If we
/// ever want to be more aggressive here, we would need to balance the
/// different incoming edges' cost by looking at their respective
/// probabilities.
static bool isSafeAndProfitableToSpeculateAroundPHI(
    PHINode &PN, SmallDenseMap<PHINode *, int, 16> &CostSavingsMap,
    SmallPtrSetImpl<Instruction *> &PotentialSpecSet,
    SmallPtrSetImpl<Instruction *> &UnsafeSet, DominatorTree &DT,
    TargetTransformInfo &TTI) {
  // First see whether there is any cost savings to speculating around this
  // PHI, and build up a map of the constant inputs to how many times they
  // occur.
  bool NonFreeMat = false;
  struct CostsAndCount {
    int MatCost = TargetTransformInfo::TCC_Free;
    int FoldedCost = TargetTransformInfo::TCC_Free;
    int Count = 0;
  };
  SmallDenseMap<ConstantInt *, CostsAndCount, 16> CostsAndCounts;
  SmallPtrSet<BasicBlock *, 16> IncomingConstantBlocks;
  for (int i : llvm::seq<int>(0, PN.getNumIncomingValues())) {
    auto *IncomingC = dyn_cast<ConstantInt>(PN.getIncomingValue(i));
    if (!IncomingC)
      continue;

    // Only visit each incoming edge with a constant input once.
    if (!IncomingConstantBlocks.insert(PN.getIncomingBlock(i)).second)
      continue;

    auto InsertResult = CostsAndCounts.insert({IncomingC, {}});
    // Count how many edges share a given incoming costant.
    ++InsertResult.first->second.Count;
    // Only compute the cost the first time we see a particular constant.
    if (!InsertResult.second)
      continue;

    int &MatCost = InsertResult.first->second.MatCost;
    MatCost = TTI.getIntImmCost(IncomingC->getValue(), IncomingC->getType(),
                                TargetTransformInfo::TCK_SizeAndLatency);
    NonFreeMat |= MatCost != TTI.TCC_Free;
  }
  if (!NonFreeMat) {
    LLVM_DEBUG(dbgs() << "    Free: " << PN << "\n");
    // No profit in free materialization.
    return false;
  }

  // Now check that the uses of this PHI can actually be speculated,
  // otherwise we'll still have to materialize the PHI value.
  if (!isSafeToSpeculatePHIUsers(PN, DT, PotentialSpecSet, UnsafeSet)) {
    LLVM_DEBUG(dbgs() << "    Unsafe PHI: " << PN << "\n");
    return false;
  }

  // Compute how much (if any) savings are available by speculating around this
  // PHI.
  for (Use &U : PN.uses()) {
    auto *UserI = cast<Instruction>(U.getUser());
    // Now check whether there is any savings to folding the incoming constants
    // into this use.
    unsigned Idx = U.getOperandNo();

    // If we have a binary operator that is commutative, an actual constant
    // operand would end up on the RHS, so pretend the use of the PHI is on the
    // RHS.
    //
    // Technically, this is a bit weird if *both* operands are PHIs we're
    // speculating. But if that is the case, giving an "optimistic" cost isn't
    // a bad thing because after speculation it will constant fold. And
    // moreover, such cases should likely have been constant folded already by
    // some other pass, so we shouldn't worry about "modeling" them terribly
    // accurately here. Similarly, if the other operand is a constant, it still
    // seems fine to be "optimistic" in our cost modeling, because when the
    // incoming operand from the PHI node is also a constant, we will end up
    // constant folding.
    if (UserI->isBinaryOp() && UserI->isCommutative() && Idx != 1)
      // Assume we will commute the constant to the RHS to be canonical.
      Idx = 1;

    // Get the intrinsic ID if this user is an intrinsic.
    Intrinsic::ID IID = Intrinsic::not_intrinsic;
    if (auto *UserII = dyn_cast<IntrinsicInst>(UserI))
      IID = UserII->getIntrinsicID();

    for (auto &IncomingConstantAndCostsAndCount : CostsAndCounts) {
      ConstantInt *IncomingC = IncomingConstantAndCostsAndCount.first;
      int MatCost = IncomingConstantAndCostsAndCount.second.MatCost;
      int &FoldedCost = IncomingConstantAndCostsAndCount.second.FoldedCost;
      if (IID)
        FoldedCost +=
          TTI.getIntImmCostIntrin(IID, Idx, IncomingC->getValue(),
                                  IncomingC->getType(),
                                  TargetTransformInfo::TCK_SizeAndLatency);
      else
        FoldedCost +=
            TTI.getIntImmCostInst(UserI->getOpcode(), Idx,
                                  IncomingC->getValue(), IncomingC->getType(),
                                  TargetTransformInfo::TCK_SizeAndLatency);

      // If we accumulate more folded cost for this incoming constant than
      // materialized cost, then we'll regress any edge with this constant so
      // just bail. We're only interested in cases where folding the incoming
      // constants is at least break-even on all paths.
      if (FoldedCost > MatCost) {
        LLVM_DEBUG(dbgs() << "  Not profitable to fold imm: " << *IncomingC
                          << "\n"
                             "    Materializing cost:    "
                          << MatCost
                          << "\n"
                             "    Accumulated folded cost: "
                          << FoldedCost << "\n");
        return false;
      }
    }
  }

  // Compute the total cost savings afforded by this PHI node.
  int TotalMatCost = TTI.TCC_Free, TotalFoldedCost = TTI.TCC_Free;
  for (auto IncomingConstantAndCostsAndCount : CostsAndCounts) {
    int MatCost = IncomingConstantAndCostsAndCount.second.MatCost;
    int FoldedCost = IncomingConstantAndCostsAndCount.second.FoldedCost;
    int Count = IncomingConstantAndCostsAndCount.second.Count;

    TotalMatCost += MatCost * Count;
    TotalFoldedCost += FoldedCost * Count;
  }
  assert(TotalFoldedCost <= TotalMatCost && "If each constant's folded cost is "
                                            "less that its materialized cost, "
                                            "the sum must be as well.");

  LLVM_DEBUG(dbgs() << "    Cost savings " << (TotalMatCost - TotalFoldedCost)
                    << ": " << PN << "\n");
  CostSavingsMap[&PN] = TotalMatCost - TotalFoldedCost;
  return true;
}

/// Simple helper to walk all the users of a list of phis depth first, and call
/// a visit function on each one in post-order.
///
/// All of the PHIs should be in the same basic block, and this is primarily
/// used to make a single depth-first walk across their collective users
/// without revisiting any subgraphs. Callers should provide a fast, idempotent
/// callable to test whether a node has been visited and the more important
/// callable to actually visit a particular node.
///
/// Depth-first and postorder here refer to the *operand* graph -- we start
/// from a collection of users of PHI nodes and walk "up" the operands
/// depth-first.
template <typename IsVisitedT, typename VisitT>
static void visitPHIUsersAndDepsInPostOrder(ArrayRef<PHINode *> PNs,
                                            IsVisitedT IsVisited,
                                            VisitT Visit) {
  SmallVector<std::pair<Instruction *, User::value_op_iterator>, 16> DFSStack;
  for (auto *PN : PNs)
    for (Use &U : PN->uses()) {
      auto *UI = cast<Instruction>(U.getUser());
      if (IsVisited(UI))
        // Already visited this user, continue across the roots.
        continue;

      // Otherwise, walk the operand graph depth-first and visit each
      // dependency in postorder.
      DFSStack.push_back({UI, UI->value_op_begin()});
      do {
        User::value_op_iterator OpIt;
        std::tie(UI, OpIt) = DFSStack.pop_back_val();
        while (OpIt != UI->value_op_end()) {
          auto *OpI = dyn_cast<Instruction>(*OpIt);
          // Increment to the next operand for whenever we continue.
          ++OpIt;
          // No need to visit non-instructions, which can't form dependencies,
          // or instructions outside of our potential dependency set that we
          // were given. Finally, if we've already visited the node, continue
          // to the next.
          if (!OpI || IsVisited(OpI))
            continue;

          // Push onto the stack and descend. We can directly continue this
          // loop when ascending.
          DFSStack.push_back({UI, OpIt});
          UI = OpI;
          OpIt = OpI->value_op_begin();
        }

        // Finished visiting children, visit this node.
        assert(!IsVisited(UI) && "Should not have already visited a node!");
        Visit(UI);
      } while (!DFSStack.empty());
    }
}

/// Find profitable PHIs to speculate.
///
/// For a PHI node to be profitable, we need the cost of speculating its users
/// (and their dependencies) to not exceed the savings of folding the PHI's
/// constant operands into the speculated users.
///
/// Computing this is surprisingly challenging. Because users of two different
/// PHI nodes can depend on each other or on common other instructions, it may
/// be profitable to speculate two PHI nodes together even though neither one
/// in isolation is profitable. The straightforward way to find all the
/// profitable PHIs would be to check each combination of PHIs' cost, but this
/// is exponential in complexity.
///
/// Even if we assume that we only care about cases where we can consider each
/// PHI node in isolation (rather than considering cases where none are
/// profitable in isolation but some subset are profitable as a set), we still
/// have a challenge. The obvious way to find all individually profitable PHIs
/// is to iterate until reaching a fixed point, but this will be quadratic in
/// complexity. =/
///
/// This code currently uses a linear-to-compute order for a greedy approach.
/// It won't find cases where a set of PHIs must be considered together, but it
/// handles most cases of order dependence without quadratic iteration. The
/// specific order used is the post-order across the operand DAG. When the last
/// user of a PHI is visited in this postorder walk, we check it for
/// profitability.
///
/// There is an orthogonal extra complexity to all of this: computing the cost
/// itself can easily become a linear computation making everything again (at
/// best) quadratic. Using a postorder over the operand graph makes it
/// particularly easy to avoid this through dynamic programming. As we do the
/// postorder walk, we build the transitive cost of that subgraph. It is also
/// straightforward to then update these costs when we mark a PHI for
/// speculation so that subsequent PHIs don't re-pay the cost of already
/// speculated instructions.
static SmallVector<PHINode *, 16>
findProfitablePHIs(ArrayRef<PHINode *> PNs,
                   const SmallDenseMap<PHINode *, int, 16> &CostSavingsMap,
                   const SmallPtrSetImpl<Instruction *> &PotentialSpecSet,
                   int NumPreds, DominatorTree &DT, TargetTransformInfo &TTI) {
  SmallVector<PHINode *, 16> SpecPNs;

  // First, establish a reverse mapping from immediate users of the PHI nodes
  // to the nodes themselves, and count how many users each PHI node has in
  // a way we can update while processing them.
  SmallDenseMap<Instruction *, TinyPtrVector<PHINode *>, 16> UserToPNMap;
  SmallDenseMap<PHINode *, int, 16> PNUserCountMap;
  SmallPtrSet<Instruction *, 16> UserSet;
  for (auto *PN : PNs) {
    assert(UserSet.empty() && "Must start with an empty user set!");
    for (Use &U : PN->uses())
      UserSet.insert(cast<Instruction>(U.getUser()));
    PNUserCountMap[PN] = UserSet.size();
    for (auto *UI : UserSet)
      UserToPNMap.insert({UI, {}}).first->second.push_back(PN);
    UserSet.clear();
  }

  // Now do a DFS across the operand graph of the users, computing cost as we
  // go and when all costs for a given PHI are known, checking that PHI for
  // profitability.
  SmallDenseMap<Instruction *, int, 16> SpecCostMap;
  visitPHIUsersAndDepsInPostOrder(
      PNs,
      /*IsVisited*/
      [&](Instruction *I) {
        // We consider anything that isn't potentially speculated to be
        // "visited" as it is already handled. Similarly, anything that *is*
        // potentially speculated but for which we have an entry in our cost
        // map, we're done.
        return !PotentialSpecSet.count(I) || SpecCostMap.count(I);
      },
      /*Visit*/
      [&](Instruction *I) {
        // We've fully visited the operands, so sum their cost with this node
        // and update the cost map.
        int Cost = TTI.TCC_Free;
        for (Value *OpV : I->operand_values())
          if (auto *OpI = dyn_cast<Instruction>(OpV)) {
            auto CostMapIt = SpecCostMap.find(OpI);
            if (CostMapIt != SpecCostMap.end())
              Cost += CostMapIt->second;
          }
        Cost += TTI.getUserCost(I, TargetTransformInfo::TCK_SizeAndLatency);
        bool Inserted = SpecCostMap.insert({I, Cost}).second;
        (void)Inserted;
        assert(Inserted && "Must not re-insert a cost during the DFS!");

        // Now check if this node had a corresponding PHI node using it. If so,
        // we need to decrement the outstanding user count for it.
        auto UserPNsIt = UserToPNMap.find(I);
        if (UserPNsIt == UserToPNMap.end())
          return;
        auto &UserPNs = UserPNsIt->second;
        auto UserPNsSplitIt = std::stable_partition(
            UserPNs.begin(), UserPNs.end(), [&](PHINode *UserPN) {
              int &PNUserCount = PNUserCountMap.find(UserPN)->second;
              assert(
                  PNUserCount > 0 &&
                  "Should never re-visit a PN after its user count hits zero!");
              --PNUserCount;
              return PNUserCount != 0;
            });

        // FIXME: Rather than one at a time, we should sum the savings as the
        // cost will be completely shared.
        SmallVector<Instruction *, 16> SpecWorklist;
        for (auto *PN : llvm::make_range(UserPNsSplitIt, UserPNs.end())) {
          int SpecCost = TTI.TCC_Free;
          for (Use &U : PN->uses())
            SpecCost +=
                SpecCostMap.find(cast<Instruction>(U.getUser()))->second;
          SpecCost *= (NumPreds - 1);
          // When the user count of a PHI node hits zero, we should check its
          // profitability. If profitable, we should mark it for speculation
          // and zero out the cost of everything it depends on.
          int CostSavings = CostSavingsMap.find(PN)->second;
          if (SpecCost > CostSavings) {
            LLVM_DEBUG(dbgs() << "  Not profitable, speculation cost: " << *PN
                              << "\n"
                                 "    Cost savings:     "
                              << CostSavings
                              << "\n"
                                 "    Speculation cost: "
                              << SpecCost << "\n");
            continue;
          }

          // We're going to speculate this user-associated PHI. Copy it out and
          // add its users to the worklist to update their cost.
          SpecPNs.push_back(PN);
          for (Use &U : PN->uses()) {
            auto *UI = cast<Instruction>(U.getUser());
            auto CostMapIt = SpecCostMap.find(UI);
            if (CostMapIt->second == 0)
              continue;
            // Zero out this cost entry to avoid duplicates.
            CostMapIt->second = 0;
            SpecWorklist.push_back(UI);
          }
        }

        // Now walk all the operands of the users in the worklist transitively
        // to zero out all the memoized costs.
        while (!SpecWorklist.empty()) {
          Instruction *SpecI = SpecWorklist.pop_back_val();
          assert(SpecCostMap.find(SpecI)->second == 0 &&
                 "Didn't zero out a cost!");

          // Walk the operands recursively to zero out their cost as well.
          for (auto *OpV : SpecI->operand_values()) {
            auto *OpI = dyn_cast<Instruction>(OpV);
            if (!OpI)
              continue;
            auto CostMapIt = SpecCostMap.find(OpI);
            if (CostMapIt == SpecCostMap.end() || CostMapIt->second == 0)
              continue;
            CostMapIt->second = 0;
            SpecWorklist.push_back(OpI);
          }
        }
      });

  return SpecPNs;
}

/// Speculate users around a set of PHI nodes.
///
/// This routine does the actual speculation around a set of PHI nodes where we
/// have determined this to be both safe and profitable.
///
/// This routine handles any spliting of critical edges necessary to create
/// a safe block to speculate into as well as cloning the instructions and
/// rewriting all uses.
static void speculatePHIs(ArrayRef<PHINode *> SpecPNs,
                          SmallPtrSetImpl<Instruction *> &PotentialSpecSet,
                          SmallSetVector<BasicBlock *, 16> &PredSet,
                          DominatorTree &DT) {
  LLVM_DEBUG(dbgs() << "  Speculating around " << SpecPNs.size() << " PHIs!\n");
  NumPHIsSpeculated += SpecPNs.size();

  // Split any critical edges so that we have a block to hoist into.
  auto *ParentBB = SpecPNs[0]->getParent();
  SmallVector<BasicBlock *, 16> SpecPreds;
  SpecPreds.reserve(PredSet.size());
  for (auto *PredBB : PredSet) {
    auto *NewPredBB = SplitCriticalEdge(
        PredBB, ParentBB,
        CriticalEdgeSplittingOptions(&DT).setMergeIdenticalEdges());
    if (NewPredBB) {
      ++NumEdgesSplit;
      LLVM_DEBUG(dbgs() << "  Split critical edge from: " << PredBB->getName()
                        << "\n");
      SpecPreds.push_back(NewPredBB);
    } else {
      assert(PredBB->getSingleSuccessor() == ParentBB &&
             "We need a non-critical predecessor to speculate into.");
      assert(!isa<InvokeInst>(PredBB->getTerminator()) &&
             "Cannot have a non-critical invoke!");

      // Already non-critical, use existing pred.
      SpecPreds.push_back(PredBB);
    }
  }

  SmallPtrSet<Instruction *, 16> SpecSet;
  SmallVector<Instruction *, 16> SpecList;
  visitPHIUsersAndDepsInPostOrder(SpecPNs,
                                  /*IsVisited*/
                                  [&](Instruction *I) {
                                    // This is visited if we don't need to
                                    // speculate it or we already have
                                    // speculated it.
                                    return !PotentialSpecSet.count(I) ||
                                           SpecSet.count(I);
                                  },
                                  /*Visit*/
                                  [&](Instruction *I) {
                                    // All operands scheduled, schedule this
                                    // node.
                                    SpecSet.insert(I);
                                    SpecList.push_back(I);
                                  });

  int NumSpecInsts = SpecList.size() * SpecPreds.size();
  int NumRedundantInsts = NumSpecInsts - SpecList.size();
  LLVM_DEBUG(dbgs() << "  Inserting " << NumSpecInsts
                    << " speculated instructions, " << NumRedundantInsts
                    << " redundancies\n");
  NumSpeculatedInstructions += NumSpecInsts;
  NumNewRedundantInstructions += NumRedundantInsts;

  // Each predecessor is numbered by its index in `SpecPreds`, so for each
  // instruction we speculate, the speculated instruction is stored in that
  // index of the vector associated with the original instruction. We also
  // store the incoming values for each predecessor from any PHIs used.
  SmallDenseMap<Instruction *, SmallVector<Value *, 2>, 16> SpeculatedValueMap;

  // Inject the synthetic mappings to rewrite PHIs to the appropriate incoming
  // value. This handles both the PHIs we are speculating around and any other
  // PHIs that happen to be used.
  for (auto *OrigI : SpecList)
    for (auto *OpV : OrigI->operand_values()) {
      auto *OpPN = dyn_cast<PHINode>(OpV);
      if (!OpPN || OpPN->getParent() != ParentBB)
        continue;

      auto InsertResult = SpeculatedValueMap.insert({OpPN, {}});
      if (!InsertResult.second)
        continue;

      auto &SpeculatedVals = InsertResult.first->second;

      // Populating our structure for mapping is particularly annoying because
      // finding an incoming value for a particular predecessor block in a PHI
      // node is a linear time operation! To avoid quadratic behavior, we build
      // a map for this PHI node's incoming values and then translate it into
      // the more compact representation used below.
      SmallDenseMap<BasicBlock *, Value *, 16> IncomingValueMap;
      for (int i : llvm::seq<int>(0, OpPN->getNumIncomingValues()))
        IncomingValueMap[OpPN->getIncomingBlock(i)] = OpPN->getIncomingValue(i);

      for (auto *PredBB : SpecPreds)
        SpeculatedVals.push_back(IncomingValueMap.find(PredBB)->second);
    }

  // Speculate into each predecessor.
  for (int PredIdx : llvm::seq<int>(0, SpecPreds.size())) {
    auto *PredBB = SpecPreds[PredIdx];
    assert(PredBB->getSingleSuccessor() == ParentBB &&
           "We need a non-critical predecessor to speculate into.");

    for (auto *OrigI : SpecList) {
      auto *NewI = OrigI->clone();
      NewI->setName(Twine(OrigI->getName()) + "." + Twine(PredIdx));
      NewI->insertBefore(PredBB->getTerminator());

      // Rewrite all the operands to the previously speculated instructions.
      // Because we're walking in-order, the defs must precede the uses and we
      // should already have these mappings.
      for (Use &U : NewI->operands()) {
        auto *OpI = dyn_cast<Instruction>(U.get());
        if (!OpI)
          continue;
        auto MapIt = SpeculatedValueMap.find(OpI);
        if (MapIt == SpeculatedValueMap.end())
          continue;
        const auto &SpeculatedVals = MapIt->second;
        assert(SpeculatedVals[PredIdx] &&
               "Must have a speculated value for this predecessor!");
        assert(SpeculatedVals[PredIdx]->getType() == OpI->getType() &&
               "Speculated value has the wrong type!");

        // Rewrite the use to this predecessor's speculated instruction.
        U.set(SpeculatedVals[PredIdx]);
      }

      // Commute instructions which now have a constant in the LHS but not the
      // RHS.
      if (NewI->isBinaryOp() && NewI->isCommutative() &&
          isa<Constant>(NewI->getOperand(0)) &&
          !isa<Constant>(NewI->getOperand(1)))
        NewI->getOperandUse(0).swap(NewI->getOperandUse(1));

      SpeculatedValueMap[OrigI].push_back(NewI);
      assert(SpeculatedValueMap[OrigI][PredIdx] == NewI &&
             "Mismatched speculated instruction index!");
    }
  }

  // Walk the speculated instruction list and if they have uses, insert a PHI
  // for them from the speculated versions, and replace the uses with the PHI.
  // Then erase the instructions as they have been fully speculated. The walk
  // needs to be in reverse so that we don't think there are users when we'll
  // actually eventually remove them later.
  IRBuilder<> IRB(SpecPNs[0]);
  for (auto *OrigI : llvm::reverse(SpecList)) {
    // Check if we need a PHI for any remaining users and if so, insert it.
    if (!OrigI->use_empty()) {
      auto *SpecIPN = IRB.CreatePHI(OrigI->getType(), SpecPreds.size(),
                                    Twine(OrigI->getName()) + ".phi");
      // Add the incoming values we speculated.
      auto &SpeculatedVals = SpeculatedValueMap.find(OrigI)->second;
      for (int PredIdx : llvm::seq<int>(0, SpecPreds.size()))
        SpecIPN->addIncoming(SpeculatedVals[PredIdx], SpecPreds[PredIdx]);

      // And replace the uses with the PHI node.
      OrigI->replaceAllUsesWith(SpecIPN);
    }

    // It is important to immediately erase this so that it stops using other
    // instructions. This avoids inserting needless PHIs of them.
    OrigI->eraseFromParent();
  }

  // All of the uses of the speculated phi nodes should be removed at this
  // point, so erase them.
  for (auto *SpecPN : SpecPNs) {
    assert(SpecPN->use_empty() && "All users should have been speculated!");
    SpecPN->eraseFromParent();
  }
}

/// Try to speculate around a series of PHIs from a single basic block.
///
/// This routine checks whether any of these PHIs are profitable to speculate
/// users around. If safe and profitable, it does the speculation. It returns
/// true when at least some speculation occurs.
static bool tryToSpeculatePHIs(SmallVectorImpl<PHINode *> &PNs,
                               DominatorTree &DT, TargetTransformInfo &TTI) {
  LLVM_DEBUG(dbgs() << "Evaluating phi nodes for speculation:\n");

  // Savings in cost from speculating around a PHI node.
  SmallDenseMap<PHINode *, int, 16> CostSavingsMap;

  // Remember the set of instructions that are candidates for speculation so
  // that we can quickly walk things within that space. This prunes out
  // instructions already available along edges, etc.
  SmallPtrSet<Instruction *, 16> PotentialSpecSet;

  // Remember the set of instructions that are (transitively) unsafe to
  // speculate into the incoming edges of this basic block. This avoids
  // recomputing them for each PHI node we check. This set is specific to this
  // block though as things are pruned out of it based on what is available
  // along incoming edges.
  SmallPtrSet<Instruction *, 16> UnsafeSet;

  // For each PHI node in this block, check whether there are immediate folding
  // opportunities from speculation, and whether that speculation will be
  // valid. This determise the set of safe PHIs to speculate.
  PNs.erase(llvm::remove_if(PNs,
                            [&](PHINode *PN) {
                              return !isSafeAndProfitableToSpeculateAroundPHI(
                                  *PN, CostSavingsMap, PotentialSpecSet,
                                  UnsafeSet, DT, TTI);
                            }),
            PNs.end());
  // If no PHIs were profitable, skip.
  if (PNs.empty()) {
    LLVM_DEBUG(dbgs() << "  No safe and profitable PHIs found!\n");
    return false;
  }

  // We need to know how much speculation will cost which is determined by how
  // many incoming edges will need a copy of each speculated instruction.
  SmallSetVector<BasicBlock *, 16> PredSet;
  for (auto *PredBB : PNs[0]->blocks()) {
    if (!PredSet.insert(PredBB))
      continue;

    // We cannot speculate when a predecessor is an indirect branch.
    // FIXME: We also can't reliably create a non-critical edge block for
    // speculation if the predecessor is an invoke. This doesn't seem
    // fundamental and we should probably be splitting critical edges
    // differently.
    const auto *TermInst = PredBB->getTerminator();
    if (isa<IndirectBrInst>(TermInst) ||
        isa<InvokeInst>(TermInst) ||
        isa<CallBrInst>(TermInst)) {
      LLVM_DEBUG(dbgs() << "  Invalid: predecessor terminator: "
                        << PredBB->getName() << "\n");
      return false;
    }
  }
  if (PredSet.size() < 2) {
    LLVM_DEBUG(dbgs() << "  Unimportant: phi with only one predecessor\n");
    return false;
  }

  SmallVector<PHINode *, 16> SpecPNs = findProfitablePHIs(
      PNs, CostSavingsMap, PotentialSpecSet, PredSet.size(), DT, TTI);
  if (SpecPNs.empty())
    // Nothing to do.
    return false;

  speculatePHIs(SpecPNs, PotentialSpecSet, PredSet, DT);
  return true;
}

PreservedAnalyses SpeculateAroundPHIsPass::run(Function &F,
                                               FunctionAnalysisManager &AM) {
  auto &DT = AM.getResult<DominatorTreeAnalysis>(F);
  auto &TTI = AM.getResult<TargetIRAnalysis>(F);

  bool Changed = false;
  for (auto *BB : ReversePostOrderTraversal<Function *>(&F)) {
    SmallVector<PHINode *, 16> PNs;
    auto BBI = BB->begin();
    while (auto *PN = dyn_cast<PHINode>(&*BBI)) {
      PNs.push_back(PN);
      ++BBI;
    }

    if (PNs.empty())
      continue;

    Changed |= tryToSpeculatePHIs(PNs, DT, TTI);
  }

  if (!Changed)
    return PreservedAnalyses::all();

  PreservedAnalyses PA;
  return PA;
}