path: root/lib/Target/WebAssembly
Diffstat (limited to 'lib/Target/WebAssembly')
-rw-r--r-- lib/Target/WebAssembly/CMakeLists.txt | 7
-rw-r--r-- lib/Target/WebAssembly/Disassembler/Makefile | 16
-rw-r--r-- lib/Target/WebAssembly/Disassembler/WebAssemblyDisassembler.cpp | 6
-rw-r--r-- lib/Target/WebAssembly/InstPrinter/Makefile | 16
-rw-r--r-- lib/Target/WebAssembly/InstPrinter/WebAssemblyInstPrinter.cpp | 52
-rw-r--r-- lib/Target/WebAssembly/InstPrinter/WebAssemblyInstPrinter.h | 5
-rw-r--r-- lib/Target/WebAssembly/MCTargetDesc/Makefile | 16
-rw-r--r-- lib/Target/WebAssembly/MCTargetDesc/WebAssemblyAsmBackend.cpp | 9
-rw-r--r-- lib/Target/WebAssembly/MCTargetDesc/WebAssemblyELFObjectWriter.cpp | 7
-rw-r--r-- lib/Target/WebAssembly/MCTargetDesc/WebAssemblyMCAsmInfo.cpp | 4
-rw-r--r-- lib/Target/WebAssembly/MCTargetDesc/WebAssemblyMCCodeEmitter.cpp | 13
-rw-r--r-- lib/Target/WebAssembly/MCTargetDesc/WebAssemblyMCTargetDesc.cpp | 19
-rw-r--r-- lib/Target/WebAssembly/MCTargetDesc/WebAssemblyMCTargetDesc.h | 56
-rw-r--r-- lib/Target/WebAssembly/MCTargetDesc/WebAssemblyTargetStreamer.cpp | 12
-rw-r--r-- lib/Target/WebAssembly/MCTargetDesc/WebAssemblyTargetStreamer.h | 9
-rw-r--r-- lib/Target/WebAssembly/Makefile | 26
-rw-r--r-- lib/Target/WebAssembly/README.txt | 95
-rw-r--r-- lib/Target/WebAssembly/Relooper.cpp | 984
-rw-r--r-- lib/Target/WebAssembly/Relooper.h | 186
-rw-r--r-- lib/Target/WebAssembly/TargetInfo/Makefile | 15
-rw-r--r-- lib/Target/WebAssembly/WebAssembly.h | 11
-rw-r--r-- lib/Target/WebAssembly/WebAssemblyArgumentMove.cpp | 14
-rw-r--r-- lib/Target/WebAssembly/WebAssemblyAsmPrinter.cpp | 86
-rw-r--r-- lib/Target/WebAssembly/WebAssemblyCFGStackify.cpp | 338
-rw-r--r-- lib/Target/WebAssembly/WebAssemblyFastISel.cpp | 1108
-rw-r--r-- lib/Target/WebAssembly/WebAssemblyFixIrreducibleControlFlow.cpp | 296
-rw-r--r-- lib/Target/WebAssembly/WebAssemblyFrameLowering.cpp | 206
-rw-r--r-- lib/Target/WebAssembly/WebAssemblyFrameLowering.h | 21
-rw-r--r-- lib/Target/WebAssembly/WebAssemblyISD.def | 2
-rw-r--r-- lib/Target/WebAssembly/WebAssemblyISelDAGToDAG.cpp | 18
-rw-r--r-- lib/Target/WebAssembly/WebAssemblyISelLowering.cpp | 361
-rw-r--r-- lib/Target/WebAssembly/WebAssemblyISelLowering.h | 25
-rw-r--r-- lib/Target/WebAssembly/WebAssemblyInstrControl.td | 34
-rw-r--r-- lib/Target/WebAssembly/WebAssemblyInstrFloat.td | 16
-rw-r--r-- lib/Target/WebAssembly/WebAssemblyInstrInfo.cpp | 64
-rw-r--r-- lib/Target/WebAssembly/WebAssemblyInstrInfo.h | 12
-rw-r--r-- lib/Target/WebAssembly/WebAssemblyInstrInfo.td | 41
-rw-r--r-- lib/Target/WebAssembly/WebAssemblyInstrInteger.td | 35
-rw-r--r-- lib/Target/WebAssembly/WebAssemblyInstrMemory.td | 608
-rw-r--r-- lib/Target/WebAssembly/WebAssemblyLowerBrUnless.cpp | 21
-rw-r--r-- lib/Target/WebAssembly/WebAssemblyMachineFunctionInfo.h | 40
-rw-r--r-- lib/Target/WebAssembly/WebAssemblyOptimizeLiveIntervals.cpp | 105
-rw-r--r-- lib/Target/WebAssembly/WebAssemblyPEI.cpp | 1066
-rw-r--r-- lib/Target/WebAssembly/WebAssemblyPeephole.cpp | 135
-rw-r--r-- lib/Target/WebAssembly/WebAssemblyPrepareForLiveIntervals.cpp | 136
-rw-r--r-- lib/Target/WebAssembly/WebAssemblyRegColoring.cpp | 4
-rw-r--r-- lib/Target/WebAssembly/WebAssemblyRegNumbering.cpp | 32
-rw-r--r-- lib/Target/WebAssembly/WebAssemblyRegStackify.cpp | 722
-rw-r--r-- lib/Target/WebAssembly/WebAssemblyRegisterInfo.cpp | 73
-rw-r--r-- lib/Target/WebAssembly/WebAssemblyReplacePhysRegs.cpp | 97
-rw-r--r-- lib/Target/WebAssembly/WebAssemblySelectionDAGInfo.h | 6
-rw-r--r-- lib/Target/WebAssembly/WebAssemblySetP2AlignOperands.cpp | 114
-rw-r--r-- lib/Target/WebAssembly/WebAssemblyStoreResults.cpp | 158
-rw-r--r-- lib/Target/WebAssembly/WebAssemblySubtarget.cpp | 12
-rw-r--r-- lib/Target/WebAssembly/WebAssemblyTargetMachine.cpp | 104
-rw-r--r-- lib/Target/WebAssembly/WebAssemblyTargetMachine.h | 4
-rw-r--r-- lib/Target/WebAssembly/WebAssemblyTargetTransformInfo.cpp | 56
-rw-r--r-- lib/Target/WebAssembly/WebAssemblyTargetTransformInfo.h | 10
-rw-r--r-- lib/Target/WebAssembly/known_gcc_test_failures.txt | 248
59 files changed, 4346 insertions, 3646 deletions
diff --git a/lib/Target/WebAssembly/CMakeLists.txt b/lib/Target/WebAssembly/CMakeLists.txt
index e5c68e598479..b2865f1a0f9e 100644
--- a/lib/Target/WebAssembly/CMakeLists.txt
+++ b/lib/Target/WebAssembly/CMakeLists.txt
@@ -10,11 +10,11 @@ tablegen(LLVM WebAssemblyGenSubtargetInfo.inc -gen-subtarget)
add_public_tablegen_target(WebAssemblyCommonTableGen)
add_llvm_target(WebAssemblyCodeGen
- Relooper.cpp
WebAssemblyArgumentMove.cpp
WebAssemblyAsmPrinter.cpp
WebAssemblyCFGStackify.cpp
WebAssemblyFastISel.cpp
+ WebAssemblyFixIrreducibleControlFlow.cpp
WebAssemblyFrameLowering.cpp
WebAssemblyISelDAGToDAG.cpp
WebAssemblyISelLowering.cpp
@@ -22,14 +22,17 @@ add_llvm_target(WebAssemblyCodeGen
WebAssemblyLowerBrUnless.cpp
WebAssemblyMachineFunctionInfo.cpp
WebAssemblyMCInstLower.cpp
+ WebAssemblyOptimizeLiveIntervals.cpp
WebAssemblyOptimizeReturned.cpp
WebAssemblyPeephole.cpp
- WebAssemblyPEI.cpp
+ WebAssemblyPrepareForLiveIntervals.cpp
WebAssemblyRegisterInfo.cpp
WebAssemblyRegColoring.cpp
WebAssemblyRegNumbering.cpp
WebAssemblyRegStackify.cpp
+ WebAssemblyReplacePhysRegs.cpp
WebAssemblySelectionDAGInfo.cpp
+ WebAssemblySetP2AlignOperands.cpp
WebAssemblyStoreResults.cpp
WebAssemblySubtarget.cpp
WebAssemblyTargetMachine.cpp
diff --git a/lib/Target/WebAssembly/Disassembler/Makefile b/lib/Target/WebAssembly/Disassembler/Makefile
deleted file mode 100644
index bcd36ba6f01f..000000000000
--- a/lib/Target/WebAssembly/Disassembler/Makefile
+++ /dev/null
@@ -1,16 +0,0 @@
-##===-- lib/Target/WebAssembly/Disassembler/Makefile -------*- Makefile -*-===##
-#
-# The LLVM Compiler Infrastructure
-#
-# This file is distributed under the University of Illinois Open Source
-# License. See LICENSE.TXT for details.
-#
-##===----------------------------------------------------------------------===##
-
-LEVEL = ../../../..
-LIBRARYNAME = LLVMWebAssemblyDisassembler
-
-# Hack: we need to include 'main' target directory to grab private headers
-CPP.Flags += -I$(PROJ_OBJ_DIR)/.. -I$(PROJ_SRC_DIR)/..
-
-include $(LEVEL)/Makefile.common
diff --git a/lib/Target/WebAssembly/Disassembler/WebAssemblyDisassembler.cpp b/lib/Target/WebAssembly/Disassembler/WebAssemblyDisassembler.cpp
index 0143b10c0ab1..c0355aef0b35 100644
--- a/lib/Target/WebAssembly/Disassembler/WebAssemblyDisassembler.cpp
+++ b/lib/Target/WebAssembly/Disassembler/WebAssemblyDisassembler.cpp
@@ -18,7 +18,7 @@
#include "WebAssembly.h"
#include "MCTargetDesc/WebAssemblyMCTargetDesc.h"
#include "llvm/MC/MCContext.h"
-#include "llvm/MC/MCDisassembler.h"
+#include "llvm/MC/MCDisassembler/MCDisassembler.h"
#include "llvm/MC/MCInst.h"
#include "llvm/MC/MCInstrInfo.h"
#include "llvm/MC/MCSubtargetInfo.h"
@@ -93,6 +93,7 @@ MCDisassembler::DecodeStatus WebAssemblyDisassembler::getInstruction(
const MCOperandInfo &Info = Desc.OpInfo[i];
switch (Info.OperandType) {
case MCOI::OPERAND_IMMEDIATE:
+ case WebAssembly::OPERAND_P2ALIGN:
case WebAssembly::OPERAND_BASIC_BLOCK: {
if (Pos + sizeof(uint64_t) > Bytes.size())
return MCDisassembler::Fail;
@@ -109,7 +110,8 @@ MCDisassembler::DecodeStatus WebAssemblyDisassembler::getInstruction(
MI.addOperand(MCOperand::createReg(Reg));
break;
}
- case WebAssembly::OPERAND_FPIMM: {
+ case WebAssembly::OPERAND_FP32IMM:
+ case WebAssembly::OPERAND_FP64IMM: {
// TODO: MC converts all floating point immediate operands to double.
// This is fine for numeric values, but may cause NaNs to change bits.
if (Pos + sizeof(uint64_t) > Bytes.size())
diff --git a/lib/Target/WebAssembly/InstPrinter/Makefile b/lib/Target/WebAssembly/InstPrinter/Makefile
deleted file mode 100644
index 87534379f796..000000000000
--- a/lib/Target/WebAssembly/InstPrinter/Makefile
+++ /dev/null
@@ -1,16 +0,0 @@
-##===- lib/Target/WebAssembly/AsmPrinter/Makefile ----------*- Makefile -*-===##
-#
-# The LLVM Compiler Infrastructure
-#
-# This file is distributed under the University of Illinois Open Source
-# License. See LICENSE.TXT for details.
-#
-##===----------------------------------------------------------------------===##
-
-LEVEL = ../../../..
-LIBRARYNAME = LLVMWebAssemblyAsmPrinter
-
-# Hack: we need to include 'main' wasm target directory to grab private headers
-CPP.Flags += -I$(PROJ_OBJ_DIR)/.. -I$(PROJ_SRC_DIR)/..
-
-include $(LEVEL)/Makefile.common
diff --git a/lib/Target/WebAssembly/InstPrinter/WebAssemblyInstPrinter.cpp b/lib/Target/WebAssembly/InstPrinter/WebAssemblyInstPrinter.cpp
index 9a95150cb557..267d716dd1d0 100644
--- a/lib/Target/WebAssembly/InstPrinter/WebAssemblyInstPrinter.cpp
+++ b/lib/Target/WebAssembly/InstPrinter/WebAssemblyInstPrinter.cpp
@@ -110,14 +110,22 @@ void WebAssemblyInstPrinter::printInst(const MCInst *MI, raw_ostream &OS,
}
static std::string toString(const APFloat &FP) {
+ // Print NaNs with custom payloads specially.
+ if (FP.isNaN() &&
+ !FP.bitwiseIsEqual(APFloat::getQNaN(FP.getSemantics())) &&
+ !FP.bitwiseIsEqual(APFloat::getQNaN(FP.getSemantics(), /*Negative=*/true))) {
+ APInt AI = FP.bitcastToAPInt();
+ return
+ std::string(AI.isNegative() ? "-" : "") + "nan:0x" +
+ utohexstr(AI.getZExtValue() &
+ (AI.getBitWidth() == 32 ? INT64_C(0x007fffff) :
+ INT64_C(0x000fffffffffffff)),
+ /*LowerCase=*/true);
+ }
+
+ // Use C99's hexadecimal floating-point representation.
static const size_t BufBytes = 128;
char buf[BufBytes];
- if (FP.isNaN())
- assert((FP.bitwiseIsEqual(APFloat::getQNaN(FP.getSemantics())) ||
- FP.bitwiseIsEqual(
- APFloat::getQNaN(FP.getSemantics(), /*Negative=*/true))) &&
- "convertToHexString handles neither SNaN nor NaN payloads");
- // Use C99's hexadecimal floating-point representation.
auto Written = FP.convertToHexString(
buf, /*hexDigits=*/0, /*upperCase=*/false, APFloat::rmNearestTiesToEven);
(void)Written;
@@ -137,11 +145,11 @@ void WebAssemblyInstPrinter::printOperand(const MCInst *MI, unsigned OpNo,
if (int(WAReg) >= 0)
printRegName(O, WAReg);
else if (OpNo >= MII.get(MI->getOpcode()).getNumDefs())
- O << "$pop" << (WAReg & INT32_MAX);
+ O << "$pop" << WebAssemblyFunctionInfo::getWARegStackId(WAReg);
else if (WAReg != WebAssemblyFunctionInfo::UnusedReg)
- O << "$push" << (WAReg & INT32_MAX);
+ O << "$push" << WebAssemblyFunctionInfo::getWARegStackId(WAReg);
else
- O << "$discard";
+ O << "$drop";
// Add a '=' suffix if this is a def.
if (OpNo < MII.get(MI->getOpcode()).getNumDefs())
O << '=';
@@ -157,10 +165,20 @@ void WebAssemblyInstPrinter::printOperand(const MCInst *MI, unsigned OpNo,
// control flow stack, and it may be nice to pretty-print.
O << Op.getImm();
} else if (Op.isFPImm()) {
- assert((OpNo < MII.get(MI->getOpcode()).getNumOperands() ||
- MII.get(MI->getOpcode()).TSFlags == 0) &&
+ const MCInstrDesc &Desc = MII.get(MI->getOpcode());
+ assert(OpNo < Desc.getNumOperands() &&
+ "Unexpected floating-point immediate as a non-fixed operand");
+ assert(Desc.TSFlags == 0 &&
"WebAssembly variable_ops floating point ops don't use TSFlags");
- O << toString(APFloat(Op.getFPImm()));
+ const MCOperandInfo &Info = Desc.OpInfo[OpNo];
+ if (Info.OperandType == WebAssembly::OPERAND_FP32IMM) {
+ // TODO: MC converts all floating point immediate operands to double.
+ // This is fine for numeric values, but may cause NaNs to change bits.
+ O << toString(APFloat(float(Op.getFPImm())));
+ } else {
+ assert(Info.OperandType == WebAssembly::OPERAND_FP64IMM);
+ O << toString(APFloat(Op.getFPImm()));
+ }
} else {
assert((OpNo < MII.get(MI->getOpcode()).getNumOperands() ||
(MII.get(MI->getOpcode()).TSFlags &
@@ -172,6 +190,16 @@ void WebAssemblyInstPrinter::printOperand(const MCInst *MI, unsigned OpNo,
}
}
+void
+WebAssemblyInstPrinter::printWebAssemblyP2AlignOperand(const MCInst *MI,
+ unsigned OpNo,
+ raw_ostream &O) {
+ int64_t Imm = MI->getOperand(OpNo).getImm();
+ if (Imm == WebAssembly::GetDefaultP2Align(MI->getOpcode()))
+ return;
+ O << ":p2align=" << Imm;
+}
+
const char *llvm::WebAssembly::TypeToString(MVT Ty) {
switch (Ty.SimpleTy) {
case MVT::i32:
diff --git a/lib/Target/WebAssembly/InstPrinter/WebAssemblyInstPrinter.h b/lib/Target/WebAssembly/InstPrinter/WebAssemblyInstPrinter.h
index cd6c59a41c33..07b0f914e447 100644
--- a/lib/Target/WebAssembly/InstPrinter/WebAssemblyInstPrinter.h
+++ b/lib/Target/WebAssembly/InstPrinter/WebAssemblyInstPrinter.h
@@ -15,8 +15,9 @@
#ifndef LLVM_LIB_TARGET_WEBASSEMBLY_INSTPRINTER_WEBASSEMBLYINSTPRINTER_H
#define LLVM_LIB_TARGET_WEBASSEMBLY_INSTPRINTER_WEBASSEMBLYINSTPRINTER_H
-#include "llvm/MC/MCInstPrinter.h"
+#include "llvm/ADT/SmallVector.h"
#include "llvm/CodeGen/MachineValueType.h"
+#include "llvm/MC/MCInstPrinter.h"
namespace llvm {
@@ -36,6 +37,8 @@ public:
// Used by tblegen code.
void printOperand(const MCInst *MI, unsigned OpNo, raw_ostream &O);
+ void printWebAssemblyP2AlignOperand(const MCInst *MI, unsigned OpNo,
+ raw_ostream &O);
// Autogenerated by tblgen.
void printInstruction(const MCInst *MI, raw_ostream &O);
diff --git a/lib/Target/WebAssembly/MCTargetDesc/Makefile b/lib/Target/WebAssembly/MCTargetDesc/Makefile
deleted file mode 100644
index 11dcb4ff6075..000000000000
--- a/lib/Target/WebAssembly/MCTargetDesc/Makefile
+++ /dev/null
@@ -1,16 +0,0 @@
-##===- lib/Target/WebAssembly/TargetDesc/Makefile ----------*- Makefile -*-===##
-#
-# The LLVM Compiler Infrastructure
-#
-# This file is distributed under the University of Illinois Open Source
-# License. See LICENSE.TXT for details.
-#
-##===----------------------------------------------------------------------===##
-
-LEVEL = ../../../..
-LIBRARYNAME = LLVMWebAssemblyDesc
-
-# Hack: we need to include 'main' target directory to grab private headers
-CPP.Flags += -I$(PROJ_OBJ_DIR)/.. -I$(PROJ_SRC_DIR)/..
-
-include $(LEVEL)/Makefile.common
diff --git a/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyAsmBackend.cpp b/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyAsmBackend.cpp
index bba06f65e169..df6fb8968d56 100644
--- a/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyAsmBackend.cpp
+++ b/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyAsmBackend.cpp
@@ -55,7 +55,8 @@ public:
bool mayNeedRelaxation(const MCInst &Inst) const override { return false; }
- void relaxInstruction(const MCInst &Inst, MCInst &Res) const override {}
+ void relaxInstruction(const MCInst &Inst, const MCSubtargetInfo &STI,
+ MCInst &Res) const override {}
bool writeNopData(uint64_t Count, MCObjectWriter *OW) const override;
};
@@ -73,8 +74,10 @@ void WebAssemblyAsmBackend::applyFixup(const MCFixup &Fixup, char *Data,
unsigned DataSize, uint64_t Value,
bool IsPCRel) const {
const MCFixupKindInfo &Info = getFixupKindInfo(Fixup.getKind());
- unsigned NumBytes = RoundUpToAlignment(Info.TargetSize, 8);
- if (!Value)
+ assert(Info.Flags == 0 && "WebAssembly does not use MCFixupKindInfo flags");
+
+ unsigned NumBytes = (Info.TargetSize + 7) / 8;
+ if (Value == 0)
return; // Doesn't change encoding.
// Shift the value into position.
diff --git a/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyELFObjectWriter.cpp b/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyELFObjectWriter.cpp
index 2bb58b33934e..2146f67959b8 100644
--- a/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyELFObjectWriter.cpp
+++ b/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyELFObjectWriter.cpp
@@ -25,8 +25,8 @@ public:
WebAssemblyELFObjectWriter(bool Is64Bit, uint8_t OSABI);
protected:
- unsigned GetRelocType(const MCValue &Target, const MCFixup &Fixup,
- bool IsPCRel) const override;
+ unsigned getRelocType(MCContext &Ctx, const MCValue &Target,
+ const MCFixup &Fixup, bool IsPCRel) const override;
};
} // end anonymous namespace
@@ -35,7 +35,8 @@ WebAssemblyELFObjectWriter::WebAssemblyELFObjectWriter(bool Is64Bit,
: MCELFObjectTargetWriter(Is64Bit, OSABI, ELF::EM_WEBASSEMBLY,
/*HasRelocationAddend=*/false) {}
-unsigned WebAssemblyELFObjectWriter::GetRelocType(const MCValue &Target,
+unsigned WebAssemblyELFObjectWriter::getRelocType(MCContext &Ctx,
+ const MCValue &Target,
const MCFixup &Fixup,
bool IsPCRel) const {
// WebAssembly functions are not allocated in the address space. To resolve a
diff --git a/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyMCAsmInfo.cpp b/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyMCAsmInfo.cpp
index 02c717a92101..d8c39216c53b 100644
--- a/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyMCAsmInfo.cpp
+++ b/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyMCAsmInfo.cpp
@@ -15,7 +15,6 @@
#include "WebAssemblyMCAsmInfo.h"
#include "llvm/ADT/Triple.h"
-#include "llvm/Support/CommandLine.h"
using namespace llvm;
#define DEBUG_TYPE "wasm-mc-asm-info"
@@ -48,4 +47,7 @@ WebAssemblyMCAsmInfo::WebAssemblyMCAsmInfo(const Triple &T) {
ExceptionsType = ExceptionHandling::None;
// TODO: UseIntegratedAssembler?
+
+ // WebAssembly's stack is never executable.
+ UsesNonexecutableStackSection = false;
}
diff --git a/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyMCCodeEmitter.cpp b/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyMCCodeEmitter.cpp
index f409bd77442c..23f8b3d0e827 100644
--- a/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyMCCodeEmitter.cpp
+++ b/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyMCCodeEmitter.cpp
@@ -22,6 +22,7 @@
#include "llvm/MC/MCRegisterInfo.h"
#include "llvm/MC/MCSubtargetInfo.h"
#include "llvm/MC/MCSymbol.h"
+#include "llvm/Support/EndianStream.h"
#include "llvm/Support/raw_ostream.h"
using namespace llvm;
@@ -33,7 +34,6 @@ STATISTIC(MCNumFixups, "Number of MC fixups created.");
namespace {
class WebAssemblyMCCodeEmitter final : public MCCodeEmitter {
const MCInstrInfo &MCII;
- const MCContext &Ctx;
// Implementation generated by tablegen.
uint64_t getBinaryCodeForInstr(const MCInst &MI,
@@ -45,14 +45,12 @@ class WebAssemblyMCCodeEmitter final : public MCCodeEmitter {
const MCSubtargetInfo &STI) const override;
public:
- WebAssemblyMCCodeEmitter(const MCInstrInfo &mcii, MCContext &ctx)
- : MCII(mcii), Ctx(ctx) {}
+ WebAssemblyMCCodeEmitter(const MCInstrInfo &mcii) : MCII(mcii) {}
};
} // end anonymous namespace
-MCCodeEmitter *llvm::createWebAssemblyMCCodeEmitter(const MCInstrInfo &MCII,
- MCContext &Ctx) {
- return new WebAssemblyMCCodeEmitter(MCII, Ctx);
+MCCodeEmitter *llvm::createWebAssemblyMCCodeEmitter(const MCInstrInfo &MCII) {
+ return new WebAssemblyMCCodeEmitter(MCII);
}
void WebAssemblyMCCodeEmitter::encodeInstruction(
@@ -78,7 +76,8 @@ void WebAssemblyMCCodeEmitter::encodeInstruction(
support::endian::Writer<support::little>(OS).write<uint64_t>(0);
Fixups.push_back(MCFixup::create(
(1 + MCII.get(MI.getOpcode()).isVariadic() + i) * sizeof(uint64_t),
- MO.getExpr(), STI.getTargetTriple().isArch64Bit() ? FK_Data_8 : FK_Data_4,
+ MO.getExpr(),
+ STI.getTargetTriple().isArch64Bit() ? FK_Data_8 : FK_Data_4,
MI.getLoc()));
++MCNumFixups;
} else {
diff --git a/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyMCTargetDesc.cpp b/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyMCTargetDesc.cpp
index 37000f1cd571..ac11a64086f2 100644
--- a/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyMCTargetDesc.cpp
+++ b/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyMCTargetDesc.cpp
@@ -16,7 +16,6 @@
#include "InstPrinter/WebAssemblyInstPrinter.h"
#include "WebAssemblyMCAsmInfo.h"
#include "WebAssemblyTargetStreamer.h"
-#include "llvm/MC/MCCodeGenInfo.h"
#include "llvm/MC/MCInstrInfo.h"
#include "llvm/MC/MCRegisterInfo.h"
#include "llvm/MC/MCSubtargetInfo.h"
@@ -40,6 +39,15 @@ static MCAsmInfo *createMCAsmInfo(const MCRegisterInfo & /*MRI*/,
return new WebAssemblyMCAsmInfo(TT);
}
+static void adjustCodeGenOpts(const Triple & /*TT*/, Reloc::Model /*RM*/,
+ CodeModel::Model &CM) {
+ CodeModel::Model M = (CM == CodeModel::Default || CM == CodeModel::JITDefault)
+ ? CodeModel::Large
+ : CM;
+ if (M != CodeModel::Large)
+ report_fatal_error("Non-large code models are not supported yet");
+}
+
static MCInstrInfo *createMCInstrInfo() {
MCInstrInfo *X = new MCInstrInfo();
InitWebAssemblyMCInstrInfo(X);
@@ -57,14 +65,14 @@ static MCInstPrinter *createMCInstPrinter(const Triple & /*T*/,
const MCAsmInfo &MAI,
const MCInstrInfo &MII,
const MCRegisterInfo &MRI) {
- assert(SyntaxVariant == 0);
+ assert(SyntaxVariant == 0 && "WebAssembly only has one syntax variant");
return new WebAssemblyInstPrinter(MAI, MII, MRI);
}
static MCCodeEmitter *createCodeEmitter(const MCInstrInfo &MCII,
const MCRegisterInfo & /*MRI*/,
- MCContext &Ctx) {
- return createWebAssemblyMCCodeEmitter(MCII, Ctx);
+ MCContext & /*Ctx*/) {
+ return createWebAssemblyMCCodeEmitter(MCII);
}
static MCAsmBackend *createAsmBackend(const Target & /*T*/,
@@ -99,6 +107,9 @@ extern "C" void LLVMInitializeWebAssemblyTargetMC() {
// Register the MC instruction info.
TargetRegistry::RegisterMCInstrInfo(*T, createMCInstrInfo);
+ // Register the MC codegen info.
+ TargetRegistry::registerMCAdjustCodeGenOpts(*T, adjustCodeGenOpts);
+
// Register the MC register info.
TargetRegistry::RegisterMCRegInfo(*T, createMCRegisterInfo);
diff --git a/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyMCTargetDesc.h b/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyMCTargetDesc.h
index 9bac4f82822a..001bd7f1fc43 100644
--- a/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyMCTargetDesc.h
+++ b/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyMCTargetDesc.h
@@ -33,8 +33,7 @@ class raw_pwrite_stream;
extern Target TheWebAssemblyTarget32;
extern Target TheWebAssemblyTarget64;
-MCCodeEmitter *createWebAssemblyMCCodeEmitter(const MCInstrInfo &MCII,
- MCContext &Ctx);
+MCCodeEmitter *createWebAssemblyMCCodeEmitter(const MCInstrInfo &MCII);
MCAsmBackend *createWebAssemblyAsmBackend(const Triple &TT);
@@ -45,8 +44,12 @@ namespace WebAssembly {
enum OperandType {
/// Basic block label in a branch construct.
OPERAND_BASIC_BLOCK = MCOI::OPERAND_FIRST_TARGET,
- /// Floating-point immediate.
- OPERAND_FPIMM
+ /// 32-bit floating-point immediates.
+ OPERAND_FP32IMM,
+ /// 64-bit floating-point immediates.
+ OPERAND_FP64IMM,
+ /// p2align immediate for load and store address alignment.
+ OPERAND_P2ALIGN
};
/// WebAssembly-specific directive identifiers.
@@ -87,4 +90,49 @@ enum {
#define GET_SUBTARGETINFO_ENUM
#include "WebAssemblyGenSubtargetInfo.inc"
+namespace llvm {
+namespace WebAssembly {
+
+/// Return the default p2align value for a load or store with the given opcode.
+inline unsigned GetDefaultP2Align(unsigned Opcode) {
+ switch (Opcode) {
+ case WebAssembly::LOAD8_S_I32:
+ case WebAssembly::LOAD8_U_I32:
+ case WebAssembly::LOAD8_S_I64:
+ case WebAssembly::LOAD8_U_I64:
+ case WebAssembly::STORE8_I32:
+ case WebAssembly::STORE8_I64:
+ return 0;
+ case WebAssembly::LOAD16_S_I32:
+ case WebAssembly::LOAD16_U_I32:
+ case WebAssembly::LOAD16_S_I64:
+ case WebAssembly::LOAD16_U_I64:
+ case WebAssembly::STORE16_I32:
+ case WebAssembly::STORE16_I64:
+ return 1;
+ case WebAssembly::LOAD_I32:
+ case WebAssembly::LOAD_F32:
+ case WebAssembly::STORE_I32:
+ case WebAssembly::STORE_F32:
+ case WebAssembly::LOAD32_S_I64:
+ case WebAssembly::LOAD32_U_I64:
+ case WebAssembly::STORE32_I64:
+ return 2;
+ case WebAssembly::LOAD_I64:
+ case WebAssembly::LOAD_F64:
+ case WebAssembly::STORE_I64:
+ case WebAssembly::STORE_F64:
+ return 3;
+ default: llvm_unreachable("Only loads and stores have p2align values");
+ }
+}
+
+/// The operand number of the load or store address in load/store instructions.
+static const unsigned MemOpAddressOperandNo = 2;
+/// The operand number of the stored value in a store instruction.
+static const unsigned StoreValueOperandNo = 4;
+
+} // end namespace WebAssembly
+} // end namespace llvm
+
#endif
diff --git a/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyTargetStreamer.cpp b/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyTargetStreamer.cpp
index 1d2822869a15..3d61c15717b4 100644
--- a/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyTargetStreamer.cpp
+++ b/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyTargetStreamer.cpp
@@ -16,12 +16,10 @@
#include "WebAssemblyTargetStreamer.h"
#include "InstPrinter/WebAssemblyInstPrinter.h"
#include "WebAssemblyMCTargetDesc.h"
-#include "WebAssemblyTargetObjectFile.h"
#include "llvm/MC/MCContext.h"
#include "llvm/MC/MCSectionELF.h"
#include "llvm/MC/MCSubtargetInfo.h"
#include "llvm/MC/MCSymbolELF.h"
-#include "llvm/Support/CommandLine.h"
#include "llvm/Support/ELF.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/FormattedStream.h"
@@ -66,6 +64,16 @@ void WebAssemblyTargetAsmStreamer::emitLocal(ArrayRef<MVT> Types) {
void WebAssemblyTargetAsmStreamer::emitEndFunc() { OS << "\t.endfunc\n"; }
+void WebAssemblyTargetAsmStreamer::emitIndirectFunctionType(
+ StringRef name, SmallVectorImpl<MVT> &SignatureVTs, size_t NumResults) {
+ OS << "\t.functype\t" << name;
+ if (NumResults == 0) OS << ", void";
+ for (auto Ty : SignatureVTs) {
+ OS << ", " << WebAssembly::TypeToString(Ty);
+ }
+ OS << "\n";
+}
+
// FIXME: What follows is not the real binary encoding.
static void EncodeTypes(MCStreamer &Streamer, ArrayRef<MVT> Types) {
diff --git a/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyTargetStreamer.h b/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyTargetStreamer.h
index c66a51574efb..51354ef22d71 100644
--- a/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyTargetStreamer.h
+++ b/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyTargetStreamer.h
@@ -37,6 +37,12 @@ public:
virtual void emitLocal(ArrayRef<MVT> Types) = 0;
/// .endfunc
virtual void emitEndFunc() = 0;
+ /// .functype
+ virtual void emitIndirectFunctionType(StringRef name,
+ SmallVectorImpl<MVT> &SignatureVTs,
+ size_t NumResults) {
+ llvm_unreachable("emitIndirectFunctionType not implemented");
+ }
};
/// This part is for ascii assembly output
@@ -50,6 +56,9 @@ public:
void emitResult(ArrayRef<MVT> Types) override;
void emitLocal(ArrayRef<MVT> Types) override;
void emitEndFunc() override;
+ void emitIndirectFunctionType(StringRef name,
+ SmallVectorImpl<MVT> &SignatureVTs,
+ size_t NumResults) override;
};
/// This part is for ELF object output
diff --git a/lib/Target/WebAssembly/Makefile b/lib/Target/WebAssembly/Makefile
deleted file mode 100644
index c501a2b1ab15..000000000000
--- a/lib/Target/WebAssembly/Makefile
+++ /dev/null
@@ -1,26 +0,0 @@
-##===- lib/Target/WebAssembly/Makefile ---------------------*- Makefile -*-===##
-#
-# The LLVM Compiler Infrastructure
-#
-# This file is distributed under the University of Illinois Open Source
-# License. See LICENSE.TXT for details.
-#
-##===----------------------------------------------------------------------===##
-
-LEVEL = ../../..
-LIBRARYNAME = LLVMWebAssemblyCodeGen
-TARGET = WebAssembly
-
-# Make sure that tblgen is run, first thing.
-BUILT_SOURCES = \
- WebAssemblyGenAsmWriter.inc \
- WebAssemblyGenDAGISel.inc \
- WebAssemblyGenFastISel.inc \
- WebAssemblyGenInstrInfo.inc \
- WebAssemblyGenMCCodeEmitter.inc \
- WebAssemblyGenRegisterInfo.inc \
- WebAssemblyGenSubtargetInfo.inc
-
-DIRS = InstPrinter TargetInfo MCTargetDesc Disassembler
-
-include $(LEVEL)/Makefile.common
diff --git a/lib/Target/WebAssembly/README.txt b/lib/Target/WebAssembly/README.txt
index b97ea454165c..a6c2eefc0578 100644
--- a/lib/Target/WebAssembly/README.txt
+++ b/lib/Target/WebAssembly/README.txt
@@ -13,32 +13,18 @@ binary encoding of WebAssembly itself:
* https://github.com/WebAssembly/design/blob/master/BinaryEncoding.md
The backend is built, tested and archived on the following waterfall:
- https://build.chromium.org/p/client.wasm.llvm/console
+ https://wasm-stat.us
The backend's bringup is done using the GCC torture test suite first since it
doesn't require C library support. Current known failures are in
known_gcc_test_failures.txt, all other tests should pass. The waterfall will
turn red if not. Once most of these pass, further testing will use LLVM's own
test suite. The tests can be run locally using:
- github.com/WebAssembly/experimental/blob/master/buildbot/torture_test.py
-
-Interesting work that remains to be done:
-* Write a pass to restructurize irreducible control flow. This needs to be done
- before register allocation to be efficient, because it may duplicate basic
- blocks and WebAssembly performs register allocation at a whole-function
- level. Note that LLVM's GPU code has such a pass, but it linearizes control
- flow (e.g. both sides of branches execute and are masked) which is undesirable
- for WebAssembly.
+ https://github.com/WebAssembly/waterfall/blob/master/src/compile_torture_tests.py
//===---------------------------------------------------------------------===//
-set_local instructions have a return value. We should (a) model this,
-and (b) write optimizations which take advantage of it. Keep in mind that
-many set_local instructions are implicit!
-
-//===---------------------------------------------------------------------===//
-
-Br, br_if, and tableswitch instructions can support having a value on the
+Br, br_if, and br_table instructions can support having a value on the
expression stack across the jump (sometimes). We should (a) model this, and
(b) extend the stackifier to utilize it.
@@ -58,10 +44,6 @@ us too?
//===---------------------------------------------------------------------===//
-When is it profitable to set isAsCheapAsAMove on instructions in WebAssembly?
-
-//===---------------------------------------------------------------------===//
-
Register stackification uses the EXPR_STACK physical register to impose
ordering dependencies on instructions with stack operands. This is pessimistic;
we should consider alternate ways to model stack dependencies.
@@ -82,7 +64,74 @@ stores.
//===---------------------------------------------------------------------===//
-Memset/memcpy/memmove should be marked with the "returned" attribute somehow,
-even when they are translated through intrinsics.
+Consider implementing optimizeSelect, optimizeCompareInstr, optimizeCondBranch,
+optimizeLoadInstr, and/or getMachineCombinerPatterns.
+
+//===---------------------------------------------------------------------===//
+
+Find a clean way to fix the problem which leads to the Shrink Wrapping pass
+being run after the WebAssembly PEI pass.
+
+//===---------------------------------------------------------------------===//
+
+When setting multiple local variables to the same constant, we currently get
+code like this:
+
+ i32.const $4=, 0
+ i32.const $3=, 0
+
+It could be done with a smaller encoding like this:
+
+ i32.const $push5=, 0
+ tee_local $push6=, $4=, $pop5
+ copy_local $3=, $pop6
+
+//===---------------------------------------------------------------------===//
+
+WebAssembly registers are implicitly initialized to zero. Explicit zeroing is
+therefore often redundant and could be optimized away.
+
+//===---------------------------------------------------------------------===//
+
+Small indices may use smaller encodings than large indices.
+WebAssemblyRegColoring and/or WebAssemblyRegRenumbering should sort registers
+according to their usage frequency to maximize the usage of smaller encodings.
+
+//===---------------------------------------------------------------------===//
+
+When the last statement in a function body computes the return value, it can
+just let that value be the exit value of the outermost block, rather than
+needing an explicit return operation.
+
+//===---------------------------------------------------------------------===//
+
+Many cases of irreducible control flow could be transformed more optimally
+than via the transform in WebAssemblyFixIrreducibleControlFlow.cpp.
+
+It may also be worthwhile to do transforms before register coloring,
+particularly when duplicating code, to allow register coloring to be aware of
+the duplication.
+
+//===---------------------------------------------------------------------===//
+
+WebAssemblyRegStackify could use AliasAnalysis to reorder loads and stores more
+aggressively.
+
+//===---------------------------------------------------------------------===//
+
+WebAssemblyRegStackify is currently a greedy algorithm. This means that, for
+example, a binary operator will stackify with its user before its operands.
+However, if moving the binary operator to its user moves it to a place where
+its operands can't be moved to, it would be better to leave it in place, or
+perhaps move it up, so that it can stackify its operands. A binary operator
+has two operands and one result, so in such cases there could be a net win by
+preferring the operands.
+
+//===---------------------------------------------------------------------===//
+
+Instruction ordering has a significant influence on register stackification and
+coloring. Consider experimenting with the MachineScheduler (enable via
+enableMachineScheduler) and determine if it can be configured to schedule
+instructions advantageously for this purpose.
//===---------------------------------------------------------------------===//
diff --git a/lib/Target/WebAssembly/Relooper.cpp b/lib/Target/WebAssembly/Relooper.cpp
deleted file mode 100644
index 9b718ef094aa..000000000000
--- a/lib/Target/WebAssembly/Relooper.cpp
+++ /dev/null
@@ -1,984 +0,0 @@
-//===-- Relooper.cpp - Top-level interface for WebAssembly ----*- C++ -*-===//
-//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
-//
-//===---------------------------------------------------------------------===//
-///
-/// \file
-/// \brief This implements the Relooper algorithm. This implementation includes
-/// optimizations added since the original academic paper [1] was published.
-///
-/// [1] Alon Zakai. 2011. Emscripten: an LLVM-to-JavaScript compiler. In
-/// Proceedings of the ACM international conference companion on Object
-/// oriented programming systems languages and applications companion
-/// (SPLASH '11). ACM, New York, NY, USA, 301-312. DOI=10.1145/2048147.2048224
-/// http://doi.acm.org/10.1145/2048147.2048224
-///
-//===-------------------------------------------------------------------===//
-
-#include "Relooper.h"
-#include "WebAssembly.h"
-
-#include "llvm/ADT/STLExtras.h"
-#include "llvm/IR/CFG.h"
-#include "llvm/IR/Function.h"
-#include "llvm/Pass.h"
-#include "llvm/Support/CommandLine.h"
-#include "llvm/Support/Debug.h"
-#include "llvm/Support/raw_ostream.h"
-
-#include <cstring>
-#include <cstdlib>
-#include <functional>
-#include <list>
-#include <stack>
-#include <string>
-
-#define DEBUG_TYPE "relooper"
-
-using namespace llvm;
-using namespace Relooper;
-
-static cl::opt<int> RelooperSplittingFactor(
- "relooper-splitting-factor",
- cl::desc(
- "How much to discount code size when deciding whether to split a node"),
- cl::init(5));
-
-static cl::opt<unsigned> RelooperMultipleSwitchThreshold(
- "relooper-multiple-switch-threshold",
- cl::desc(
- "How many entries to allow in a multiple before we use a switch"),
- cl::init(10));
-
-static cl::opt<unsigned> RelooperNestingLimit(
- "relooper-nesting-limit",
- cl::desc(
- "How much nesting is acceptable"),
- cl::init(20));
-
-
-namespace {
-///
-/// Implements the relooper algorithm for a function's blocks.
-///
-/// Implementation details: The Relooper instance has
-/// ownership of the blocks and shapes, and frees them when done.
-///
-struct RelooperAlgorithm {
- std::deque<Block *> Blocks;
- std::deque<Shape *> Shapes;
- Shape *Root;
- bool MinSize;
- int BlockIdCounter;
- int ShapeIdCounter;
-
- RelooperAlgorithm();
- ~RelooperAlgorithm();
-
- void AddBlock(Block *New, int Id = -1);
-
- // Calculates the shapes
- void Calculate(Block *Entry);
-
- // Sets us to try to minimize size
- void SetMinSize(bool MinSize_) { MinSize = MinSize_; }
-};
-
-struct RelooperAnalysis final : public FunctionPass {
- static char ID;
- RelooperAnalysis() : FunctionPass(ID) {}
- const char *getPassName() const override { return "relooper"; }
- void getAnalysisUsage(AnalysisUsage &AU) const override {
- AU.setPreservesAll();
- }
- bool runOnFunction(Function &F) override;
-};
-}
-
-// RelooperAnalysis
-
-char RelooperAnalysis::ID = 0;
-FunctionPass *llvm::createWebAssemblyRelooper() {
- return new RelooperAnalysis();
-}
-
-bool RelooperAnalysis::runOnFunction(Function &F) {
- DEBUG(dbgs() << "Relooping function '" << F.getName() << "'\n");
- RelooperAlgorithm R;
- // FIXME: remove duplication between relooper's and LLVM's BBs.
- std::map<const BasicBlock *, Block *> BB2B;
- std::map<const Block *, const BasicBlock *> B2BB;
- for (const BasicBlock &BB : F) {
- // FIXME: getName is wrong here, Code is meant to represent amount of code.
- // FIXME: use BranchVarInit for switch.
- Block *B = new Block(BB.getName().str().data(), /*BranchVarInit=*/nullptr);
- R.AddBlock(B);
- assert(BB2B.find(&BB) == BB2B.end() && "Inserting the same block twice");
- assert(B2BB.find(B) == B2BB.end() && "Inserting the same block twice");
- BB2B[&BB] = B;
- B2BB[B] = &BB;
- }
- for (Block *B : R.Blocks) {
- const BasicBlock *BB = B2BB[B];
- for (const BasicBlock *Successor : successors(BB))
- // FIXME: add branch's Condition and Code below.
- B->AddBranchTo(BB2B[Successor], /*Condition=*/nullptr, /*Code=*/nullptr);
- }
- R.Calculate(BB2B[&F.getEntryBlock()]);
- return false; // Analysis passes don't modify anything.
-}
-
-// Helpers
-
-typedef MapVector<Block *, BlockSet> BlockBlockSetMap;
-typedef std::list<Block *> BlockList;
-
-template <class T, class U>
-static bool contains(const T &container, const U &contained) {
- return container.count(contained);
-}
-
-
-// Branch
-
-Branch::Branch(const char *ConditionInit, const char *CodeInit)
- : Ancestor(nullptr), Labeled(true) {
- // FIXME: move from char* to LLVM data structures
- Condition = ConditionInit ? strdup(ConditionInit) : nullptr;
- Code = CodeInit ? strdup(CodeInit) : nullptr;
-}
-
-Branch::~Branch() {
- // FIXME: move from char* to LLVM data structures
- free(static_cast<void *>(const_cast<char *>(Condition)));
- free(static_cast<void *>(const_cast<char *>(Code)));
-}
-
-// Block
-
-Block::Block(const char *CodeInit, const char *BranchVarInit)
- : Parent(nullptr), Id(-1), IsCheckedMultipleEntry(false) {
- // FIXME: move from char* to LLVM data structures
- Code = strdup(CodeInit);
- BranchVar = BranchVarInit ? strdup(BranchVarInit) : nullptr;
-}
-
-Block::~Block() {
- // FIXME: move from char* to LLVM data structures
- free(static_cast<void *>(const_cast<char *>(Code)));
- free(static_cast<void *>(const_cast<char *>(BranchVar)));
-}
-
-void Block::AddBranchTo(Block *Target, const char *Condition,
- const char *Code) {
- assert(!contains(BranchesOut, Target) &&
- "cannot add more than one branch to the same target");
- BranchesOut[Target] = make_unique<Branch>(Condition, Code);
-}
-
-// Relooper
-
-RelooperAlgorithm::RelooperAlgorithm()
- : Root(nullptr), MinSize(false), BlockIdCounter(1),
- ShapeIdCounter(0) { // block ID 0 is reserved for clearings
-}
-
-RelooperAlgorithm::~RelooperAlgorithm() {
- for (auto Curr : Blocks)
- delete Curr;
- for (auto Curr : Shapes)
- delete Curr;
-}
-
-void RelooperAlgorithm::AddBlock(Block *New, int Id) {
- New->Id = Id == -1 ? BlockIdCounter++ : Id;
- Blocks.push_back(New);
-}
-
-struct RelooperRecursor {
- RelooperAlgorithm *Parent;
- RelooperRecursor(RelooperAlgorithm *ParentInit) : Parent(ParentInit) {}
-};
-
-void RelooperAlgorithm::Calculate(Block *Entry) {
- // Scan and optimize the input
- struct PreOptimizer : public RelooperRecursor {
- PreOptimizer(RelooperAlgorithm *Parent) : RelooperRecursor(Parent) {}
- BlockSet Live;
-
- void FindLive(Block *Root) {
- BlockList ToInvestigate;
- ToInvestigate.push_back(Root);
- while (!ToInvestigate.empty()) {
- Block *Curr = ToInvestigate.front();
- ToInvestigate.pop_front();
- if (contains(Live, Curr))
- continue;
- Live.insert(Curr);
- for (const auto &iter : Curr->BranchesOut)
- ToInvestigate.push_back(iter.first);
- }
- }
-
- // If a block has multiple entries but no exits, and it is small enough, it
- // is useful to split it. A common example is a C++ function where
- // everything ends up at a final exit block and does some RAII cleanup.
- // Without splitting, we will be forced to introduce labelled loops to
- // allow reaching the final block
- void SplitDeadEnds() {
- unsigned TotalCodeSize = 0;
- for (const auto &Curr : Live) {
- TotalCodeSize += strlen(Curr->Code);
- }
- BlockSet Splits;
- BlockSet Removed;
- for (const auto &Original : Live) {
- if (Original->BranchesIn.size() <= 1 ||
- !Original->BranchesOut.empty())
- continue; // only dead ends, for now
- if (contains(Original->BranchesOut, Original))
- continue; // cannot split a looping node
- if (strlen(Original->Code) * (Original->BranchesIn.size() - 1) >
- TotalCodeSize / RelooperSplittingFactor)
- continue; // if splitting increases raw code size by a significant
- // amount, abort
- // Split the node (for simplicity, we replace all the blocks, even
- // though we could have reused the original)
- DEBUG(dbgs() << " Splitting '" << Original->Code << "'\n");
- for (const auto &Prior : Original->BranchesIn) {
- Block *Split = new Block(Original->Code, Original->BranchVar);
- Parent->AddBlock(Split, Original->Id);
- Split->BranchesIn.insert(Prior);
- std::unique_ptr<Branch> Details;
- Details.swap(Prior->BranchesOut[Original]);
- Prior->BranchesOut[Split] = make_unique<Branch>(Details->Condition,
- Details->Code);
- for (const auto &iter : Original->BranchesOut) {
- Block *Post = iter.first;
- Branch *Details = iter.second.get();
- Split->BranchesOut[Post] = make_unique<Branch>(Details->Condition,
- Details->Code);
- Post->BranchesIn.insert(Split);
- }
- Splits.insert(Split);
- Removed.insert(Original);
- }
- for (const auto &iter : Original->BranchesOut) {
- Block *Post = iter.first;
- Post->BranchesIn.remove(Original);
- }
- }
- for (const auto &iter : Splits)
- Live.insert(iter);
- for (const auto &iter : Removed)
- Live.remove(iter);
- }
- };
- PreOptimizer Pre(this);
- Pre.FindLive(Entry);
-
- // Add incoming branches from live blocks, ignoring dead code
- for (unsigned i = 0; i < Blocks.size(); i++) {
- Block *Curr = Blocks[i];
- if (!contains(Pre.Live, Curr))
- continue;
- for (const auto &iter : Curr->BranchesOut)
- iter.first->BranchesIn.insert(Curr);
- }
-
- if (!MinSize)
- Pre.SplitDeadEnds();
-
- // Recursively process the graph
-
- struct Analyzer : public RelooperRecursor {
- Analyzer(RelooperAlgorithm *Parent) : RelooperRecursor(Parent) {}
-
- // Add a shape to the list of shapes in this Relooper calculation
- void Notice(Shape *New) {
- New->Id = Parent->ShapeIdCounter++;
- Parent->Shapes.push_back(New);
- }
-
- // Create a list of entries from a block. If LimitTo is provided, only
- // results in that set will appear
- void GetBlocksOut(Block *Source, BlockSet &Entries,
- BlockSet *LimitTo = nullptr) {
- for (const auto &iter : Source->BranchesOut)
- if (!LimitTo || contains(*LimitTo, iter.first))
- Entries.insert(iter.first);
- }
-
- // Converts/processes all branchings to a specific target
- void Solipsize(Block *Target, Branch::FlowType Type, Shape *Ancestor,
- BlockSet &From) {
- DEBUG(dbgs() << " Solipsize '" << Target->Code << "' type " << Type
- << "\n");
- for (auto iter = Target->BranchesIn.begin();
- iter != Target->BranchesIn.end();) {
- Block *Prior = *iter;
- if (!contains(From, Prior)) {
- iter++;
- continue;
- }
- std::unique_ptr<Branch> PriorOut;
- PriorOut.swap(Prior->BranchesOut[Target]);
- PriorOut->Ancestor = Ancestor;
- PriorOut->Type = Type;
- if (MultipleShape *Multiple = dyn_cast<MultipleShape>(Ancestor))
- Multiple->Breaks++; // We are breaking out of this Multiple, so need a
- // loop
- iter++; // carefully increment iter before erasing
- Target->BranchesIn.remove(Prior);
- Target->ProcessedBranchesIn.insert(Prior);
- Prior->ProcessedBranchesOut[Target].swap(PriorOut);
- }
- }
-
- Shape *MakeSimple(BlockSet &Blocks, Block *Inner, BlockSet &NextEntries) {
- DEBUG(dbgs() << " MakeSimple inner block '" << Inner->Code << "'\n");
- SimpleShape *Simple = new SimpleShape;
- Notice(Simple);
- Simple->Inner = Inner;
- Inner->Parent = Simple;
- if (Blocks.size() > 1) {
- Blocks.remove(Inner);
- GetBlocksOut(Inner, NextEntries, &Blocks);
- BlockSet JustInner;
- JustInner.insert(Inner);
- for (const auto &iter : NextEntries)
- Solipsize(iter, Branch::Direct, Simple, JustInner);
- }
- return Simple;
- }
-
- Shape *MakeLoop(BlockSet &Blocks, BlockSet &Entries,
- BlockSet &NextEntries) {
- // Find the inner blocks in this loop. Proceed backwards from the entries
- // until
- // you reach a seen block, collecting as you go.
- BlockSet InnerBlocks;
- BlockSet Queue = Entries;
- while (!Queue.empty()) {
- Block *Curr = *(Queue.begin());
- Queue.remove(*Queue.begin());
- if (!contains(InnerBlocks, Curr)) {
- // This element is new, mark it as inner and remove from outer
- InnerBlocks.insert(Curr);
- Blocks.remove(Curr);
- // Add the elements prior to it
- for (const auto &iter : Curr->BranchesIn)
- Queue.insert(iter);
- }
- }
- assert(!InnerBlocks.empty());
-
- for (const auto &Curr : InnerBlocks) {
- for (const auto &iter : Curr->BranchesOut) {
- Block *Possible = iter.first;
- if (!contains(InnerBlocks, Possible))
- NextEntries.insert(Possible);
- }
- }
-
- LoopShape *Loop = new LoopShape();
- Notice(Loop);
-
- // Solipsize the loop, replacing with break/continue and marking branches
- // as Processed (will not affect later calculations)
- // A. Branches to the loop entries become a continue to this shape
- for (const auto &iter : Entries)
- Solipsize(iter, Branch::Continue, Loop, InnerBlocks);
- // B. Branches to outside the loop (a next entry) become breaks on this
- // shape
- for (const auto &iter : NextEntries)
- Solipsize(iter, Branch::Break, Loop, InnerBlocks);
- // Finish up
- Shape *Inner = Process(InnerBlocks, Entries, nullptr);
- Loop->Inner = Inner;
- return Loop;
- }
-
- // For each entry, find the independent group reachable by it. The
- // independent group is the entry itself, plus all the blocks it can
- // reach that cannot be directly reached by another entry. Note that we
- // ignore directly reaching the entry itself by another entry.
- // @param Ignore - previous blocks that are irrelevant
- void FindIndependentGroups(BlockSet &Entries,
- BlockBlockSetMap &IndependentGroups,
- BlockSet *Ignore = nullptr) {
- typedef std::map<Block *, Block *> BlockBlockMap;
-
- struct HelperClass {
- BlockBlockSetMap &IndependentGroups;
- BlockBlockMap Ownership; // For each block, which entry it belongs to.
- // We have reached it from there.
-
- HelperClass(BlockBlockSetMap &IndependentGroupsInit)
- : IndependentGroups(IndependentGroupsInit) {}
- void InvalidateWithChildren(Block *New) {
- // Being in the list means you need to be invalidated
- BlockList ToInvalidate;
- ToInvalidate.push_back(New);
- while (!ToInvalidate.empty()) {
- Block *Invalidatee = ToInvalidate.front();
- ToInvalidate.pop_front();
- Block *Owner = Ownership[Invalidatee];
- // Owner may have been invalidated, do not add to
- // IndependentGroups!
- if (contains(IndependentGroups, Owner))
- IndependentGroups[Owner].remove(Invalidatee);
- if (Ownership[Invalidatee]) { // may have been seen before and
- // invalidated already
- Ownership[Invalidatee] = nullptr;
- for (const auto &iter : Invalidatee->BranchesOut) {
- Block *Target = iter.first;
- BlockBlockMap::iterator Known = Ownership.find(Target);
- if (Known != Ownership.end()) {
- Block *TargetOwner = Known->second;
- if (TargetOwner)
- ToInvalidate.push_back(Target);
- }
- }
- }
- }
- }
- };
- HelperClass Helper(IndependentGroups);
-
- // We flow out from each of the entries, simultaneously.
- // When we reach a new block, we add it as belonging to the one we got to
- // it from.
- // If we reach a new block that is already marked as belonging to someone,
- // it is reachable by two entries and is not valid for any of them.
- // Remove it and all it can reach that have been visited.
-
- // Being in the queue means we just added this item, and
- // we need to add its children
- BlockList Queue;
- for (const auto &Entry : Entries) {
- Helper.Ownership[Entry] = Entry;
- IndependentGroups[Entry].insert(Entry);
- Queue.push_back(Entry);
- }
- while (!Queue.empty()) {
- Block *Curr = Queue.front();
- Queue.pop_front();
- Block *Owner = Helper.Ownership[Curr]; // Curr must be in the ownership
- // map if we are in the queue
- if (!Owner)
- continue; // we have been invalidated meanwhile after being reached
- // from two entries
- // Add all children
- for (const auto &iter : Curr->BranchesOut) {
- Block *New = iter.first;
- BlockBlockMap::iterator Known = Helper.Ownership.find(New);
- if (Known == Helper.Ownership.end()) {
- // New node. Add it, and put it in the queue
- Helper.Ownership[New] = Owner;
- IndependentGroups[Owner].insert(New);
- Queue.push_back(New);
- continue;
- }
- Block *NewOwner = Known->second;
- if (!NewOwner)
- continue; // We reached an invalidated node
- if (NewOwner != Owner)
- // Invalidate this and all reachable that we have seen - we reached
- // this from two locations
- Helper.InvalidateWithChildren(New);
- // otherwise, we have the same owner, so do nothing
- }
- }
-
- // Having processed all the interesting blocks, we remain with just one
- // potential issue:
- // If a->b, and a was invalidated, but then b was later reached by
- // someone else, we must invalidate b. To check for this, we go over all
- // elements in the independent groups, if an element has a parent which
- // does *not* have the same owner, we/ must remove it and all its
- // children.
-
- for (const auto &iter : Entries) {
- BlockSet &CurrGroup = IndependentGroups[iter];
- BlockList ToInvalidate;
- for (const auto &iter : CurrGroup) {
- Block *Child = iter;
- for (const auto &iter : Child->BranchesIn) {
- Block *Parent = iter;
- if (Ignore && contains(*Ignore, Parent))
- continue;
- if (Helper.Ownership[Parent] != Helper.Ownership[Child])
- ToInvalidate.push_back(Child);
- }
- }
- while (!ToInvalidate.empty()) {
- Block *Invalidatee = ToInvalidate.front();
- ToInvalidate.pop_front();
- Helper.InvalidateWithChildren(Invalidatee);
- }
- }
-
- // Remove empty groups
- for (const auto &iter : Entries)
- if (IndependentGroups[iter].empty())
- IndependentGroups.erase(iter);
- }
-
- Shape *MakeMultiple(BlockSet &Blocks, BlockSet &Entries,
- BlockBlockSetMap &IndependentGroups, Shape *Prev,
- BlockSet &NextEntries) {
- bool Fused = isa<SimpleShape>(Prev);
- MultipleShape *Multiple = new MultipleShape();
- Notice(Multiple);
- BlockSet CurrEntries;
- for (auto &iter : IndependentGroups) {
- Block *CurrEntry = iter.first;
- BlockSet &CurrBlocks = iter.second;
- // Create inner block
- CurrEntries.clear();
- CurrEntries.insert(CurrEntry);
- for (const auto &CurrInner : CurrBlocks) {
- // Remove the block from the remaining blocks
- Blocks.remove(CurrInner);
- // Find new next entries and fix branches to them
- for (auto iter = CurrInner->BranchesOut.begin();
- iter != CurrInner->BranchesOut.end();) {
- Block *CurrTarget = iter->first;
- auto Next = iter;
- Next++;
- if (!contains(CurrBlocks, CurrTarget)) {
- NextEntries.insert(CurrTarget);
- Solipsize(CurrTarget, Branch::Break, Multiple, CurrBlocks);
- }
- iter = Next; // increment carefully because Solipsize can remove us
- }
- }
- Multiple->InnerMap[CurrEntry->Id] =
- Process(CurrBlocks, CurrEntries, nullptr);
- // If we are not fused, then our entries will actually be checked
- if (!Fused)
- CurrEntry->IsCheckedMultipleEntry = true;
- }
- // Add entries not handled as next entries, they are deferred
- for (const auto &Entry : Entries)
- if (!contains(IndependentGroups, Entry))
- NextEntries.insert(Entry);
- // The multiple has been created, we can decide how to implement it
- if (Multiple->InnerMap.size() >= RelooperMultipleSwitchThreshold) {
- Multiple->UseSwitch = true;
- Multiple->Breaks++; // switch captures breaks
- }
- return Multiple;
- }
-
- // Main function.
- // Process a set of blocks with specified entries, returns a shape
- // The Make* functions receive a NextEntries. If they fill it with data,
- // those are the entries for the ->Next block on them, and the blocks
- // are what remains in Blocks (which Make* modify). In this way
- // we avoid recursing on Next (imagine a long chain of Simples, if we
- // recursed we could blow the stack).
- Shape *Process(BlockSet &Blocks, BlockSet &InitialEntries, Shape *Prev) {
- BlockSet *Entries = &InitialEntries;
- BlockSet TempEntries[2];
- int CurrTempIndex = 0;
- BlockSet *NextEntries;
- Shape *Ret = nullptr;
-
- auto Make = [&](Shape *Temp) {
- if (Prev)
- Prev->Next = Temp;
- if (!Ret)
- Ret = Temp;
- Prev = Temp;
- Entries = NextEntries;
- };
-
- while (1) {
- CurrTempIndex = 1 - CurrTempIndex;
- NextEntries = &TempEntries[CurrTempIndex];
- NextEntries->clear();
-
- if (Entries->empty())
- return Ret;
- if (Entries->size() == 1) {
- Block *Curr = *(Entries->begin());
- if (Curr->BranchesIn.empty()) {
- // One entry, no looping ==> Simple
- Make(MakeSimple(Blocks, Curr, *NextEntries));
- if (NextEntries->empty())
- return Ret;
- continue;
- }
- // One entry, looping ==> Loop
- Make(MakeLoop(Blocks, *Entries, *NextEntries));
- if (NextEntries->empty())
- return Ret;
- continue;
- }
-
- // More than one entry, try to eliminate through a Multiple groups of
- // independent blocks from an entry/ies. It is important to remove
- // through multiples as opposed to looping since the former is more
- // performant.
- BlockBlockSetMap IndependentGroups;
- FindIndependentGroups(*Entries, IndependentGroups);
-
- if (!IndependentGroups.empty()) {
- // We can handle a group in a multiple if its entry cannot be reached
- // by another group.
- // Note that it might be reachable by itself - a loop. But that is
- // fine, we will create a loop inside the multiple block (which
- // is the performant order to do it).
- for (auto iter = IndependentGroups.begin();
- iter != IndependentGroups.end();) {
- Block *Entry = iter->first;
- BlockSet &Group = iter->second;
- auto curr = iter++; // iterate carefully, we may delete
- for (BlockSet::iterator iterBranch = Entry->BranchesIn.begin();
- iterBranch != Entry->BranchesIn.end(); iterBranch++) {
- Block *Origin = *iterBranch;
- if (!contains(Group, Origin)) {
- // Reached from outside the group, so we cannot handle this
- IndependentGroups.erase(curr);
- break;
- }
- }
- }
-
- // As an optimization, if we have 2 independent groups, and one is a
- // small dead end, we can handle only that dead end.
- // The other then becomes a Next - without nesting in the code and
- // recursion in the analysis.
- // TODO: if the larger is the only dead end, handle that too
- // TODO: handle >2 groups
- // TODO: handle not just dead ends, but also that do not branch to the
- // NextEntries. However, must be careful there since we create a
- // Next, and that Next can prevent eliminating a break (since we no
- // longer naturally reach the same place), which may necessitate a
- // one-time loop, which makes the unnesting pointless.
- if (IndependentGroups.size() == 2) {
- // Find the smaller one
- auto iter = IndependentGroups.begin();
- Block *SmallEntry = iter->first;
- auto SmallSize = iter->second.size();
- iter++;
- Block *LargeEntry = iter->first;
- auto LargeSize = iter->second.size();
- if (SmallSize != LargeSize) { // ignore the case where they are
- // identical - keep things symmetrical
- // there
- if (SmallSize > LargeSize) {
- Block *Temp = SmallEntry;
- SmallEntry = LargeEntry;
- LargeEntry = Temp; // Note: we did not flip the Sizes too, they
- // are now invalid. TODO: use the smaller
- // size as a limit?
- }
- // Check if dead end
- bool DeadEnd = true;
- BlockSet &SmallGroup = IndependentGroups[SmallEntry];
- for (const auto &Curr : SmallGroup) {
- for (const auto &iter : Curr->BranchesOut) {
- Block *Target = iter.first;
- if (!contains(SmallGroup, Target)) {
- DeadEnd = false;
- break;
- }
- }
- if (!DeadEnd)
- break;
- }
- if (DeadEnd)
- IndependentGroups.erase(LargeEntry);
- }
- }
-
- if (!IndependentGroups.empty())
- // Some groups removable ==> Multiple
- Make(MakeMultiple(Blocks, *Entries, IndependentGroups, Prev,
- *NextEntries));
- if (NextEntries->empty())
- return Ret;
- continue;
- }
- // No independent groups, must be loopable ==> Loop
- Make(MakeLoop(Blocks, *Entries, *NextEntries));
- if (NextEntries->empty())
- return Ret;
- continue;
- }
- }
- };
-
- // Main
-
- BlockSet AllBlocks;
- for (const auto &Curr : Pre.Live) {
- AllBlocks.insert(Curr);
- }
-
- BlockSet Entries;
- Entries.insert(Entry);
- Root = Analyzer(this).Process(AllBlocks, Entries, nullptr);
- assert(Root);
-
- ///
- /// Relooper post-optimizer
- ///
- struct PostOptimizer {
- RelooperAlgorithm *Parent;
- std::stack<Shape *> LoopStack;
-
- PostOptimizer(RelooperAlgorithm *ParentInit) : Parent(ParentInit) {}
-
- void ShapeSwitch(Shape* var,
- std::function<void (SimpleShape*)> simple,
- std::function<void (MultipleShape*)> multiple,
- std::function<void (LoopShape*)> loop) {
- switch (var->getKind()) {
- case Shape::SK_Simple: {
- simple(cast<SimpleShape>(var));
- break;
- }
- case Shape::SK_Multiple: {
- multiple(cast<MultipleShape>(var));
- break;
- }
- case Shape::SK_Loop: {
- loop(cast<LoopShape>(var));
- break;
- }
- }
- }
-
- // Find the blocks that natural control flow can get us directly to, or
- // through a multiple that we ignore
- void FollowNaturalFlow(Shape *S, BlockSet &Out) {
- ShapeSwitch(S, [&](SimpleShape* Simple) {
- Out.insert(Simple->Inner);
- }, [&](MultipleShape* Multiple) {
- for (const auto &iter : Multiple->InnerMap) {
- FollowNaturalFlow(iter.second, Out);
- }
- FollowNaturalFlow(Multiple->Next, Out);
- }, [&](LoopShape* Loop) {
- FollowNaturalFlow(Loop->Inner, Out);
- });
- }
-
- void FindNaturals(Shape *Root, Shape *Otherwise = nullptr) {
- if (Root->Next) {
- Root->Natural = Root->Next;
- FindNaturals(Root->Next, Otherwise);
- } else {
- Root->Natural = Otherwise;
- }
-
- ShapeSwitch(Root, [](SimpleShape* Simple) {
- }, [&](MultipleShape* Multiple) {
- for (const auto &iter : Multiple->InnerMap) {
- FindNaturals(iter.second, Root->Natural);
- }
- }, [&](LoopShape* Loop){
- FindNaturals(Loop->Inner, Loop->Inner);
- });
- }
-
- // Remove unneeded breaks and continues.
- // A flow operation is trivially unneeded if the shape we naturally get to
- // by normal code execution is the same as the flow forces us to.
- void RemoveUnneededFlows(Shape *Root, Shape *Natural = nullptr,
- LoopShape *LastLoop = nullptr,
- unsigned Depth = 0) {
- BlockSet NaturalBlocks;
- FollowNaturalFlow(Natural, NaturalBlocks);
- Shape *Next = Root;
- while (Next) {
- Root = Next;
- Next = nullptr;
- ShapeSwitch(
- Root,
- [&](SimpleShape* Simple) {
- if (Simple->Inner->BranchVar)
- LastLoop =
- nullptr; // a switch clears out the loop (TODO: only for
- // breaks, not continue)
-
- if (Simple->Next) {
- if (!Simple->Inner->BranchVar &&
- Simple->Inner->ProcessedBranchesOut.size() == 2 &&
- Depth < RelooperNestingLimit) {
- // If there is a next block, we already know at Simple
- // creation time to make direct branches, and we can do
- // nothing more in general. But, we try to optimize the
- // case of a break and a direct: This would normally be
- // if (break?) { break; } ..
- // but if we make sure to nest the else, we can save the
- // break,
- // if (!break?) { .. }
- // This is also better because the more canonical nested
- // form is easier to further optimize later. The
- // downside is more nesting, which adds to size in builds with
- // whitespace.
- // Note that we avoid switches, as it complicates control flow
- // and is not relevant for the common case we optimize here.
- bool Found = false;
- bool Abort = false;
- for (const auto &iter : Simple->Inner->ProcessedBranchesOut) {
- Block *Target = iter.first;
- Branch *Details = iter.second.get();
- if (Details->Type == Branch::Break) {
- Found = true;
- if (!contains(NaturalBlocks, Target))
- Abort = true;
- } else if (Details->Type != Branch::Direct)
- Abort = true;
- }
- if (Found && !Abort) {
- for (const auto &iter : Simple->Inner->ProcessedBranchesOut) {
- Branch *Details = iter.second.get();
- if (Details->Type == Branch::Break) {
- Details->Type = Branch::Direct;
- if (MultipleShape *Multiple =
- dyn_cast<MultipleShape>(Details->Ancestor))
- Multiple->Breaks--;
- } else {
- assert(Details->Type == Branch::Direct);
- Details->Type = Branch::Nested;
- }
- }
- }
- Depth++; // this optimization increases depth, for us and all
- // our next chain (i.e., until this call returns)
- }
- Next = Simple->Next;
- } else {
- // If there is no next then Natural is where we will
- // go to by doing nothing, so we can potentially optimize some
- // branches to direct.
- for (const auto &iter : Simple->Inner->ProcessedBranchesOut) {
- Block *Target = iter.first;
- Branch *Details = iter.second.get();
- if (Details->Type != Branch::Direct &&
- contains(NaturalBlocks,
- Target)) { // note: cannot handle split blocks
- Details->Type = Branch::Direct;
- if (MultipleShape *Multiple =
- dyn_cast<MultipleShape>(Details->Ancestor))
- Multiple->Breaks--;
- } else if (Details->Type == Branch::Break && LastLoop &&
- LastLoop->Natural == Details->Ancestor->Natural) {
- // it is important to simplify breaks, as simpler breaks
- // enable other optimizations
- Details->Labeled = false;
- if (MultipleShape *Multiple =
- dyn_cast<MultipleShape>(Details->Ancestor))
- Multiple->Breaks--;
- }
- }
- }
- }, [&](MultipleShape* Multiple)
- {
- for (const auto &iter : Multiple->InnerMap) {
- RemoveUnneededFlows(iter.second, Multiple->Next,
- Multiple->Breaks ? nullptr : LastLoop,
- Depth + 1);
- }
- Next = Multiple->Next;
- }, [&](LoopShape* Loop)
- {
- RemoveUnneededFlows(Loop->Inner, Loop->Inner, Loop, Depth + 1);
- Next = Loop->Next;
- });
- }
- }
-
- // After we know which loops exist, we can calculate which need to be
- // labeled
- void FindLabeledLoops(Shape *Root) {
- Shape *Next = Root;
- while (Next) {
- Root = Next;
- Next = nullptr;
-
- ShapeSwitch(
- Root,
- [&](SimpleShape *Simple) {
- MultipleShape *Fused = dyn_cast<MultipleShape>(Root->Next);
- // If we are fusing a Multiple with a loop into this Simple, then
- // visit it now
- if (Fused && Fused->Breaks)
- LoopStack.push(Fused);
- if (Simple->Inner->BranchVar)
- LoopStack.push(nullptr); // a switch means breaks are now useless,
- // push a dummy
- if (Fused) {
- if (Fused->UseSwitch)
- LoopStack.push(nullptr); // a switch means breaks are now
- // useless, push a dummy
- for (const auto &iter : Fused->InnerMap) {
- FindLabeledLoops(iter.second);
- }
- }
- for (const auto &iter : Simple->Inner->ProcessedBranchesOut) {
- Branch *Details = iter.second.get();
- if (Details->Type == Branch::Break ||
- Details->Type == Branch::Continue) {
- assert(!LoopStack.empty());
- if (Details->Ancestor != LoopStack.top() && Details->Labeled) {
- if (MultipleShape *Multiple =
- dyn_cast<MultipleShape>(Details->Ancestor)) {
- Multiple->Labeled = true;
- } else {
- LoopShape *Loop = cast<LoopShape>(Details->Ancestor);
- Loop->Labeled = true;
- }
- } else {
- Details->Labeled = false;
- }
- }
- if (Fused && Fused->UseSwitch)
- LoopStack.pop();
- if (Simple->Inner->BranchVar)
- LoopStack.pop();
- if (Fused && Fused->Breaks)
- LoopStack.pop();
- if (Fused)
- Next = Fused->Next;
- else
- Next = Root->Next;
- }
- }
- , [&](MultipleShape* Multiple) {
- if (Multiple->Breaks)
- LoopStack.push(Multiple);
- for (const auto &iter : Multiple->InnerMap)
- FindLabeledLoops(iter.second);
- if (Multiple->Breaks)
- LoopStack.pop();
- Next = Root->Next;
- }
- , [&](LoopShape* Loop) {
- LoopStack.push(Loop);
- FindLabeledLoops(Loop->Inner);
- LoopStack.pop();
- Next = Root->Next;
- });
- }
- }
-
- void Process(Shape * Root) {
- FindNaturals(Root);
- RemoveUnneededFlows(Root);
- FindLabeledLoops(Root);
- }
- };
-
- PostOptimizer(this).Process(Root);
-}
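As an aside, the break-elimination described in the removed comments above
(turning "if (break?) { break; } .." into "if (!break?) { .. }") can be shown
with a tiny standalone sketch. This is plain C++ for illustration only, not
Relooper output; it assumes the break target is the block's natural
fall-through, so both forms reach the same place:

    #include <cstdio>

    static void work() { std::printf("work\n"); }

    int main() {
      bool cond = false;
      // Form A: a breakable block with a conditional break, as emitted
      // before the optimization.
      do {
        if (cond)
          break;
        work();
      } while (0);
      // Form B: the same behavior with the work nested under the negated
      // condition; the break (and the wrapping block) are no longer needed.
      if (!cond)
        work();
      return 0;
    }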
diff --git a/lib/Target/WebAssembly/Relooper.h b/lib/Target/WebAssembly/Relooper.h
deleted file mode 100644
index 7c564de82f34..000000000000
--- a/lib/Target/WebAssembly/Relooper.h
+++ /dev/null
@@ -1,186 +0,0 @@
-//===-- Relooper.h - Top-level interface for WebAssembly ----*- C++ -*-===//
-//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
-//
-//===-------------------------------------------------------------------===//
-///
-/// \file
-/// \brief This defines an optimized C++ implementation of the Relooper
-/// algorithm, originally developed as part of Emscripten, which
-/// generates a structured AST from arbitrary control flow.
-///
-//===-------------------------------------------------------------------===//
-
-#include "llvm/ADT/MapVector.h"
-#include "llvm/ADT/SetVector.h"
-#include "llvm/Support/Casting.h"
-
-#include <cassert>
-#include <cstdarg>
-#include <cstdio>
-#include <deque>
-#include <list>
-#include <map>
-#include <memory>
-#include <set>
-
-namespace llvm {
-
-namespace Relooper {
-
-struct Block;
-struct Shape;
-
-///
-/// Info about a branching from one block to another
-///
-struct Branch {
- enum FlowType {
- Direct = 0, // We will directly reach the right location through other
- // means, no need for continue or break
- Break = 1,
- Continue = 2,
- Nested = 3 // This code is directly reached, but we must be careful to
- // ensure it is nested in an if - it is not reached
- // unconditionally, other code paths exist alongside it that we need to make
- // sure do not intertwine
- };
- Shape
- *Ancestor; // If not nullptr, this shape is the relevant one for purposes
- // of getting to the target block. We break or continue on it
- Branch::FlowType
- Type; // If Ancestor is not nullptr, this says whether to break or
- // continue
- bool Labeled; // If a break or continue, whether we need to use a label
- const char *Condition; // The condition for which we branch. For example,
- // "my_var == 1". Conditions are checked one by one.
- // One of the conditions should have nullptr as the
- // condition, in which case it is the default
- // FIXME: move from char* to LLVM data structures
- const char *Code; // If provided, code that is run right before the branch is
- // taken. This is useful for phis
- // FIXME: move from char* to LLVM data structures
-
- Branch(const char *ConditionInit, const char *CodeInit = nullptr);
- ~Branch();
-};
-
-typedef SetVector<Block *> BlockSet;
-typedef MapVector<Block *, Branch *> BlockBranchMap;
-typedef MapVector<Block *, std::unique_ptr<Branch>> OwningBlockBranchMap;
-
-///
-/// Represents a basic block of code - some instructions that end with a
-/// control flow modifier (a branch, return or throw).
-///
-struct Block {
- // Branches become processed after we finish the shape relevant to them. For
- // example, when we recreate a loop, branches to the loop start become
- // continues and are now processed. When we calculate what shape to generate
- // from a set of blocks, we ignore processed branches. Blocks own the Branch
- // objects they use, and destroy them when done.
- OwningBlockBranchMap BranchesOut;
- BlockSet BranchesIn;
- OwningBlockBranchMap ProcessedBranchesOut;
- BlockSet ProcessedBranchesIn;
- Shape *Parent; // The shape we are directly inside
- int Id; // A unique identifier, defined when added to relooper. Note that this
- // uniquely identifies a *logical* block - if we split it, the two
- // instances have the same content *and* the same Id
- const char *Code; // The string representation of the code in this block.
- // Owning pointer (we copy the input)
- // FIXME: move from char* to LLVM data structures
- const char *BranchVar; // A variable whose value determines where we go; if
- // this is not nullptr, emit a switch on that variable
- // FIXME: move from char* to LLVM data structures
- bool IsCheckedMultipleEntry; // If true, we are a multiple entry, so reaching
- // us requires setting the label variable
-
- Block(const char *CodeInit, const char *BranchVarInit);
- ~Block();
-
- void AddBranchTo(Block *Target, const char *Condition,
- const char *Code = nullptr);
-};
-
-///
-/// Represents a structured control flow shape
-///
-struct Shape {
- int Id; // A unique identifier. Used to identify loops, labels are Lx where x
- // is the Id. Defined when added to relooper
- Shape *Next; // The shape that will appear in the code right after this one
- Shape *Natural; // The shape that control flow gets to naturally (if there is
- // Next, then this is Next)
-
- /// Discriminator for LLVM-style RTTI (dyn_cast<> et al.)
- enum ShapeKind { SK_Simple, SK_Multiple, SK_Loop };
-
-private:
- ShapeKind Kind;
-
-public:
- ShapeKind getKind() const { return Kind; }
-
- Shape(ShapeKind KindInit) : Id(-1), Next(nullptr), Kind(KindInit) {}
-};
-
-///
-/// Simple: No control flow at all, just instructions.
-///
-struct SimpleShape : public Shape {
- Block *Inner;
-
- SimpleShape() : Shape(SK_Simple), Inner(nullptr) {}
-
- static bool classof(const Shape *S) { return S->getKind() == SK_Simple; }
-};
-
-///
-/// A shape that may be implemented with a labeled loop.
-///
-struct LabeledShape : public Shape {
- bool Labeled; // If we have a loop, whether it needs to be labeled
-
- LabeledShape(ShapeKind KindInit) : Shape(KindInit), Labeled(false) {}
-};
-
-// Blocks with the same id were split and are identical, so we just care about
-// ids in Multiple entries
-typedef std::map<int, Shape *> IdShapeMap;
-
-///
-/// Multiple: A shape with more than one entry. If the next block to
-/// be entered is among them, we run it and continue to
-/// the next shape, otherwise we continue immediately to the
-/// next shape.
-///
-struct MultipleShape : public LabeledShape {
- IdShapeMap InnerMap; // entry block ID -> shape
- int Breaks; // If we have branches on us, we need a loop (or a switch). This
- // is a counter of requirements,
- // if we optimize it to 0, the loop is unneeded
- bool UseSwitch; // Whether to switch on label as opposed to an if-else chain
-
- MultipleShape() : LabeledShape(SK_Multiple), Breaks(0), UseSwitch(false) {}
-
- static bool classof(const Shape *S) { return S->getKind() == SK_Multiple; }
-};
-
-///
-/// Loop: An infinite loop.
-///
-struct LoopShape : public LabeledShape {
- Shape *Inner;
-
- LoopShape() : LabeledShape(SK_Loop), Inner(nullptr) {}
-
- static bool classof(const Shape *S) { return S->getKind() == SK_Loop; }
-};
-
-} // namespace Relooper
-
-} // namespace llvm
diff --git a/lib/Target/WebAssembly/TargetInfo/Makefile b/lib/Target/WebAssembly/TargetInfo/Makefile
deleted file mode 100644
index b021eb6d9455..000000000000
--- a/lib/Target/WebAssembly/TargetInfo/Makefile
+++ /dev/null
@@ -1,15 +0,0 @@
-##===- lib/Target/WebAssembly/TargetInfo/Makefile ----------*- Makefile -*-===##
-#
-# The LLVM Compiler Infrastructure
-#
-# This file is distributed under the University of Illinois Open Source
-# License. See LICENSE.TXT for details.
-#
-##===----------------------------------------------------------------------===##
-LEVEL = ../../../..
-LIBRARYNAME = LLVMWebAssemblyInfo
-
-# Hack: we need to include 'main' target directory to grab private headers
-CPPFLAGS = -I$(PROJ_OBJ_DIR)/.. -I$(PROJ_SRC_DIR)/..
-
-include $(LEVEL)/Makefile.common
diff --git a/lib/Target/WebAssembly/WebAssembly.h b/lib/Target/WebAssembly/WebAssembly.h
index e972da5af74f..957f31cae222 100644
--- a/lib/Target/WebAssembly/WebAssembly.h
+++ b/lib/Target/WebAssembly/WebAssembly.h
@@ -23,23 +23,28 @@ namespace llvm {
class WebAssemblyTargetMachine;
class FunctionPass;
+// LLVM IR passes.
FunctionPass *createWebAssemblyOptimizeReturned();
+// ISel and immediate followup passes.
FunctionPass *createWebAssemblyISelDag(WebAssemblyTargetMachine &TM,
CodeGenOpt::Level OptLevel);
FunctionPass *createWebAssemblyArgumentMove();
+FunctionPass *createWebAssemblySetP2AlignOperands();
+// Late passes.
+FunctionPass *createWebAssemblyReplacePhysRegs();
+FunctionPass *createWebAssemblyPrepareForLiveIntervals();
+FunctionPass *createWebAssemblyOptimizeLiveIntervals();
FunctionPass *createWebAssemblyStoreResults();
FunctionPass *createWebAssemblyRegStackify();
FunctionPass *createWebAssemblyRegColoring();
-FunctionPass *createWebAssemblyPEI();
+FunctionPass *createWebAssemblyFixIrreducibleControlFlow();
FunctionPass *createWebAssemblyCFGStackify();
FunctionPass *createWebAssemblyLowerBrUnless();
FunctionPass *createWebAssemblyRegNumbering();
FunctionPass *createWebAssemblyPeephole();
-FunctionPass *createWebAssemblyRelooper();
-
} // end namespace llvm
#endif
diff --git a/lib/Target/WebAssembly/WebAssemblyArgumentMove.cpp b/lib/Target/WebAssembly/WebAssemblyArgumentMove.cpp
index 3893c408cf63..5887f45371fc 100644
--- a/lib/Target/WebAssembly/WebAssemblyArgumentMove.cpp
+++ b/lib/Target/WebAssembly/WebAssemblyArgumentMove.cpp
@@ -65,8 +65,8 @@ FunctionPass *llvm::createWebAssemblyArgumentMove() {
}
/// Test whether the given instruction is an ARGUMENT.
-static bool IsArgument(const MachineInstr *MI) {
- switch (MI->getOpcode()) {
+static bool IsArgument(const MachineInstr &MI) {
+ switch (MI.getOpcode()) {
case WebAssembly::ARGUMENT_I32:
case WebAssembly::ARGUMENT_I64:
case WebAssembly::ARGUMENT_F32:
@@ -88,20 +88,18 @@ bool WebAssemblyArgumentMove::runOnMachineFunction(MachineFunction &MF) {
MachineBasicBlock::iterator InsertPt = EntryMBB.end();
// Look for the first NonArg instruction.
- for (auto MII = EntryMBB.begin(), MIE = EntryMBB.end(); MII != MIE; ++MII) {
- MachineInstr *MI = MII;
+ for (MachineInstr &MI : EntryMBB) {
if (!IsArgument(MI)) {
- InsertPt = MII;
+ InsertPt = MI;
break;
}
}
// Now move any argument instructions later in the block
// to before our first NonArg instruction.
- for (auto I = InsertPt, E = EntryMBB.end(); I != E; ++I) {
- MachineInstr *MI = I;
+ for (MachineInstr &MI : llvm::make_range(InsertPt, EntryMBB.end())) {
if (IsArgument(MI)) {
- EntryMBB.insert(InsertPt, MI->removeFromParent());
+ EntryMBB.insert(InsertPt, MI.removeFromParent());
Changed = true;
}
}
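The reordering above can be modeled with a small standalone sketch (plain C++
with strings standing in for MachineInstrs; purely illustrative): every
ARGUMENT-like element that appears after the first non-argument element is
spliced to just before that point, preserving the arguments' relative order.

    #include <cstdio>
    #include <list>
    #include <string>

    int main() {
      std::list<std::string> Block = {"ARGUMENT 0", "add", "ARGUMENT 1", "mul"};
      // Find the first non-argument element.
      auto InsertPt = Block.begin();
      while (InsertPt != Block.end() && InsertPt->rfind("ARGUMENT", 0) == 0)
        ++InsertPt;
      // Hoist any later arguments to just before that point.
      for (auto I = InsertPt; I != Block.end();) {
        auto Cur = I++;
        if (Cur->rfind("ARGUMENT", 0) == 0)
          Block.splice(InsertPt, Block, Cur);
      }
      for (const auto &S : Block)
        std::printf("%s\n", S.c_str()); // ARGUMENT 0, ARGUMENT 1, add, mul
      return 0;
    }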
diff --git a/lib/Target/WebAssembly/WebAssemblyAsmPrinter.cpp b/lib/Target/WebAssembly/WebAssemblyAsmPrinter.cpp
index 45ac99d90ed9..54e9f7f52901 100644
--- a/lib/Target/WebAssembly/WebAssemblyAsmPrinter.cpp
+++ b/lib/Target/WebAssembly/WebAssemblyAsmPrinter.cpp
@@ -67,6 +67,7 @@ private:
// AsmPrinter Implementation.
//===------------------------------------------------------------------===//
+ void EmitEndOfAsmFile(Module &M) override;
void EmitJumpTableInfo() override;
void EmitConstantPool() override;
void EmitFunctionBodyStart() override;
@@ -93,10 +94,7 @@ private:
//===----------------------------------------------------------------------===//
MVT WebAssemblyAsmPrinter::getRegType(unsigned RegNo) const {
- const TargetRegisterClass *TRC =
- TargetRegisterInfo::isVirtualRegister(RegNo)
- ? MRI->getRegClass(RegNo)
- : MRI->getTargetRegisterInfo()->getMinimalPhysRegClass(RegNo);
+ const TargetRegisterClass *TRC = MRI->getRegClass(RegNo);
for (MVT T : {MVT::i32, MVT::i64, MVT::f32, MVT::f64})
if (TRC->hasType(T))
return T;
@@ -119,8 +117,7 @@ std::string WebAssemblyAsmPrinter::regToString(const MachineOperand &MO) {
return '$' + utostr(WAReg);
}
-WebAssemblyTargetStreamer *
-WebAssemblyAsmPrinter::getTargetStreamer() {
+WebAssemblyTargetStreamer *WebAssemblyAsmPrinter::getTargetStreamer() {
MCTargetStreamer *TS = OutStreamer->getTargetStreamer();
return static_cast<WebAssemblyTargetStreamer *>(TS);
}
@@ -128,16 +125,6 @@ WebAssemblyAsmPrinter::getTargetStreamer() {
//===----------------------------------------------------------------------===//
// WebAssemblyAsmPrinter Implementation.
//===----------------------------------------------------------------------===//
-
-void WebAssemblyAsmPrinter::EmitConstantPool() {
- assert(MF->getConstantPool()->getConstants().empty() &&
- "WebAssembly disables constant pools");
-}
-
-void WebAssemblyAsmPrinter::EmitJumpTableInfo() {
- // Nothing to do; jump tables are incorporated into the instruction stream.
-}
-
static void ComputeLegalValueVTs(const Function &F, const TargetMachine &TM,
Type *Ty, SmallVectorImpl<MVT> &ValueVTs) {
const DataLayout &DL(F.getParent()->getDataLayout());
@@ -154,6 +141,42 @@ static void ComputeLegalValueVTs(const Function &F, const TargetMachine &TM,
}
}
+void WebAssemblyAsmPrinter::EmitEndOfAsmFile(Module &M) {
+ for (const auto &F : M) {
+ // Emit function type info for all undefined functions
+ if (F.isDeclarationForLinker() && !F.isIntrinsic()) {
+ SmallVector<MVT, 4> SignatureVTs;
+ ComputeLegalValueVTs(F, TM, F.getReturnType(), SignatureVTs);
+ size_t NumResults = SignatureVTs.size();
+ if (SignatureVTs.size() > 1) {
+ // WebAssembly currently can't lower returns of multiple values without
+ // demoting to sret (see WebAssemblyTargetLowering::CanLowerReturn). So
+ // replace multiple return values with a pointer parameter.
+ SignatureVTs.clear();
+ SignatureVTs.push_back(
+ MVT::getIntegerVT(M.getDataLayout().getPointerSizeInBits()));
+ NumResults = 0;
+ }
+
+ for (auto &Arg : F.args()) {
+ ComputeLegalValueVTs(F, TM, Arg.getType(), SignatureVTs);
+ }
+
+ getTargetStreamer()->emitIndirectFunctionType(F.getName(), SignatureVTs,
+ NumResults);
+ }
+ }
+}
+
+void WebAssemblyAsmPrinter::EmitConstantPool() {
+ assert(MF->getConstantPool()->getConstants().empty() &&
+ "WebAssembly disables constant pools");
+}
+
+void WebAssemblyAsmPrinter::EmitJumpTableInfo() {
+ // Nothing to do; jump tables are incorporated into the instruction stream.
+}
+
void WebAssemblyAsmPrinter::EmitFunctionBodyStart() {
if (!MFI->getParams().empty())
getTargetStreamer()->emitParam(MFI->getParams());
@@ -184,13 +207,6 @@ void WebAssemblyAsmPrinter::EmitFunctionBodyStart() {
LocalTypes.push_back(getRegType(VReg));
AnyWARegs = true;
}
- auto &PhysRegs = MFI->getPhysRegs();
- for (unsigned PReg = 0; PReg < PhysRegs.size(); ++PReg) {
- if (PhysRegs[PReg] == -1U)
- continue;
- LocalTypes.push_back(getRegType(PReg));
- AnyWARegs = true;
- }
if (AnyWARegs)
getTargetStreamer()->emitLocal(LocalTypes);
@@ -212,6 +228,30 @@ void WebAssemblyAsmPrinter::EmitInstruction(const MachineInstr *MI) {
// These represent values which are live into the function entry, so there's
// no instruction to emit.
break;
+ case WebAssembly::FALLTHROUGH_RETURN_I32:
+ case WebAssembly::FALLTHROUGH_RETURN_I64:
+ case WebAssembly::FALLTHROUGH_RETURN_F32:
+ case WebAssembly::FALLTHROUGH_RETURN_F64: {
+ // These instructions represent the implicit return at the end of a
+ // function body. The operand is always a pop.
+ assert(MFI->isVRegStackified(MI->getOperand(0).getReg()));
+
+ if (isVerbose()) {
+ OutStreamer->AddComment("fallthrough-return: $pop" +
+ utostr(MFI->getWARegStackId(
+ MFI->getWAReg(MI->getOperand(0).getReg()))));
+ OutStreamer->AddBlankLine();
+ }
+ break;
+ }
+ case WebAssembly::FALLTHROUGH_RETURN_VOID:
+ // This instruction represents the implicit return at the end of a
+ // function body with no return value.
+ if (isVerbose()) {
+ OutStreamer->AddComment("fallthrough-return");
+ OutStreamer->AddBlankLine();
+ }
+ break;
default: {
WebAssemblyMCInstLower MCInstLowering(OutContext, *this);
MCInst TmpInst;
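The signature computation in EmitEndOfAsmFile above can be sketched standalone:
if a declared function would return more than one legal value, the results are
demoted to a single pointer parameter (sret style) and the result count becomes
zero. Types are modeled as strings here, and the pointer type is assumed to be
i32 (wasm32); this is illustrative only, not the patch's code.

    #include <cstdio>
    #include <string>
    #include <vector>

    int main() {
      std::vector<std::string> SignatureVTs = {"i64", "i32"}; // legalized results
      std::vector<std::string> ArgVTs = {"i32", "f64"};       // legalized params
      size_t NumResults = SignatureVTs.size();
      if (SignatureVTs.size() > 1) {
        // Demote multiple return values to a pointer parameter.
        SignatureVTs.clear();
        SignatureVTs.push_back("i32"); // pointer-sized integer, wasm32 assumed
        NumResults = 0;
      }
      for (const auto &A : ArgVTs)
        SignatureVTs.push_back(A);
      std::printf("results: %zu, signature:", NumResults);
      for (const auto &T : SignatureVTs)
        std::printf(" %s", T.c_str());
      std::printf("\n");
      return 0;
    }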
diff --git a/lib/Target/WebAssembly/WebAssemblyCFGStackify.cpp b/lib/Target/WebAssembly/WebAssemblyCFGStackify.cpp
index a39349c562fd..33166f5b554f 100644
--- a/lib/Target/WebAssembly/WebAssemblyCFGStackify.cpp
+++ b/lib/Target/WebAssembly/WebAssemblyCFGStackify.cpp
@@ -10,10 +10,10 @@
/// \file
/// \brief This file implements a CFG stacking pass.
///
-/// This pass reorders the blocks in a function to put them into a reverse
-/// post-order [0], with special care to keep the order as similar as possible
-/// to the original order, and to keep loops contiguous even in the case of
-/// split backedges.
+/// This pass reorders the blocks in a function to put them into topological
+/// order, ignoring loop backedges, and without any loop being interrupted
+/// by a block not dominated by the loop header, with special care to keep the
+/// order as similar as possible to the original order.
///
/// Then, it inserts BLOCK and LOOP markers to mark the start of scopes, since
/// scope boundaries serve as the labels for WebAssembly's control transfers.
@@ -21,14 +21,13 @@
/// This is sufficient to convert arbitrary CFGs into a form that works on
/// WebAssembly, provided that all loops are single-entry.
///
-/// [0] https://en.wikipedia.org/wiki/Depth-first_search#Vertex_orderings
-///
//===----------------------------------------------------------------------===//
#include "WebAssembly.h"
#include "MCTargetDesc/WebAssemblyMCTargetDesc.h"
+#include "WebAssemblyMachineFunctionInfo.h"
#include "WebAssemblySubtarget.h"
-#include "llvm/ADT/SCCIterator.h"
+#include "llvm/ADT/PriorityQueue.h"
#include "llvm/ADT/SetVector.h"
#include "llvm/CodeGen/MachineDominators.h"
#include "llvm/CodeGen/MachineFunction.h"
@@ -70,90 +69,6 @@ FunctionPass *llvm::createWebAssemblyCFGStackify() {
return new WebAssemblyCFGStackify();
}
-static void EliminateMultipleEntryLoops(MachineFunction &MF,
- const MachineLoopInfo &MLI) {
- SmallPtrSet<MachineBasicBlock *, 8> InSet;
- for (scc_iterator<MachineFunction *> I = scc_begin(&MF), E = scc_end(&MF);
- I != E; ++I) {
- const std::vector<MachineBasicBlock *> &CurrentSCC = *I;
-
- // Skip trivial SCCs.
- if (CurrentSCC.size() == 1)
- continue;
-
- InSet.insert(CurrentSCC.begin(), CurrentSCC.end());
- MachineBasicBlock *Header = nullptr;
- for (MachineBasicBlock *MBB : CurrentSCC) {
- for (MachineBasicBlock *Pred : MBB->predecessors()) {
- if (InSet.count(Pred))
- continue;
- if (!Header) {
- Header = MBB;
- break;
- }
- // TODO: Implement multiple-entry loops.
- report_fatal_error("multiple-entry loops are not supported yet");
- }
- }
- assert(MLI.isLoopHeader(Header));
-
- InSet.clear();
- }
-}
-
-namespace {
-/// Post-order traversal stack entry.
-struct POStackEntry {
- MachineBasicBlock *MBB;
- SmallVector<MachineBasicBlock *, 0> Succs;
-
- POStackEntry(MachineBasicBlock *MBB, MachineFunction &MF,
- const MachineLoopInfo &MLI);
-};
-} // end anonymous namespace
-
-static bool LoopContains(const MachineLoop *Loop,
- const MachineBasicBlock *MBB) {
- return Loop ? Loop->contains(MBB) : true;
-}
-
-POStackEntry::POStackEntry(MachineBasicBlock *MBB, MachineFunction &MF,
- const MachineLoopInfo &MLI)
- : MBB(MBB), Succs(MBB->successors()) {
- // RPO is not a unique form, since at every basic block with multiple
- // successors, the DFS has to pick which order to visit the successors in.
- // Sort them strategically (see below).
- MachineLoop *Loop = MLI.getLoopFor(MBB);
- MachineFunction::iterator Next = next(MachineFunction::iterator(MBB));
- MachineBasicBlock *LayoutSucc = Next == MF.end() ? nullptr : &*Next;
- std::stable_sort(
- Succs.begin(), Succs.end(),
- [=, &MLI](const MachineBasicBlock *A, const MachineBasicBlock *B) {
- if (A == B)
- return false;
-
- // Keep loops contiguous by preferring the block that's in the same
- // loop.
- bool LoopContainsA = LoopContains(Loop, A);
- bool LoopContainsB = LoopContains(Loop, B);
- if (LoopContainsA && !LoopContainsB)
- return true;
- if (!LoopContainsA && LoopContainsB)
- return false;
-
- // Minimize perturbation by preferring the block which is the immediate
- // layout successor.
- if (A == LayoutSucc)
- return true;
- if (B == LayoutSucc)
- return false;
-
- // TODO: More sophisticated orderings may be profitable here.
-
- return false;
- });
-}
-
/// Return the "bottom" block of a loop. This differs from
/// MachineLoop::getBottomBlock in that it works even if the loop is
/// discontiguous.
@@ -165,53 +80,166 @@ static MachineBasicBlock *LoopBottom(const MachineLoop *Loop) {
return Bottom;
}
-/// Sort the blocks in RPO, taking special care to make sure that loops are
-/// contiguous even in the case of split backedges.
-///
-/// TODO: Determine whether RPO is actually worthwhile, or whether we should
-/// move to just a stable-topological-sort-based approach that would preserve
-/// more of the original order.
-static void SortBlocks(MachineFunction &MF, const MachineLoopInfo &MLI) {
- // Note that we do our own RPO rather than using
- // "llvm/ADT/PostOrderIterator.h" because we want control over the order that
- // successors are visited in (see above). Also, we can sort the blocks in the
- // MachineFunction as we go.
- SmallPtrSet<MachineBasicBlock *, 16> Visited;
- SmallVector<POStackEntry, 16> Stack;
-
- MachineBasicBlock *EntryBlock = &*MF.begin();
- Visited.insert(EntryBlock);
- Stack.push_back(POStackEntry(EntryBlock, MF, MLI));
-
- for (;;) {
- POStackEntry &Entry = Stack.back();
- SmallVectorImpl<MachineBasicBlock *> &Succs = Entry.Succs;
- if (!Succs.empty()) {
- MachineBasicBlock *Succ = Succs.pop_back_val();
- if (Visited.insert(Succ).second)
- Stack.push_back(POStackEntry(Succ, MF, MLI));
- continue;
- }
+static void MaybeUpdateTerminator(MachineBasicBlock *MBB) {
+#ifndef NDEBUG
+ bool AnyBarrier = false;
+#endif
+ bool AllAnalyzable = true;
+ for (const MachineInstr &Term : MBB->terminators()) {
+#ifndef NDEBUG
+ AnyBarrier |= Term.isBarrier();
+#endif
+ AllAnalyzable &= Term.isBranch() && !Term.isIndirectBranch();
+ }
+ assert((AnyBarrier || AllAnalyzable) &&
+ "AnalyzeBranch needs to analyze any block with a fallthrough");
+ if (AllAnalyzable)
+ MBB->updateTerminator();
+}
- // Put the block in its position in the MachineFunction.
- MachineBasicBlock &MBB = *Entry.MBB;
- MBB.moveBefore(&*MF.begin());
-
- // Branch instructions may utilize a fallthrough, so update them if a
- // fallthrough has been added or removed.
- if (!MBB.empty() && MBB.back().isTerminator() && !MBB.back().isBranch() &&
- !MBB.back().isBarrier())
- report_fatal_error(
- "Non-branch terminator with fallthrough cannot yet be rewritten");
- if (MBB.empty() || !MBB.back().isTerminator() || MBB.back().isBranch())
- MBB.updateTerminator();
-
- Stack.pop_back();
- if (Stack.empty())
- break;
+namespace {
+/// Sort blocks by their number.
+struct CompareBlockNumbers {
+ bool operator()(const MachineBasicBlock *A,
+ const MachineBasicBlock *B) const {
+ return A->getNumber() > B->getNumber();
+ }
+};
+/// Sort blocks by their number in the opposite order.
+struct CompareBlockNumbersBackwards {
+ bool operator()(const MachineBasicBlock *A,
+ const MachineBasicBlock *B) const {
+ return A->getNumber() < B->getNumber();
}
+};
+/// Bookkeeping for a loop to help ensure that we don't mix blocks not dominated
+/// by the loop header among the loop's blocks.
+struct Entry {
+ const MachineLoop *Loop;
+ unsigned NumBlocksLeft;
+
+ /// List of blocks not dominated by Loop's header that are deferred until
+ /// after all of Loop's blocks have been seen.
+ std::vector<MachineBasicBlock *> Deferred;
+
+ explicit Entry(const MachineLoop *L)
+ : Loop(L), NumBlocksLeft(L->getNumBlocks()) {}
+};
+}
- // Now that we've sorted the blocks in RPO, renumber them.
+/// Sort the blocks, taking special care to make sure that loops are not
+/// interrupted by blocks not dominated by their header.
+/// TODO: There are many opportunities for improving the heuristics here.
+/// Explore them.
+static void SortBlocks(MachineFunction &MF, const MachineLoopInfo &MLI,
+ const MachineDominatorTree &MDT) {
+ // Prepare for a topological sort: Record the number of predecessors each
+ // block has, ignoring loop backedges.
+ MF.RenumberBlocks();
+ SmallVector<unsigned, 16> NumPredsLeft(MF.getNumBlockIDs(), 0);
+ for (MachineBasicBlock &MBB : MF) {
+ unsigned N = MBB.pred_size();
+ if (MachineLoop *L = MLI.getLoopFor(&MBB))
+ if (L->getHeader() == &MBB)
+ for (const MachineBasicBlock *Pred : MBB.predecessors())
+ if (L->contains(Pred))
+ --N;
+ NumPredsLeft[MBB.getNumber()] = N;
+ }
+
+ // Topological sort the CFG, with additional constraints:
+ // - Between a loop header and the last block in the loop, there can be
+ // no blocks not dominated by the loop header.
+ // - It's desirable to preserve the original block order when possible.
+  // We use two ready lists: Preferred and Ready. Preferred has recently
+  // processed successors, to help preserve block sequences from the original
+ // order. Ready has the remaining ready blocks.
+ PriorityQueue<MachineBasicBlock *, std::vector<MachineBasicBlock *>,
+ CompareBlockNumbers>
+ Preferred;
+ PriorityQueue<MachineBasicBlock *, std::vector<MachineBasicBlock *>,
+ CompareBlockNumbersBackwards>
+ Ready;
+ SmallVector<Entry, 4> Loops;
+ for (MachineBasicBlock *MBB = &MF.front();;) {
+ const MachineLoop *L = MLI.getLoopFor(MBB);
+ if (L) {
+ // If MBB is a loop header, add it to the active loop list. We can't put
+ // any blocks that it doesn't dominate until we see the end of the loop.
+ if (L->getHeader() == MBB)
+ Loops.push_back(Entry(L));
+ // For each active loop the block is in, decrement the count. If MBB is
+ // the last block in an active loop, take it off the list and pick up any
+ // blocks deferred because the header didn't dominate them.
+ for (Entry &E : Loops)
+ if (E.Loop->contains(MBB) && --E.NumBlocksLeft == 0)
+ for (auto DeferredBlock : E.Deferred)
+ Ready.push(DeferredBlock);
+ while (!Loops.empty() && Loops.back().NumBlocksLeft == 0)
+ Loops.pop_back();
+ }
+ // The main topological sort logic.
+ for (MachineBasicBlock *Succ : MBB->successors()) {
+ // Ignore backedges.
+ if (MachineLoop *SuccL = MLI.getLoopFor(Succ))
+ if (SuccL->getHeader() == Succ && SuccL->contains(MBB))
+ continue;
+ // Decrement the predecessor count. If it's now zero, it's ready.
+ if (--NumPredsLeft[Succ->getNumber()] == 0)
+ Preferred.push(Succ);
+ }
+ // Determine the block to follow MBB. First try to find a preferred block,
+ // to preserve the original block order when possible.
+ MachineBasicBlock *Next = nullptr;
+ while (!Preferred.empty()) {
+ Next = Preferred.top();
+ Preferred.pop();
+      // If Next isn't dominated by the top active loop header, defer it until
+      // that loop is done.
+ if (!Loops.empty() &&
+ !MDT.dominates(Loops.back().Loop->getHeader(), Next)) {
+ Loops.back().Deferred.push_back(Next);
+ Next = nullptr;
+ continue;
+ }
+ // If Next was originally ordered before MBB, and it isn't because it was
+ // loop-rotated above the header, it's not preferred.
+ if (Next->getNumber() < MBB->getNumber() &&
+ (!L || !L->contains(Next) ||
+ L->getHeader()->getNumber() < Next->getNumber())) {
+ Ready.push(Next);
+ Next = nullptr;
+ continue;
+ }
+ break;
+ }
+ // If we didn't find a suitable block in the Preferred list, check the
+ // general Ready list.
+ if (!Next) {
+ // If there are no more blocks to process, we're done.
+ if (Ready.empty()) {
+ MaybeUpdateTerminator(MBB);
+ break;
+ }
+ for (;;) {
+ Next = Ready.top();
+ Ready.pop();
+ // If Next isn't dominated by the top active loop header, defer it until
+ // that loop is done.
+ if (!Loops.empty() &&
+ !MDT.dominates(Loops.back().Loop->getHeader(), Next)) {
+ Loops.back().Deferred.push_back(Next);
+ continue;
+ }
+ break;
+ }
+ }
+ // Move the next block into place and iterate.
+ Next->moveAfter(MBB);
+ MaybeUpdateTerminator(MBB);
+ MBB = Next;
+ }
+ assert(Loops.empty() && "Active loop list not finished");
MF.RenumberBlocks();
#ifndef NDEBUG
@@ -266,12 +294,26 @@ static bool ExplicitlyBranchesTo(MachineBasicBlock *Pred,
return false;
}
+/// Test whether MI is a child of some other node in an expression tree.
+static bool IsChild(const MachineInstr &MI,
+ const WebAssemblyFunctionInfo &MFI) {
+ if (MI.getNumOperands() == 0)
+ return false;
+ const MachineOperand &MO = MI.getOperand(0);
+ if (!MO.isReg() || MO.isImplicit() || !MO.isDef())
+ return false;
+ unsigned Reg = MO.getReg();
+ return TargetRegisterInfo::isVirtualRegister(Reg) &&
+ MFI.isVRegStackified(Reg);
+}
+
/// Insert a BLOCK marker for branches to MBB (if needed).
static void PlaceBlockMarker(MachineBasicBlock &MBB, MachineFunction &MF,
SmallVectorImpl<MachineBasicBlock *> &ScopeTops,
const WebAssemblyInstrInfo &TII,
const MachineLoopInfo &MLI,
- MachineDominatorTree &MDT) {
+ MachineDominatorTree &MDT,
+ WebAssemblyFunctionInfo &MFI) {
// First compute the nearest common dominator of all forward non-fallthrough
// predecessors so that we minimize the time that the BLOCK is on the stack,
// which reduces overall stack height.
@@ -319,14 +361,15 @@ static void PlaceBlockMarker(MachineBasicBlock &MBB, MachineFunction &MF,
MachineLoop *HeaderLoop = MLI.getLoopFor(Header);
if (HeaderLoop && MBB.getNumber() > LoopBottom(HeaderLoop)->getNumber()) {
// Header is the header of a loop that does not lexically contain MBB, so
- // the BLOCK needs to be above the LOOP.
+ // the BLOCK needs to be above the LOOP, after any END constructs.
InsertPos = Header->begin();
+ while (InsertPos->getOpcode() != WebAssembly::LOOP)
+ ++InsertPos;
} else {
// Otherwise, insert the BLOCK as late in Header as we can, but before the
// beginning of the local expression tree and any nested BLOCKs.
InsertPos = Header->getFirstTerminator();
- while (InsertPos != Header->begin() &&
- prev(InsertPos)->definesRegister(WebAssembly::EXPR_STACK) &&
+ while (InsertPos != Header->begin() && IsChild(*prev(InsertPos), MFI) &&
prev(InsertPos)->getOpcode() != WebAssembly::LOOP &&
prev(InsertPos)->getOpcode() != WebAssembly::END_BLOCK &&
prev(InsertPos)->getOpcode() != WebAssembly::END_LOOP)
@@ -388,7 +431,7 @@ static void PlaceLoopMarker(
assert((!ScopeTops[AfterLoop->getNumber()] ||
ScopeTops[AfterLoop->getNumber()]->getNumber() < MBB.getNumber()) &&
- "With RPO we should visit the outer-most loop for a block first.");
+ "With block sorting the outermost loop for a block should be first.");
if (!ScopeTops[AfterLoop->getNumber()])
ScopeTops[AfterLoop->getNumber()] = &MBB;
}
@@ -409,7 +452,8 @@ GetDepth(const SmallVectorImpl<const MachineBasicBlock *> &Stack,
/// Insert LOOP and BLOCK markers at appropriate places.
static void PlaceMarkers(MachineFunction &MF, const MachineLoopInfo &MLI,
const WebAssemblyInstrInfo &TII,
- MachineDominatorTree &MDT) {
+ MachineDominatorTree &MDT,
+ WebAssemblyFunctionInfo &MFI) {
// For each block whose label represents the end of a scope, record the block
// which holds the beginning of the scope. This will allow us to quickly skip
// over scoped regions when walking blocks. We allocate one more than the
@@ -425,7 +469,7 @@ static void PlaceMarkers(MachineFunction &MF, const MachineLoopInfo &MLI,
PlaceLoopMarker(MBB, MF, ScopeTops, LoopTops, TII, MLI);
// Place the BLOCK for MBB if MBB is branched to from above.
- PlaceBlockMarker(MBB, MF, ScopeTops, TII, MLI, MDT);
+ PlaceBlockMarker(MBB, MF, ScopeTops, TII, MLI, MDT, MFI);
}
// Now rewrite references to basic blocks to be depth immediates.
@@ -478,16 +522,14 @@ bool WebAssemblyCFGStackify::runOnMachineFunction(MachineFunction &MF) {
auto &MDT = getAnalysis<MachineDominatorTree>();
// Liveness is not tracked for EXPR_STACK physreg.
const auto &TII = *MF.getSubtarget<WebAssemblySubtarget>().getInstrInfo();
+ WebAssemblyFunctionInfo &MFI = *MF.getInfo<WebAssemblyFunctionInfo>();
MF.getRegInfo().invalidateLiveness();
- // RPO sorting needs all loops to be single-entry.
- EliminateMultipleEntryLoops(MF, MLI);
-
- // Sort the blocks in RPO, with contiguous loops.
- SortBlocks(MF, MLI);
+ // Sort the blocks, with contiguous loops.
+ SortBlocks(MF, MLI, MDT);
// Place the BLOCK and LOOP markers to indicate the beginnings of scopes.
- PlaceMarkers(MF, MLI, TII, MDT);
+ PlaceMarkers(MF, MLI, TII, MDT, MFI);
return true;
}
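The core of the new block sort can be modeled by a standalone Kahn-style
topological sort that always picks the ready block with the smallest original
number, so the original layout is preserved whenever the edges allow it. This
is a deliberately simplified sketch: it assumes an acyclic CFG and omits the
backedge handling, the Preferred/Ready split, and the loop-deferral logic of
the real pass.

    #include <cstdio>
    #include <functional>
    #include <queue>
    #include <vector>

    int main() {
      // Adjacency list of a small acyclic CFG; block 0 is the entry.
      std::vector<std::vector<int>> Succs = {{1, 2}, {3}, {3}, {}};
      std::vector<int> NumPredsLeft(Succs.size(), 0);
      for (const auto &S : Succs)
        for (int T : S)
          ++NumPredsLeft[T];

      // Min-heap keyed on the original block number.
      std::priority_queue<int, std::vector<int>, std::greater<int>> Ready;
      Ready.push(0);
      while (!Ready.empty()) {
        int MBB = Ready.top();
        Ready.pop();
        std::printf("place block %d\n", MBB);
        for (int Succ : Succs[MBB])
          if (--NumPredsLeft[Succ] == 0)
            Ready.push(Succ);
      }
      return 0;
    }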
diff --git a/lib/Target/WebAssembly/WebAssemblyFastISel.cpp b/lib/Target/WebAssembly/WebAssemblyFastISel.cpp
index 1b761b1a9d73..7bfa4074849b 100644
--- a/lib/Target/WebAssembly/WebAssemblyFastISel.cpp
+++ b/lib/Target/WebAssembly/WebAssemblyFastISel.cpp
@@ -12,10 +12,13 @@
/// class. Some of the target-specific code is generated by tablegen in the file
/// WebAssemblyGenFastISel.inc, which is #included here.
///
+/// TODO: kill flags
+///
//===----------------------------------------------------------------------===//
#include "WebAssembly.h"
#include "MCTargetDesc/WebAssemblyMCTargetDesc.h"
+#include "WebAssemblyMachineFunctionInfo.h"
#include "WebAssemblySubtarget.h"
#include "WebAssemblyTargetMachine.h"
#include "llvm/Analysis/BranchProbabilityInfo.h"
@@ -41,13 +44,122 @@ using namespace llvm;
namespace {
class WebAssemblyFastISel final : public FastISel {
+ // All possible address modes.
+ class Address {
+ public:
+ typedef enum { RegBase, FrameIndexBase } BaseKind;
+
+ private:
+ BaseKind Kind;
+ union {
+ unsigned Reg;
+ int FI;
+ } Base;
+
+ int64_t Offset;
+
+ const GlobalValue *GV;
+
+ public:
+ // Innocuous defaults for our address.
+ Address() : Kind(RegBase), Offset(0), GV(0) { Base.Reg = 0; }
+ void setKind(BaseKind K) { Kind = K; }
+ BaseKind getKind() const { return Kind; }
+ bool isRegBase() const { return Kind == RegBase; }
+ bool isFIBase() const { return Kind == FrameIndexBase; }
+ void setReg(unsigned Reg) {
+ assert(isRegBase() && "Invalid base register access!");
+ Base.Reg = Reg;
+ }
+ unsigned getReg() const {
+ assert(isRegBase() && "Invalid base register access!");
+ return Base.Reg;
+ }
+ void setFI(unsigned FI) {
+ assert(isFIBase() && "Invalid base frame index access!");
+ Base.FI = FI;
+ }
+ unsigned getFI() const {
+ assert(isFIBase() && "Invalid base frame index access!");
+ return Base.FI;
+ }
+
+ void setOffset(int64_t Offset_) { Offset = Offset_; }
+ int64_t getOffset() const { return Offset; }
+ void setGlobalValue(const GlobalValue *G) { GV = G; }
+ const GlobalValue *getGlobalValue() const { return GV; }
+ };
+
/// Keep a pointer to the WebAssemblySubtarget around so that we can make the
/// right decision when generating code for different targets.
const WebAssemblySubtarget *Subtarget;
LLVMContext *Context;
- // Call handling routines.
private:
+ // Utility helper routines
+ MVT::SimpleValueType getSimpleType(Type *Ty) {
+ EVT VT = TLI.getValueType(DL, Ty, /*HandleUnknown=*/true);
+ return VT.isSimple() ? VT.getSimpleVT().SimpleTy :
+ MVT::INVALID_SIMPLE_VALUE_TYPE;
+ }
+ MVT::SimpleValueType getLegalType(MVT::SimpleValueType VT) {
+ switch (VT) {
+ case MVT::i1:
+ case MVT::i8:
+ case MVT::i16:
+ return MVT::i32;
+ case MVT::i32:
+ case MVT::i64:
+ case MVT::f32:
+ case MVT::f64:
+ return VT;
+ default:
+ break;
+ }
+ return MVT::INVALID_SIMPLE_VALUE_TYPE;
+ }
+ bool computeAddress(const Value *Obj, Address &Addr);
+ void materializeLoadStoreOperands(Address &Addr);
+ void addLoadStoreOperands(const Address &Addr, const MachineInstrBuilder &MIB,
+ MachineMemOperand *MMO);
+ unsigned maskI1Value(unsigned Reg, const Value *V);
+ unsigned getRegForI1Value(const Value *V, bool &Not);
+ unsigned zeroExtendToI32(unsigned Reg, const Value *V,
+ MVT::SimpleValueType From);
+ unsigned signExtendToI32(unsigned Reg, const Value *V,
+ MVT::SimpleValueType From);
+ unsigned zeroExtend(unsigned Reg, const Value *V,
+ MVT::SimpleValueType From,
+ MVT::SimpleValueType To);
+ unsigned signExtend(unsigned Reg, const Value *V,
+ MVT::SimpleValueType From,
+ MVT::SimpleValueType To);
+ unsigned getRegForUnsignedValue(const Value *V);
+ unsigned getRegForSignedValue(const Value *V);
+ unsigned getRegForPromotedValue(const Value *V, bool IsSigned);
+ unsigned notValue(unsigned Reg);
+ unsigned copyValue(unsigned Reg);
+
+ // Backend specific FastISel code.
+ unsigned fastMaterializeAlloca(const AllocaInst *AI) override;
+ unsigned fastMaterializeConstant(const Constant *C) override;
+ bool fastLowerArguments() override;
+
+ // Selection routines.
+ bool selectCall(const Instruction *I);
+ bool selectSelect(const Instruction *I);
+ bool selectTrunc(const Instruction *I);
+ bool selectZExt(const Instruction *I);
+ bool selectSExt(const Instruction *I);
+ bool selectICmp(const Instruction *I);
+ bool selectFCmp(const Instruction *I);
+ bool selectBitCast(const Instruction *I);
+ bool selectLoad(const Instruction *I);
+ bool selectStore(const Instruction *I);
+ bool selectBr(const Instruction *I);
+ bool selectRet(const Instruction *I);
+ bool selectUnreachable(const Instruction *I);
+
public:
// Backend specific FastISel code.
WebAssemblyFastISel(FunctionLoweringInfo &FuncInfo,
@@ -64,11 +176,1001 @@ public:
} // end anonymous namespace
+bool WebAssemblyFastISel::computeAddress(const Value *Obj, Address &Addr) {
+
+ const User *U = nullptr;
+ unsigned Opcode = Instruction::UserOp1;
+ if (const Instruction *I = dyn_cast<Instruction>(Obj)) {
+ // Don't walk into other basic blocks unless the object is an alloca from
+ // another block, otherwise it may not have a virtual register assigned.
+ if (FuncInfo.StaticAllocaMap.count(static_cast<const AllocaInst *>(Obj)) ||
+ FuncInfo.MBBMap[I->getParent()] == FuncInfo.MBB) {
+ Opcode = I->getOpcode();
+ U = I;
+ }
+ } else if (const ConstantExpr *C = dyn_cast<ConstantExpr>(Obj)) {
+ Opcode = C->getOpcode();
+ U = C;
+ }
+
+ if (auto *Ty = dyn_cast<PointerType>(Obj->getType()))
+ if (Ty->getAddressSpace() > 255)
+ // Fast instruction selection doesn't support the special
+ // address spaces.
+ return false;
+
+ if (const GlobalValue *GV = dyn_cast<GlobalValue>(Obj)) {
+ if (Addr.getGlobalValue())
+ return false;
+ Addr.setGlobalValue(GV);
+ return true;
+ }
+
+ switch (Opcode) {
+ default:
+ break;
+ case Instruction::BitCast: {
+ // Look through bitcasts.
+ return computeAddress(U->getOperand(0), Addr);
+ }
+ case Instruction::IntToPtr: {
+ // Look past no-op inttoptrs.
+ if (TLI.getValueType(DL, U->getOperand(0)->getType()) ==
+ TLI.getPointerTy(DL))
+ return computeAddress(U->getOperand(0), Addr);
+ break;
+ }
+ case Instruction::PtrToInt: {
+ // Look past no-op ptrtoints.
+ if (TLI.getValueType(DL, U->getType()) == TLI.getPointerTy(DL))
+ return computeAddress(U->getOperand(0), Addr);
+ break;
+ }
+ case Instruction::GetElementPtr: {
+ Address SavedAddr = Addr;
+ uint64_t TmpOffset = Addr.getOffset();
+ // Iterate through the GEP folding the constants into offsets where
+ // we can.
+ for (gep_type_iterator GTI = gep_type_begin(U), E = gep_type_end(U);
+ GTI != E; ++GTI) {
+ const Value *Op = GTI.getOperand();
+ if (StructType *STy = dyn_cast<StructType>(*GTI)) {
+ const StructLayout *SL = DL.getStructLayout(STy);
+ unsigned Idx = cast<ConstantInt>(Op)->getZExtValue();
+ TmpOffset += SL->getElementOffset(Idx);
+ } else {
+ uint64_t S = DL.getTypeAllocSize(GTI.getIndexedType());
+ for (;;) {
+ if (const ConstantInt *CI = dyn_cast<ConstantInt>(Op)) {
+ // Constant-offset addressing.
+ TmpOffset += CI->getSExtValue() * S;
+ break;
+ }
+ if (S == 1 && Addr.isRegBase() && Addr.getReg() == 0) {
+ // An unscaled add of a register. Set it as the new base.
+ Addr.setReg(getRegForValue(Op));
+ break;
+ }
+ if (canFoldAddIntoGEP(U, Op)) {
+ // A compatible add with a constant operand. Fold the constant.
+ ConstantInt *CI =
+ cast<ConstantInt>(cast<AddOperator>(Op)->getOperand(1));
+ TmpOffset += CI->getSExtValue() * S;
+ // Iterate on the other operand.
+ Op = cast<AddOperator>(Op)->getOperand(0);
+ continue;
+ }
+ // Unsupported
+ goto unsupported_gep;
+ }
+ }
+ }
+ // Try to grab the base operand now.
+ Addr.setOffset(TmpOffset);
+ if (computeAddress(U->getOperand(0), Addr))
+ return true;
+ // We failed, restore everything and try the other options.
+ Addr = SavedAddr;
+ unsupported_gep:
+ break;
+ }
+ case Instruction::Alloca: {
+ const AllocaInst *AI = cast<AllocaInst>(Obj);
+ DenseMap<const AllocaInst *, int>::iterator SI =
+ FuncInfo.StaticAllocaMap.find(AI);
+ if (SI != FuncInfo.StaticAllocaMap.end()) {
+ Addr.setKind(Address::FrameIndexBase);
+ Addr.setFI(SI->second);
+ return true;
+ }
+ break;
+ }
+ case Instruction::Add: {
+ // Adds of constants are common and easy enough.
+ const Value *LHS = U->getOperand(0);
+ const Value *RHS = U->getOperand(1);
+
+ if (isa<ConstantInt>(LHS))
+ std::swap(LHS, RHS);
+
+ if (const ConstantInt *CI = dyn_cast<ConstantInt>(RHS)) {
+ Addr.setOffset(Addr.getOffset() + CI->getSExtValue());
+ return computeAddress(LHS, Addr);
+ }
+
+ Address Backup = Addr;
+ if (computeAddress(LHS, Addr) && computeAddress(RHS, Addr))
+ return true;
+ Addr = Backup;
+
+ break;
+ }
+ case Instruction::Sub: {
+ // Subs of constants are common and easy enough.
+ const Value *LHS = U->getOperand(0);
+ const Value *RHS = U->getOperand(1);
+
+ if (const ConstantInt *CI = dyn_cast<ConstantInt>(RHS)) {
+ Addr.setOffset(Addr.getOffset() - CI->getSExtValue());
+ return computeAddress(LHS, Addr);
+ }
+ break;
+ }
+ }
+ Addr.setReg(getRegForValue(Obj));
+ return Addr.getReg() != 0;
+}
+
+void WebAssemblyFastISel::materializeLoadStoreOperands(Address &Addr) {
+ if (Addr.isRegBase()) {
+ unsigned Reg = Addr.getReg();
+ if (Reg == 0) {
+ Reg = createResultReg(Subtarget->hasAddr64() ?
+ &WebAssembly::I64RegClass :
+ &WebAssembly::I32RegClass);
+ unsigned Opc = Subtarget->hasAddr64() ?
+ WebAssembly::CONST_I64 :
+ WebAssembly::CONST_I32;
+ BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(Opc), Reg)
+ .addImm(0);
+ Addr.setReg(Reg);
+ }
+ }
+}
+
+void WebAssemblyFastISel::addLoadStoreOperands(const Address &Addr,
+ const MachineInstrBuilder &MIB,
+ MachineMemOperand *MMO) {
+ if (const GlobalValue *GV = Addr.getGlobalValue())
+ MIB.addGlobalAddress(GV, Addr.getOffset());
+ else
+ MIB.addImm(Addr.getOffset());
+
+ if (Addr.isRegBase())
+ MIB.addReg(Addr.getReg());
+ else
+ MIB.addFrameIndex(Addr.getFI());
+
+ // Set the alignment operand (this is rewritten in SetP2AlignOperands).
+ // TODO: Disable SetP2AlignOperands for FastISel and just do it here.
+ MIB.addImm(0);
+
+ MIB.addMemOperand(MMO);
+}
+
+unsigned WebAssemblyFastISel::maskI1Value(unsigned Reg, const Value *V) {
+ return zeroExtendToI32(Reg, V, MVT::i1);
+}
+
+unsigned WebAssemblyFastISel::getRegForI1Value(const Value *V, bool &Not) {
+ if (const ICmpInst *ICmp = dyn_cast<ICmpInst>(V))
+ if (const ConstantInt *C = dyn_cast<ConstantInt>(ICmp->getOperand(1)))
+ if (ICmp->isEquality() && C->isZero() && C->getType()->isIntegerTy(32)) {
+ Not = ICmp->isTrueWhenEqual();
+ return getRegForValue(ICmp->getOperand(0));
+ }
+
+ if (BinaryOperator::isNot(V)) {
+ Not = true;
+ return getRegForValue(BinaryOperator::getNotArgument(V));
+ }
+
+ Not = false;
+ return maskI1Value(getRegForValue(V), V);
+}
+
+unsigned WebAssemblyFastISel::zeroExtendToI32(unsigned Reg, const Value *V,
+ MVT::SimpleValueType From) {
+ switch (From) {
+ case MVT::i1:
+ // If the value is naturally an i1, we don't need to mask it.
+ // TODO: Recursively examine selects, phis, and, or, xor, constants.
+ if (From == MVT::i1 && V != nullptr) {
+ if (isa<CmpInst>(V) ||
+ (isa<Argument>(V) && cast<Argument>(V)->hasZExtAttr()))
+ return copyValue(Reg);
+ }
+ case MVT::i8:
+ case MVT::i16:
+ break;
+ case MVT::i32:
+ return copyValue(Reg);
+ default:
+ return 0;
+ }
+
+ unsigned Imm = createResultReg(&WebAssembly::I32RegClass);
+ BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
+ TII.get(WebAssembly::CONST_I32), Imm)
+ .addImm(~(~uint64_t(0) << MVT(From).getSizeInBits()));
+
+ unsigned Result = createResultReg(&WebAssembly::I32RegClass);
+ BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
+ TII.get(WebAssembly::AND_I32), Result)
+ .addReg(Reg)
+ .addReg(Imm);
+
+ return Result;
+}
+
+unsigned WebAssemblyFastISel::signExtendToI32(unsigned Reg, const Value *V,
+ MVT::SimpleValueType From) {
+ switch (From) {
+ case MVT::i1:
+ case MVT::i8:
+ case MVT::i16:
+ break;
+ case MVT::i32:
+ return copyValue(Reg);
+ default:
+ return 0;
+ }
+
+ unsigned Imm = createResultReg(&WebAssembly::I32RegClass);
+ BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
+ TII.get(WebAssembly::CONST_I32), Imm)
+ .addImm(32 - MVT(From).getSizeInBits());
+
+ unsigned Left = createResultReg(&WebAssembly::I32RegClass);
+ BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
+ TII.get(WebAssembly::SHL_I32), Left)
+ .addReg(Reg)
+ .addReg(Imm);
+
+ unsigned Right = createResultReg(&WebAssembly::I32RegClass);
+ BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
+ TII.get(WebAssembly::SHR_S_I32), Right)
+ .addReg(Left)
+ .addReg(Imm);
+
+ return Right;
+}
+
+unsigned WebAssemblyFastISel::zeroExtend(unsigned Reg, const Value *V,
+ MVT::SimpleValueType From,
+ MVT::SimpleValueType To) {
+ if (To == MVT::i64) {
+ if (From == MVT::i64)
+ return copyValue(Reg);
+
+ Reg = zeroExtendToI32(Reg, V, From);
+
+ unsigned Result = createResultReg(&WebAssembly::I64RegClass);
+ BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
+ TII.get(WebAssembly::I64_EXTEND_U_I32), Result)
+ .addReg(Reg);
+ return Result;
+ }
+
+ return zeroExtendToI32(Reg, V, From);
+}
+
+unsigned WebAssemblyFastISel::signExtend(unsigned Reg, const Value *V,
+ MVT::SimpleValueType From,
+ MVT::SimpleValueType To) {
+ if (To == MVT::i64) {
+ if (From == MVT::i64)
+ return copyValue(Reg);
+
+ Reg = signExtendToI32(Reg, V, From);
+
+ unsigned Result = createResultReg(&WebAssembly::I64RegClass);
+ BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
+ TII.get(WebAssembly::I64_EXTEND_S_I32), Result)
+ .addReg(Reg);
+ return Result;
+ }
+
+ return signExtendToI32(Reg, V, From);
+}
+
+unsigned WebAssemblyFastISel::getRegForUnsignedValue(const Value *V) {
+ MVT::SimpleValueType From = getSimpleType(V->getType());
+ MVT::SimpleValueType To = getLegalType(From);
+ return zeroExtend(getRegForValue(V), V, From, To);
+}
+
+unsigned WebAssemblyFastISel::getRegForSignedValue(const Value *V) {
+ MVT::SimpleValueType From = getSimpleType(V->getType());
+ MVT::SimpleValueType To = getLegalType(From);
+  return signExtend(getRegForValue(V), V, From, To);
+}
+
+unsigned WebAssemblyFastISel::getRegForPromotedValue(const Value *V,
+ bool IsSigned) {
+ return IsSigned ? getRegForSignedValue(V) :
+ getRegForUnsignedValue(V);
+}
+
+unsigned WebAssemblyFastISel::notValue(unsigned Reg) {
+ assert(MRI.getRegClass(Reg) == &WebAssembly::I32RegClass);
+
+ unsigned NotReg = createResultReg(&WebAssembly::I32RegClass);
+ BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
+ TII.get(WebAssembly::EQZ_I32), NotReg)
+ .addReg(Reg);
+ return NotReg;
+}
+
+unsigned WebAssemblyFastISel::copyValue(unsigned Reg) {
+ unsigned ResultReg = createResultReg(MRI.getRegClass(Reg));
+ BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
+ TII.get(WebAssembly::COPY), ResultReg)
+ .addReg(Reg);
+ return ResultReg;
+}
+
+unsigned WebAssemblyFastISel::fastMaterializeAlloca(const AllocaInst *AI) {
+ DenseMap<const AllocaInst *, int>::iterator SI =
+ FuncInfo.StaticAllocaMap.find(AI);
+
+ if (SI != FuncInfo.StaticAllocaMap.end()) {
+ unsigned ResultReg = createResultReg(Subtarget->hasAddr64() ?
+ &WebAssembly::I64RegClass :
+ &WebAssembly::I32RegClass);
+ unsigned Opc = Subtarget->hasAddr64() ?
+ WebAssembly::COPY_LOCAL_I64 :
+ WebAssembly::COPY_LOCAL_I32;
+ BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(Opc), ResultReg)
+ .addFrameIndex(SI->second);
+ return ResultReg;
+ }
+
+ return 0;
+}
+
+unsigned WebAssemblyFastISel::fastMaterializeConstant(const Constant *C) {
+ if (const GlobalValue *GV = dyn_cast<GlobalValue>(C)) {
+ unsigned ResultReg = createResultReg(Subtarget->hasAddr64() ?
+ &WebAssembly::I64RegClass :
+ &WebAssembly::I32RegClass);
+ unsigned Opc = Subtarget->hasAddr64() ?
+ WebAssembly::CONST_I64 :
+ WebAssembly::CONST_I32;
+ BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(Opc), ResultReg)
+ .addGlobalAddress(GV);
+ return ResultReg;
+ }
+
+ // Let target-independent code handle it.
+ return 0;
+}
+
+bool WebAssemblyFastISel::fastLowerArguments() {
+ if (!FuncInfo.CanLowerReturn)
+ return false;
+
+ const Function *F = FuncInfo.Fn;
+ if (F->isVarArg())
+ return false;
+
+ unsigned i = 0;
+ for (auto const &Arg : F->args()) {
+ const AttributeSet &Attrs = F->getAttributes();
+ if (Attrs.hasAttribute(i+1, Attribute::ByVal) ||
+ Attrs.hasAttribute(i+1, Attribute::SwiftSelf) ||
+ Attrs.hasAttribute(i+1, Attribute::SwiftError) ||
+ Attrs.hasAttribute(i+1, Attribute::InAlloca) ||
+ Attrs.hasAttribute(i+1, Attribute::Nest))
+ return false;
+
+ Type *ArgTy = Arg.getType();
+ if (ArgTy->isStructTy() || ArgTy->isArrayTy() || ArgTy->isVectorTy())
+ return false;
+
+ unsigned Opc;
+ const TargetRegisterClass *RC;
+ switch (getSimpleType(ArgTy)) {
+ case MVT::i1:
+ case MVT::i8:
+ case MVT::i16:
+ case MVT::i32:
+ Opc = WebAssembly::ARGUMENT_I32;
+ RC = &WebAssembly::I32RegClass;
+ break;
+ case MVT::i64:
+ Opc = WebAssembly::ARGUMENT_I64;
+ RC = &WebAssembly::I64RegClass;
+ break;
+ case MVT::f32:
+ Opc = WebAssembly::ARGUMENT_F32;
+ RC = &WebAssembly::F32RegClass;
+ break;
+ case MVT::f64:
+ Opc = WebAssembly::ARGUMENT_F64;
+ RC = &WebAssembly::F64RegClass;
+ break;
+ default:
+ return false;
+ }
+ unsigned ResultReg = createResultReg(RC);
+ BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(Opc), ResultReg)
+ .addImm(i);
+ updateValueMap(&Arg, ResultReg);
+
+ ++i;
+ }
+
+ MRI.addLiveIn(WebAssembly::ARGUMENTS);
+
+ auto *MFI = MF->getInfo<WebAssemblyFunctionInfo>();
+ for (auto const &Arg : F->args())
+ MFI->addParam(getLegalType(getSimpleType(Arg.getType())));
+
+ return true;
+}
+
+bool WebAssemblyFastISel::selectCall(const Instruction *I) {
+ const CallInst *Call = cast<CallInst>(I);
+
+ if (Call->isMustTailCall() || Call->isInlineAsm() ||
+ Call->getFunctionType()->isVarArg())
+ return false;
+
+ Function *Func = Call->getCalledFunction();
+ if (Func && Func->isIntrinsic())
+ return false;
+
+ FunctionType *FuncTy = Call->getFunctionType();
+ unsigned Opc;
+ bool IsDirect = Func != nullptr;
+ bool IsVoid = FuncTy->getReturnType()->isVoidTy();
+ unsigned ResultReg;
+ if (IsVoid) {
+ Opc = IsDirect ? WebAssembly::CALL_VOID : WebAssembly::CALL_INDIRECT_VOID;
+ } else {
+ MVT::SimpleValueType RetTy = getSimpleType(Call->getType());
+ switch (RetTy) {
+ case MVT::i1:
+ case MVT::i8:
+ case MVT::i16:
+ case MVT::i32:
+ Opc = IsDirect ? WebAssembly::CALL_I32 : WebAssembly::CALL_INDIRECT_I32;
+ ResultReg = createResultReg(&WebAssembly::I32RegClass);
+ break;
+ case MVT::i64:
+ Opc = IsDirect ? WebAssembly::CALL_I64 : WebAssembly::CALL_INDIRECT_I64;
+ ResultReg = createResultReg(&WebAssembly::I64RegClass);
+ break;
+ case MVT::f32:
+ Opc = IsDirect ? WebAssembly::CALL_F32 : WebAssembly::CALL_INDIRECT_F32;
+ ResultReg = createResultReg(&WebAssembly::F32RegClass);
+ break;
+ case MVT::f64:
+ Opc = IsDirect ? WebAssembly::CALL_F64 : WebAssembly::CALL_INDIRECT_F64;
+ ResultReg = createResultReg(&WebAssembly::F64RegClass);
+ break;
+ default:
+ return false;
+ }
+ }
+
+ SmallVector<unsigned, 8> Args;
+ for (unsigned i = 0, e = Call->getNumArgOperands(); i < e; ++i) {
+ Value *V = Call->getArgOperand(i);
+ MVT::SimpleValueType ArgTy = getSimpleType(V->getType());
+ if (ArgTy == MVT::INVALID_SIMPLE_VALUE_TYPE)
+ return false;
+
+ const AttributeSet &Attrs = Call->getAttributes();
+ if (Attrs.hasAttribute(i+1, Attribute::ByVal) ||
+ Attrs.hasAttribute(i+1, Attribute::SwiftSelf) ||
+ Attrs.hasAttribute(i+1, Attribute::SwiftError) ||
+ Attrs.hasAttribute(i+1, Attribute::InAlloca) ||
+ Attrs.hasAttribute(i+1, Attribute::Nest))
+ return false;
+
+ unsigned Reg;
+
+ if (Attrs.hasAttribute(i+1, Attribute::SExt))
+ Reg = getRegForSignedValue(V);
+ else if (Attrs.hasAttribute(i+1, Attribute::ZExt))
+ Reg = getRegForUnsignedValue(V);
+ else
+ Reg = getRegForValue(V);
+
+ if (Reg == 0)
+ return false;
+
+ Args.push_back(Reg);
+ }
+
+ auto MIB = BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(Opc));
+
+ if (!IsVoid)
+ MIB.addReg(ResultReg, RegState::Define);
+
+ if (IsDirect)
+ MIB.addGlobalAddress(Func);
+ else
+ MIB.addReg(getRegForValue(Call->getCalledValue()));
+
+ for (unsigned ArgReg : Args)
+ MIB.addReg(ArgReg);
+
+ if (!IsVoid)
+ updateValueMap(Call, ResultReg);
+ return true;
+}
+
+bool WebAssemblyFastISel::selectSelect(const Instruction *I) {
+ const SelectInst *Select = cast<SelectInst>(I);
+
+ bool Not;
+ unsigned CondReg = getRegForI1Value(Select->getCondition(), Not);
+ if (CondReg == 0)
+ return false;
+
+ unsigned TrueReg = getRegForValue(Select->getTrueValue());
+ if (TrueReg == 0)
+ return false;
+
+ unsigned FalseReg = getRegForValue(Select->getFalseValue());
+ if (FalseReg == 0)
+ return false;
+
+ if (Not)
+ std::swap(TrueReg, FalseReg);
+
+ unsigned Opc;
+ const TargetRegisterClass *RC;
+ switch (getSimpleType(Select->getType())) {
+ case MVT::i1:
+ case MVT::i8:
+ case MVT::i16:
+ case MVT::i32:
+ Opc = WebAssembly::SELECT_I32;
+ RC = &WebAssembly::I32RegClass;
+ break;
+ case MVT::i64:
+ Opc = WebAssembly::SELECT_I64;
+ RC = &WebAssembly::I64RegClass;
+ break;
+ case MVT::f32:
+ Opc = WebAssembly::SELECT_F32;
+ RC = &WebAssembly::F32RegClass;
+ break;
+ case MVT::f64:
+ Opc = WebAssembly::SELECT_F64;
+ RC = &WebAssembly::F64RegClass;
+ break;
+ default:
+ return false;
+ }
+
+ unsigned ResultReg = createResultReg(RC);
+ BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(Opc), ResultReg)
+ .addReg(TrueReg)
+ .addReg(FalseReg)
+ .addReg(CondReg);
+
+ updateValueMap(Select, ResultReg);
+ return true;
+}
+
+bool WebAssemblyFastISel::selectTrunc(const Instruction *I) {
+ const TruncInst *Trunc = cast<TruncInst>(I);
+
+ unsigned Reg = getRegForValue(Trunc->getOperand(0));
+ if (Reg == 0)
+ return false;
+
+ if (Trunc->getOperand(0)->getType()->isIntegerTy(64)) {
+ unsigned Result = createResultReg(&WebAssembly::I32RegClass);
+ BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
+ TII.get(WebAssembly::I32_WRAP_I64), Result)
+ .addReg(Reg);
+ Reg = Result;
+ }
+
+ updateValueMap(Trunc, Reg);
+ return true;
+}
+
+bool WebAssemblyFastISel::selectZExt(const Instruction *I) {
+ const ZExtInst *ZExt = cast<ZExtInst>(I);
+
+ const Value *Op = ZExt->getOperand(0);
+ MVT::SimpleValueType From = getSimpleType(Op->getType());
+ MVT::SimpleValueType To = getLegalType(getSimpleType(ZExt->getType()));
+ unsigned Reg = zeroExtend(getRegForValue(Op), Op, From, To);
+ if (Reg == 0)
+ return false;
+
+ updateValueMap(ZExt, Reg);
+ return true;
+}
+
+bool WebAssemblyFastISel::selectSExt(const Instruction *I) {
+ const SExtInst *SExt = cast<SExtInst>(I);
+
+ const Value *Op = SExt->getOperand(0);
+ MVT::SimpleValueType From = getSimpleType(Op->getType());
+ MVT::SimpleValueType To = getLegalType(getSimpleType(SExt->getType()));
+ unsigned Reg = signExtend(getRegForValue(Op), Op, From, To);
+ if (Reg == 0)
+ return false;
+
+ updateValueMap(SExt, Reg);
+ return true;
+}
+
+bool WebAssemblyFastISel::selectICmp(const Instruction *I) {
+ const ICmpInst *ICmp = cast<ICmpInst>(I);
+
+ bool I32 = getSimpleType(ICmp->getOperand(0)->getType()) != MVT::i64;
+ unsigned Opc;
+ bool isSigned = false;
+ switch (ICmp->getPredicate()) {
+ case ICmpInst::ICMP_EQ:
+ Opc = I32 ? WebAssembly::EQ_I32 : WebAssembly::EQ_I64;
+ break;
+ case ICmpInst::ICMP_NE:
+ Opc = I32 ? WebAssembly::NE_I32 : WebAssembly::NE_I64;
+ break;
+ case ICmpInst::ICMP_UGT:
+ Opc = I32 ? WebAssembly::GT_U_I32 : WebAssembly::GT_U_I64;
+ break;
+ case ICmpInst::ICMP_UGE:
+ Opc = I32 ? WebAssembly::GE_U_I32 : WebAssembly::GE_U_I64;
+ break;
+ case ICmpInst::ICMP_ULT:
+ Opc = I32 ? WebAssembly::LT_U_I32 : WebAssembly::LT_U_I64;
+ break;
+ case ICmpInst::ICMP_ULE:
+ Opc = I32 ? WebAssembly::LE_U_I32 : WebAssembly::LE_U_I64;
+ break;
+ case ICmpInst::ICMP_SGT:
+ Opc = I32 ? WebAssembly::GT_S_I32 : WebAssembly::GT_S_I64;
+ isSigned = true;
+ break;
+ case ICmpInst::ICMP_SGE:
+ Opc = I32 ? WebAssembly::GE_S_I32 : WebAssembly::GE_S_I64;
+ isSigned = true;
+ break;
+ case ICmpInst::ICMP_SLT:
+ Opc = I32 ? WebAssembly::LT_S_I32 : WebAssembly::LT_S_I64;
+ isSigned = true;
+ break;
+ case ICmpInst::ICMP_SLE:
+ Opc = I32 ? WebAssembly::LE_S_I32 : WebAssembly::LE_S_I64;
+ isSigned = true;
+ break;
+ default: return false;
+ }
+
+ unsigned LHS = getRegForPromotedValue(ICmp->getOperand(0), isSigned);
+ if (LHS == 0)
+ return false;
+
+ unsigned RHS = getRegForPromotedValue(ICmp->getOperand(1), isSigned);
+ if (RHS == 0)
+ return false;
+
+ unsigned ResultReg = createResultReg(&WebAssembly::I32RegClass);
+ BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(Opc), ResultReg)
+ .addReg(LHS)
+ .addReg(RHS);
+ updateValueMap(ICmp, ResultReg);
+ return true;
+}
+
+bool WebAssemblyFastISel::selectFCmp(const Instruction *I) {
+ const FCmpInst *FCmp = cast<FCmpInst>(I);
+
+ unsigned LHS = getRegForValue(FCmp->getOperand(0));
+ if (LHS == 0)
+ return false;
+
+ unsigned RHS = getRegForValue(FCmp->getOperand(1));
+ if (RHS == 0)
+ return false;
+
+ bool F32 = getSimpleType(FCmp->getOperand(0)->getType()) != MVT::f64;
+ unsigned Opc;
+ bool Not = false;
+ switch (FCmp->getPredicate()) {
+ case FCmpInst::FCMP_OEQ:
+ Opc = F32 ? WebAssembly::EQ_F32 : WebAssembly::EQ_F64;
+ break;
+ case FCmpInst::FCMP_UNE:
+ Opc = F32 ? WebAssembly::NE_F32 : WebAssembly::NE_F64;
+ break;
+ case FCmpInst::FCMP_OGT:
+ Opc = F32 ? WebAssembly::GT_F32 : WebAssembly::GT_F64;
+ break;
+ case FCmpInst::FCMP_OGE:
+ Opc = F32 ? WebAssembly::GE_F32 : WebAssembly::GE_F64;
+ break;
+ case FCmpInst::FCMP_OLT:
+ Opc = F32 ? WebAssembly::LT_F32 : WebAssembly::LT_F64;
+ break;
+ case FCmpInst::FCMP_OLE:
+ Opc = F32 ? WebAssembly::LE_F32 : WebAssembly::LE_F64;
+ break;
+ case FCmpInst::FCMP_UGT:
+ Opc = F32 ? WebAssembly::LE_F32 : WebAssembly::LE_F64;
+ Not = true;
+ break;
+ case FCmpInst::FCMP_UGE:
+ Opc = F32 ? WebAssembly::LT_F32 : WebAssembly::LT_F64;
+ Not = true;
+ break;
+ case FCmpInst::FCMP_ULT:
+ Opc = F32 ? WebAssembly::GE_F32 : WebAssembly::GE_F64;
+ Not = true;
+ break;
+ case FCmpInst::FCMP_ULE:
+ Opc = F32 ? WebAssembly::GT_F32 : WebAssembly::GT_F64;
+ Not = true;
+ break;
+ default:
+ return false;
+ }
+
+ unsigned ResultReg = createResultReg(&WebAssembly::I32RegClass);
+ BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(Opc), ResultReg)
+ .addReg(LHS)
+ .addReg(RHS);
+
+ if (Not)
+ ResultReg = notValue(ResultReg);
+
+ updateValueMap(FCmp, ResultReg);
+ return true;
+}
+
+bool WebAssemblyFastISel::selectBitCast(const Instruction *I) {
+ // Target-independent code can handle this, except it doesn't set the dead
+ // flag on the ARGUMENTS clobber, so we have to do that manually in order
+ // to satisfy code that expects this of isBitcast() instructions.
+ EVT VT = TLI.getValueType(DL, I->getOperand(0)->getType());
+ EVT RetVT = TLI.getValueType(DL, I->getType());
+ if (!VT.isSimple() || !RetVT.isSimple())
+ return false;
+
+ if (VT == RetVT) {
+ // No-op bitcast.
+ updateValueMap(I, getRegForValue(I->getOperand(0)));
+ return true;
+ }
+
+ unsigned Reg = fastEmit_ISD_BITCAST_r(VT.getSimpleVT(), RetVT.getSimpleVT(),
+ getRegForValue(I->getOperand(0)),
+ I->getOperand(0)->hasOneUse());
+ if (!Reg)
+ return false;
+ MachineBasicBlock::iterator Iter = FuncInfo.InsertPt;
+ --Iter;
+ assert(Iter->isBitcast());
+ Iter->setPhysRegsDeadExcept(ArrayRef<unsigned>(), TRI);
+ updateValueMap(I, Reg);
+ return true;
+}
+
+bool WebAssemblyFastISel::selectLoad(const Instruction *I) {
+ const LoadInst *Load = cast<LoadInst>(I);
+ if (Load->isAtomic())
+ return false;
+
+ Address Addr;
+ if (!computeAddress(Load->getPointerOperand(), Addr))
+ return false;
+
+ // TODO: Fold a following sign-/zero-extend into the load instruction.
+
+ unsigned Opc;
+ const TargetRegisterClass *RC;
+ switch (getSimpleType(Load->getType())) {
+ case MVT::i1:
+ case MVT::i8:
+ Opc = WebAssembly::LOAD8_U_I32;
+ RC = &WebAssembly::I32RegClass;
+ break;
+ case MVT::i16:
+ Opc = WebAssembly::LOAD16_U_I32;
+ RC = &WebAssembly::I32RegClass;
+ break;
+ case MVT::i32:
+ Opc = WebAssembly::LOAD_I32;
+ RC = &WebAssembly::I32RegClass;
+ break;
+ case MVT::i64:
+ Opc = WebAssembly::LOAD_I64;
+ RC = &WebAssembly::I64RegClass;
+ break;
+ case MVT::f32:
+ Opc = WebAssembly::LOAD_F32;
+ RC = &WebAssembly::F32RegClass;
+ break;
+ case MVT::f64:
+ Opc = WebAssembly::LOAD_F64;
+ RC = &WebAssembly::F64RegClass;
+ break;
+ default:
+ return false;
+ }
+
+ materializeLoadStoreOperands(Addr);
+
+ unsigned ResultReg = createResultReg(RC);
+ auto MIB = BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(Opc),
+ ResultReg);
+
+ addLoadStoreOperands(Addr, MIB, createMachineMemOperandFor(Load));
+
+ updateValueMap(Load, ResultReg);
+ return true;
+}
+
+bool WebAssemblyFastISel::selectStore(const Instruction *I) {
+ const StoreInst *Store = cast<StoreInst>(I);
+ if (Store->isAtomic())
+ return false;
+
+ Address Addr;
+ if (!computeAddress(Store->getPointerOperand(), Addr))
+ return false;
+
+ unsigned Opc;
+ const TargetRegisterClass *RC;
+ bool VTIsi1 = false;
+ switch (getSimpleType(Store->getValueOperand()->getType())) {
+ case MVT::i1:
+ VTIsi1 = true;
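+ // Fall through: an i1 value is stored with an i8 store after being masked
+ // down to a single bit below.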
+ case MVT::i8:
+ Opc = WebAssembly::STORE8_I32;
+ RC = &WebAssembly::I32RegClass;
+ break;
+ case MVT::i16:
+ Opc = WebAssembly::STORE16_I32;
+ RC = &WebAssembly::I32RegClass;
+ break;
+ case MVT::i32:
+ Opc = WebAssembly::STORE_I32;
+ RC = &WebAssembly::I32RegClass;
+ break;
+ case MVT::i64:
+ Opc = WebAssembly::STORE_I64;
+ RC = &WebAssembly::I64RegClass;
+ break;
+ case MVT::f32:
+ Opc = WebAssembly::STORE_F32;
+ RC = &WebAssembly::F32RegClass;
+ break;
+ case MVT::f64:
+ Opc = WebAssembly::STORE_F64;
+ RC = &WebAssembly::F64RegClass;
+ break;
+ default: return false;
+ }
+
+ materializeLoadStoreOperands(Addr);
+
+ unsigned ValueReg = getRegForValue(Store->getValueOperand());
+ if (VTIsi1)
+ ValueReg = maskI1Value(ValueReg, Store->getValueOperand());
+
+ unsigned ResultReg = createResultReg(RC);
+ auto MIB = BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(Opc),
+ ResultReg);
+
+ addLoadStoreOperands(Addr, MIB, createMachineMemOperandFor(Store));
+
+ MIB.addReg(ValueReg);
+ return true;
+}
+
+bool WebAssemblyFastISel::selectBr(const Instruction *I) {
+ const BranchInst *Br = cast<BranchInst>(I);
+ if (Br->isUnconditional()) {
+ MachineBasicBlock *MSucc = FuncInfo.MBBMap[Br->getSuccessor(0)];
+ fastEmitBranch(MSucc, Br->getDebugLoc());
+ return true;
+ }
+
+ MachineBasicBlock *TBB = FuncInfo.MBBMap[Br->getSuccessor(0)];
+ MachineBasicBlock *FBB = FuncInfo.MBBMap[Br->getSuccessor(1)];
+
+ bool Not;
+ unsigned CondReg = getRegForI1Value(Br->getCondition(), Not);
+
+ unsigned Opc = WebAssembly::BR_IF;
+ if (Not)
+ Opc = WebAssembly::BR_UNLESS;
+
+ BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(Opc))
+ .addMBB(TBB)
+ .addReg(CondReg);
+
+ finishCondBranch(Br->getParent(), TBB, FBB);
+ return true;
+}
+
+bool WebAssemblyFastISel::selectRet(const Instruction *I) {
+ if (!FuncInfo.CanLowerReturn)
+ return false;
+
+ const ReturnInst *Ret = cast<ReturnInst>(I);
+
+ if (Ret->getNumOperands() == 0) {
+ BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
+ TII.get(WebAssembly::RETURN_VOID));
+ return true;
+ }
+
+ Value *RV = Ret->getOperand(0);
+ unsigned Opc;
+ switch (getSimpleType(RV->getType())) {
+ case MVT::i1: case MVT::i8:
+ case MVT::i16: case MVT::i32:
+ Opc = WebAssembly::RETURN_I32;
+ break;
+ case MVT::i64:
+ Opc = WebAssembly::RETURN_I64;
+ break;
+ case MVT::f32: Opc = WebAssembly::RETURN_F32; break;
+ case MVT::f64: Opc = WebAssembly::RETURN_F64; break;
+ default: return false;
+ }
+
+ unsigned Reg;
+ if (FuncInfo.Fn->getAttributes().hasAttribute(0, Attribute::SExt))
+ Reg = getRegForSignedValue(RV);
+ else if (FuncInfo.Fn->getAttributes().hasAttribute(0, Attribute::ZExt))
+ Reg = getRegForUnsignedValue(RV);
+ else
+ Reg = getRegForValue(RV);
+
+ BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(Opc)).addReg(Reg);
+ return true;
+}
+
+bool WebAssemblyFastISel::selectUnreachable(const Instruction *I) {
+ BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
+ TII.get(WebAssembly::UNREACHABLE));
+ return true;
+}
+
bool WebAssemblyFastISel::fastSelectInstruction(const Instruction *I) {
switch (I->getOpcode()) {
- default:
+ case Instruction::Call:
+ if (selectCall(I))
+ return true;
break;
- // TODO: add fast-isel selection cases here...
+ case Instruction::Select: return selectSelect(I);
+ case Instruction::Trunc: return selectTrunc(I);
+ case Instruction::ZExt: return selectZExt(I);
+ case Instruction::SExt: return selectSExt(I);
+ case Instruction::ICmp: return selectICmp(I);
+ case Instruction::FCmp: return selectFCmp(I);
+ case Instruction::BitCast: return selectBitCast(I);
+ case Instruction::Load: return selectLoad(I);
+ case Instruction::Store: return selectStore(I);
+ case Instruction::Br: return selectBr(I);
+ case Instruction::Ret: return selectRet(I);
+ case Instruction::Unreachable: return selectUnreachable(I);
+ default: break;
}
// Fall back to target-independent instruction selection.
diff --git a/lib/Target/WebAssembly/WebAssemblyFixIrreducibleControlFlow.cpp b/lib/Target/WebAssembly/WebAssemblyFixIrreducibleControlFlow.cpp
new file mode 100644
index 000000000000..5dc90920e310
--- /dev/null
+++ b/lib/Target/WebAssembly/WebAssemblyFixIrreducibleControlFlow.cpp
@@ -0,0 +1,296 @@
+//=- WebAssemblyFixIrreducibleControlFlow.cpp - Fix irreducible control flow -//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// \brief This file implements a pass that transforms irreducible control flow
+/// into reducible control flow. Irreducible control flow means multiple-entry
+/// loops; they appear as CFG cycles that are not recorded in MachineLoopInfo
+/// due to being unnatural.
+///
+/// Note that LLVM has a generic pass that lowers irreducible control flow, but
+/// it linearizes control flow, turning diamonds into two triangles, which is
+/// both unnecessary and undesirable for WebAssembly.
+///
+/// TODO: The transformation implemented here handles all irreducible control
+/// flow, without exponential code-size expansion, though it does so by creating
+/// inefficient code in many cases. Ideally, we should add other
+/// transformations, including code-duplicating ones that can be more
+/// efficient in common cases, and fall back to this conservative
+/// implementation only as needed.
+///
+//===----------------------------------------------------------------------===//
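+
+// Illustrative sketch only (not part of the pass): given an irreducible
+// region where blocks A and B form a cycle with two entries,
+//
+//   Entry1 -> A      Entry1 -> [idx=0] -> Dispatch
+//   Entry2 -> B  =>  Entry2 -> [idx=1] -> Dispatch
+//   A <-> B          Dispatch: br_table idx -> {A, B}
+//                    A -> [idx=1] -> Dispatch, B -> [idx=0] -> Dispatch
+//
+// every edge into the problematic set is routed through a new Dispatch block
+// holding a br_table, so Dispatch becomes the unique loop header and the
+// resulting CFG is reducible.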
+
+#include "WebAssembly.h"
+#include "MCTargetDesc/WebAssemblyMCTargetDesc.h"
+#include "WebAssemblyMachineFunctionInfo.h"
+#include "WebAssemblySubtarget.h"
+#include "llvm/ADT/PriorityQueue.h"
+#include "llvm/ADT/SCCIterator.h"
+#include "llvm/ADT/SetVector.h"
+#include "llvm/CodeGen/MachineDominators.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineLoopInfo.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/Passes.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/raw_ostream.h"
+using namespace llvm;
+
+#define DEBUG_TYPE "wasm-fix-irreducible-control-flow"
+
+namespace {
+class WebAssemblyFixIrreducibleControlFlow final : public MachineFunctionPass {
+ const char *getPassName() const override {
+ return "WebAssembly Fix Irreducible Control Flow";
+ }
+
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
+ AU.setPreservesCFG();
+ AU.addRequired<MachineDominatorTree>();
+ AU.addPreserved<MachineDominatorTree>();
+ AU.addRequired<MachineLoopInfo>();
+ AU.addPreserved<MachineLoopInfo>();
+ MachineFunctionPass::getAnalysisUsage(AU);
+ }
+
+ bool runOnMachineFunction(MachineFunction &MF) override;
+
+ bool VisitLoop(MachineFunction &MF, MachineLoopInfo &MLI, MachineLoop *Loop);
+
+public:
+ static char ID; // Pass identification, replacement for typeid
+ WebAssemblyFixIrreducibleControlFlow() : MachineFunctionPass(ID) {}
+};
+} // end anonymous namespace
+
+char WebAssemblyFixIrreducibleControlFlow::ID = 0;
+FunctionPass *llvm::createWebAssemblyFixIrreducibleControlFlow() {
+ return new WebAssemblyFixIrreducibleControlFlow();
+}
+
+namespace {
+
+/// A utility for walking the blocks of a loop, handling a nested inner
+/// loop as a monolithic conceptual block.
+class MetaBlock {
+ MachineBasicBlock *Block;
+ SmallVector<MachineBasicBlock *, 2> Preds;
+ SmallVector<MachineBasicBlock *, 2> Succs;
+
+public:
+ explicit MetaBlock(MachineBasicBlock *MBB)
+ : Block(MBB), Preds(MBB->pred_begin(), MBB->pred_end()),
+ Succs(MBB->succ_begin(), MBB->succ_end()) {}
+
+ explicit MetaBlock(MachineLoop *Loop) : Block(Loop->getHeader()) {
+ Loop->getExitBlocks(Succs);
+ for (MachineBasicBlock *Pred : Block->predecessors())
+ if (!Loop->contains(Pred))
+ Preds.push_back(Pred);
+ }
+
+ MachineBasicBlock *getBlock() const { return Block; }
+
+ const SmallVectorImpl<MachineBasicBlock *> &predecessors() const {
+ return Preds;
+ }
+ const SmallVectorImpl<MachineBasicBlock *> &successors() const {
+ return Succs;
+ }
+
+ bool operator==(const MetaBlock &MBB) { return Block == MBB.Block; }
+ bool operator!=(const MetaBlock &MBB) { return Block != MBB.Block; }
+};
+
+class SuccessorList final : public MetaBlock {
+ size_t Index;
+ size_t Num;
+
+public:
+ explicit SuccessorList(MachineBasicBlock *MBB)
+ : MetaBlock(MBB), Index(0), Num(successors().size()) {}
+
+ explicit SuccessorList(MachineLoop *Loop)
+ : MetaBlock(Loop), Index(0), Num(successors().size()) {}
+
+ bool HasNext() const { return Index != Num; }
+
+ MachineBasicBlock *Next() {
+ assert(HasNext());
+ return successors()[Index++];
+ }
+};
+
+} // end anonymous namespace
+
+bool WebAssemblyFixIrreducibleControlFlow::VisitLoop(MachineFunction &MF,
+ MachineLoopInfo &MLI,
+ MachineLoop *Loop) {
+ MachineBasicBlock *Header = Loop ? Loop->getHeader() : &*MF.begin();
+ SetVector<MachineBasicBlock *> RewriteSuccs;
+
+ // DFS through Loop's body, looking for irreducible control flow. Loop itself
+ // is natural, we stay within its body, and we treat any nested loops
+ // monolithically, so any cycle we encounter indicates irreducibility.
+ SmallPtrSet<MachineBasicBlock *, 8> OnStack;
+ SmallPtrSet<MachineBasicBlock *, 8> Visited;
+ SmallVector<SuccessorList, 4> LoopWorklist;
+ LoopWorklist.push_back(SuccessorList(Header));
+ OnStack.insert(Header);
+ Visited.insert(Header);
+ while (!LoopWorklist.empty()) {
+ SuccessorList &Top = LoopWorklist.back();
+ if (Top.HasNext()) {
+ MachineBasicBlock *Next = Top.Next();
+ if (Next == Header || (Loop && !Loop->contains(Next)))
+ continue;
+ if (LLVM_LIKELY(OnStack.insert(Next).second)) {
+ if (!Visited.insert(Next).second) {
+ OnStack.erase(Next);
+ continue;
+ }
+ MachineLoop *InnerLoop = MLI.getLoopFor(Next);
+ if (InnerLoop != Loop)
+ LoopWorklist.push_back(SuccessorList(InnerLoop));
+ else
+ LoopWorklist.push_back(SuccessorList(Next));
+ } else {
+ RewriteSuccs.insert(Top.getBlock());
+ }
+ continue;
+ }
+ OnStack.erase(Top.getBlock());
+ LoopWorklist.pop_back();
+ }
+
+ // Most likely, we didn't find any irreducible control flow.
+ if (LLVM_LIKELY(RewriteSuccs.empty()))
+ return false;
+
+ DEBUG(dbgs() << "Irreducible control flow detected!\n");
+
+ // Ok. We have irreducible control flow! Create a dispatch block that will
+ // contain a jump table to any block in the problematic set of blocks.
+ MachineBasicBlock *Dispatch = MF.CreateMachineBasicBlock();
+ MF.insert(MF.end(), Dispatch);
+ MLI.changeLoopFor(Dispatch, Loop);
+
+ // Add the jump table.
+ const auto &TII = *MF.getSubtarget<WebAssemblySubtarget>().getInstrInfo();
+ MachineInstrBuilder MIB = BuildMI(*Dispatch, Dispatch->end(), DebugLoc(),
+ TII.get(WebAssembly::BR_TABLE_I32));
+
+ // Add the register which will be used to tell the jump table which block to
+ // jump to.
+ MachineRegisterInfo &MRI = MF.getRegInfo();
+ unsigned Reg = MRI.createVirtualRegister(&WebAssembly::I32RegClass);
+ MIB.addReg(Reg);
+
+ // Collect all the blocks which need to have their successors rewritten,
+ // add the successors to the jump table, and remember their index.
+ DenseMap<MachineBasicBlock *, unsigned> Indices;
+ SmallVector<MachineBasicBlock *, 4> SuccWorklist(RewriteSuccs.begin(),
+ RewriteSuccs.end());
+ while (!SuccWorklist.empty()) {
+ MachineBasicBlock *MBB = SuccWorklist.pop_back_val();
+ auto Pair = Indices.insert(std::make_pair(MBB, 0));
+ if (!Pair.second)
+ continue;
+
+ unsigned Index = MIB.getInstr()->getNumExplicitOperands() - 1;
+ DEBUG(dbgs() << "MBB#" << MBB->getNumber() << " has index " << Index
+ << "\n");
+
+ Pair.first->second = Index;
+ for (auto Pred : MBB->predecessors())
+ RewriteSuccs.insert(Pred);
+
+ MIB.addMBB(MBB);
+ Dispatch->addSuccessor(MBB);
+
+ MetaBlock Meta(MBB);
+ for (auto *Succ : Meta.successors())
+ if (Succ != Header && (!Loop || Loop->contains(Succ)))
+ SuccWorklist.push_back(Succ);
+ }
+
+ // Rewrite the problematic successors for every block in RewriteSuccs.
+ // For simplicity, we just introduce a new block for every edge we need to
+ // rewrite. Fancier things are possible.
+ for (MachineBasicBlock *MBB : RewriteSuccs) {
+ DenseMap<MachineBasicBlock *, MachineBasicBlock *> Map;
+ for (auto *Succ : MBB->successors()) {
+ if (!Indices.count(Succ))
+ continue;
+
+ MachineBasicBlock *Split = MF.CreateMachineBasicBlock();
+ MF.insert(MBB->isLayoutSuccessor(Succ) ? MachineFunction::iterator(Succ)
+ : MF.end(),
+ Split);
+ MLI.changeLoopFor(Split, Loop);
+
+ // Set the jump table's register to the index of the block we wish to
+ // jump to, and branch to the jump table.
+ BuildMI(*Split, Split->end(), DebugLoc(), TII.get(WebAssembly::CONST_I32),
+ Reg)
+ .addImm(Indices[Succ]);
+ BuildMI(*Split, Split->end(), DebugLoc(), TII.get(WebAssembly::BR))
+ .addMBB(Dispatch);
+ Split->addSuccessor(Dispatch);
+ Map[Succ] = Split;
+ }
+ // Remap the terminator operands and the successor list.
+ for (MachineInstr &Term : MBB->terminators())
+ for (auto &Op : Term.explicit_uses())
+ if (Op.isMBB() && Indices.count(Op.getMBB()))
+ Op.setMBB(Map[Op.getMBB()]);
+ for (auto Rewrite : Map)
+ MBB->replaceSuccessor(Rewrite.first, Rewrite.second);
+ }
+
+ // Create a fake default label, because br_table requires one.
+ MIB.addMBB(MIB.getInstr()
+ ->getOperand(MIB.getInstr()->getNumExplicitOperands() - 1)
+ .getMBB());
+
+ return true;
+}
+
+bool WebAssemblyFixIrreducibleControlFlow::runOnMachineFunction(
+ MachineFunction &MF) {
+ DEBUG(dbgs() << "********** Fixing Irreducible Control Flow **********\n"
+ "********** Function: "
+ << MF.getName() << '\n');
+
+ bool Changed = false;
+ auto &MLI = getAnalysis<MachineLoopInfo>();
+
+ // Visit the function body, which is identified as a null loop.
+ Changed |= VisitLoop(MF, MLI, nullptr);
+
+ // Visit all the loops.
+ SmallVector<MachineLoop *, 8> Worklist(MLI.begin(), MLI.end());
+ while (!Worklist.empty()) {
+ MachineLoop *CurLoop = Worklist.pop_back_val();
+ Worklist.append(CurLoop->begin(), CurLoop->end());
+ Changed |= VisitLoop(MF, MLI, CurLoop);
+ }
+
+ // If we made any changes, completely recompute everything.
+ if (LLVM_UNLIKELY(Changed)) {
+ DEBUG(dbgs() << "Recomputing dominators and loops.\n");
+ MF.getRegInfo().invalidateLiveness();
+ MF.RenumberBlocks();
+ getAnalysis<MachineDominatorTree>().runOnMachineFunction(MF);
+ MLI.runOnMachineFunction(MF);
+ }
+
+ return Changed;
+}
diff --git a/lib/Target/WebAssembly/WebAssemblyFrameLowering.cpp b/lib/Target/WebAssembly/WebAssemblyFrameLowering.cpp
index 0eefd57f1f2c..0a5782e5c287 100644
--- a/lib/Target/WebAssembly/WebAssemblyFrameLowering.cpp
+++ b/lib/Target/WebAssembly/WebAssemblyFrameLowering.cpp
@@ -34,10 +34,7 @@ using namespace llvm;
#define DEBUG_TYPE "wasm-frame-info"
-// TODO: Implement a red zone?
// TODO: wasm64
-// TODO: Prolog/epilog should be stackified too. This pass runs after register
-// stackification, so we'll have to do it manually.
// TODO: Emit TargetOpcode::CFI_INSTRUCTION instructions
/// Return true if the specified function should have a dedicated frame pointer
@@ -46,7 +43,7 @@ bool WebAssemblyFrameLowering::hasFP(const MachineFunction &MF) const {
const MachineFrameInfo *MFI = MF.getFrameInfo();
const auto *RegInfo =
MF.getSubtarget<WebAssemblySubtarget>().getRegisterInfo();
- return MFI->hasVarSizedObjects() || MFI->isFrameAddressTaken() ||
+ return MFI->isFrameAddressTaken() || MFI->hasVarSizedObjects() ||
MFI->hasStackMap() || MFI->hasPatchPoint() ||
RegInfo->needsStackRealignment(MF);
}
@@ -62,63 +59,64 @@ bool WebAssemblyFrameLowering::hasReservedCallFrame(
}
-/// Adjust the stack pointer by a constant amount.
-static void adjustStackPointer(unsigned StackSize,
- bool AdjustUp,
- MachineFunction& MF,
- MachineBasicBlock& MBB,
- const TargetInstrInfo* TII,
- MachineBasicBlock::iterator InsertPt,
- const DebugLoc& DL) {
- auto &MRI = MF.getRegInfo();
- unsigned SPReg = MRI.createVirtualRegister(&WebAssembly::I32RegClass);
- auto *SPSymbol = MF.createExternalSymbolName("__stack_pointer");
- BuildMI(MBB, InsertPt, DL, TII->get(WebAssembly::CONST_I32), SPReg)
- .addExternalSymbol(SPSymbol);
- // This MachinePointerInfo should reference __stack_pointer as well but
- // doesn't because MachinePointerInfo() takes a GV which we don't have for
- // __stack_pointer. TODO: check if PseudoSourceValue::ExternalSymbolCallEntry
- // is appropriate instead. (likewise for EmitEpologue below)
- auto *LoadMMO = new MachineMemOperand(MachinePointerInfo(),
- MachineMemOperand::MOLoad, 4, 4);
- BuildMI(MBB, InsertPt, DL, TII->get(WebAssembly::LOAD_I32), SPReg)
- .addImm(0)
- .addReg(SPReg)
- .addMemOperand(LoadMMO);
- // Add/Subtract the frame size
- unsigned OffsetReg = MRI.createVirtualRegister(&WebAssembly::I32RegClass);
- BuildMI(MBB, InsertPt, DL, TII->get(WebAssembly::CONST_I32), OffsetReg)
- .addImm(StackSize);
- BuildMI(MBB, InsertPt, DL,
- TII->get(AdjustUp ? WebAssembly::ADD_I32 : WebAssembly::SUB_I32),
- WebAssembly::SP32)
- .addReg(SPReg)
- .addReg(OffsetReg);
- // The SP32 register now has the new stacktop. Also write it back to memory.
- BuildMI(MBB, InsertPt, DL, TII->get(WebAssembly::CONST_I32), OffsetReg)
- .addExternalSymbol(SPSymbol);
- auto *MMO = new MachineMemOperand(MachinePointerInfo(),
+/// Returns true if this function needs a local user-space stack pointer.
+/// Unlike a machine stack pointer, the wasm user stack pointer is a global
+/// variable, so it is loaded into a register in the prolog.
+bool WebAssemblyFrameLowering::needsSP(const MachineFunction &MF,
+ const MachineFrameInfo &MFI) const {
+ return MFI.getStackSize() || MFI.adjustsStack() || hasFP(MF);
+}
+
+/// Returns true if the local user-space stack pointer needs to be written back
+/// to memory by this function (this is not meaningful if needsSP is false). If
+/// false, the stack red zone can be used and only a local SP is needed.
+bool WebAssemblyFrameLowering::needsSPWriteback(
+ const MachineFunction &MF, const MachineFrameInfo &MFI) const {
+ assert(needsSP(MF, MFI));
+ return MFI.getStackSize() > RedZoneSize || MFI.hasCalls() ||
+ MF.getFunction()->hasFnAttribute(Attribute::NoRedZone);
+}
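+
+// For illustration (a sketch of the intended behavior, not checked here): a
+// leaf function with a 64-byte frame and no FP needs a local SP
+// (needsSP() is true), but since 64 <= RedZoneSize and it makes no calls,
+// needsSPWriteback() is false and __stack_pointer is never stored back;
+// the frame simply lives in the red zone below the cached SP value.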
+
+static void writeSPToMemory(unsigned SrcReg, MachineFunction &MF,
+ MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator &InsertAddr,
+ MachineBasicBlock::iterator &InsertStore,
+ const DebugLoc &DL) {
+ const char *ES = "__stack_pointer";
+ auto *SPSymbol = MF.createExternalSymbolName(ES);
+ MachineRegisterInfo &MRI = MF.getRegInfo();
+ const TargetRegisterClass *PtrRC =
+ MRI.getTargetRegisterInfo()->getPointerRegClass(MF);
+ unsigned Zero = MRI.createVirtualRegister(PtrRC);
+ unsigned Drop = MRI.createVirtualRegister(PtrRC);
+ const auto *TII = MF.getSubtarget<WebAssemblySubtarget>().getInstrInfo();
+
+ BuildMI(MBB, InsertAddr, DL, TII->get(WebAssembly::CONST_I32), Zero)
+ .addImm(0);
+ auto *MMO = new MachineMemOperand(MachinePointerInfo(MF.getPSVManager()
+ .getExternalSymbolCallEntry(ES)),
MachineMemOperand::MOStore, 4, 4);
- BuildMI(MBB, InsertPt, DL, TII->get(WebAssembly::STORE_I32), WebAssembly::SP32)
- .addImm(0)
- .addReg(OffsetReg)
- .addReg(WebAssembly::SP32)
+ BuildMI(MBB, InsertStore, DL, TII->get(WebAssembly::STORE_I32), Drop)
+ .addExternalSymbol(SPSymbol)
+ .addReg(Zero)
+ .addImm(2) // p2align
+ .addReg(SrcReg)
.addMemOperand(MMO);
}
-void WebAssemblyFrameLowering::eliminateCallFramePseudoInstr(
+MachineBasicBlock::iterator
+WebAssemblyFrameLowering::eliminateCallFramePseudoInstr(
MachineFunction &MF, MachineBasicBlock &MBB,
MachineBasicBlock::iterator I) const {
- const auto *TII =
- static_cast<const WebAssemblyInstrInfo*>(MF.getSubtarget().getInstrInfo());
- DebugLoc DL = I->getDebugLoc();
- unsigned Opc = I->getOpcode();
- bool IsDestroy = Opc == TII->getCallFrameDestroyOpcode();
- unsigned Amount = I->getOperand(0).getImm();
- if (Amount)
- adjustStackPointer(Amount, IsDestroy, MF, MBB,
- TII, I, DL);
- MBB.erase(I);
+ assert(!I->getOperand(0).getImm() && hasFP(MF) &&
+ "Call frame pseudos should only be used for dynamic stack adjustment");
+ const auto *TII = MF.getSubtarget<WebAssemblySubtarget>().getInstrInfo();
+ if (I->getOpcode() == TII->getCallFrameDestroyOpcode() &&
+ needsSPWriteback(MF, *MF.getFrameInfo())) {
+ DebugLoc DL = I->getDebugLoc();
+ writeSPToMemory(WebAssembly::SP32, MF, MBB, I, I, DL);
+ }
+ return MBB.erase(I);
}
void WebAssemblyFrameLowering::emitPrologue(MachineFunction &MF,
@@ -127,49 +125,91 @@ void WebAssemblyFrameLowering::emitPrologue(MachineFunction &MF,
auto *MFI = MF.getFrameInfo();
assert(MFI->getCalleeSavedInfo().empty() &&
"WebAssembly should not have callee-saved registers");
- assert(!hasFP(MF) && "Functions needing frame pointers not yet supported");
+
+ if (!needsSP(MF, *MFI)) return;
uint64_t StackSize = MFI->getStackSize();
- if (!StackSize && (!MFI->adjustsStack() || MFI->getMaxCallFrameSize() == 0))
- return;
- const auto *TII = MF.getSubtarget().getInstrInfo();
+ const auto *TII = MF.getSubtarget<WebAssemblySubtarget>().getInstrInfo();
+ auto &MRI = MF.getRegInfo();
auto InsertPt = MBB.begin();
DebugLoc DL;
- adjustStackPointer(StackSize, false, MF, MBB, TII, InsertPt, DL);
+ const TargetRegisterClass *PtrRC =
+ MRI.getTargetRegisterInfo()->getPointerRegClass(MF);
+ unsigned Zero = MRI.createVirtualRegister(PtrRC);
+ unsigned SPReg = MRI.createVirtualRegister(PtrRC);
+ const char *ES = "__stack_pointer";
+ auto *SPSymbol = MF.createExternalSymbolName(ES);
+ BuildMI(MBB, InsertPt, DL, TII->get(WebAssembly::CONST_I32), Zero)
+ .addImm(0);
+ auto *LoadMMO = new MachineMemOperand(MachinePointerInfo(MF.getPSVManager()
+ .getExternalSymbolCallEntry(ES)),
+ MachineMemOperand::MOLoad, 4, 4);
+ // Load the SP value.
+ BuildMI(MBB, InsertPt, DL, TII->get(WebAssembly::LOAD_I32),
+ StackSize ? SPReg : (unsigned)WebAssembly::SP32)
+ .addExternalSymbol(SPSymbol)
+ .addReg(Zero) // addr
+ .addImm(2) // p2align
+ .addMemOperand(LoadMMO);
+
+ if (StackSize) {
+ // Subtract the frame size
+ unsigned OffsetReg = MRI.createVirtualRegister(PtrRC);
+ BuildMI(MBB, InsertPt, DL, TII->get(WebAssembly::CONST_I32), OffsetReg)
+ .addImm(StackSize);
+ BuildMI(MBB, InsertPt, DL, TII->get(WebAssembly::SUB_I32),
+ WebAssembly::SP32)
+ .addReg(SPReg)
+ .addReg(OffsetReg);
+ }
+ if (hasFP(MF)) {
+ // Unlike most conventional targets (where FP points to the saved FP),
+ // FP points to the bottom of the fixed-size locals, so we can use positive
+ // offsets in load/store instructions.
+ BuildMI(MBB, InsertPt, DL, TII->get(WebAssembly::COPY),
+ WebAssembly::FP32)
+ .addReg(WebAssembly::SP32);
+ }
+ if (StackSize && needsSPWriteback(MF, *MFI)) {
+ writeSPToMemory(WebAssembly::SP32, MF, MBB, InsertPt, InsertPt, DL);
+ }
}
void WebAssemblyFrameLowering::emitEpilogue(MachineFunction &MF,
MachineBasicBlock &MBB) const {
- uint64_t StackSize = MF.getFrameInfo()->getStackSize();
- if (!StackSize)
- return;
- const auto *TII = MF.getSubtarget().getInstrInfo();
+ auto *MFI = MF.getFrameInfo();
+ uint64_t StackSize = MFI->getStackSize();
+ if (!needsSP(MF, *MFI) || !needsSPWriteback(MF, *MFI)) return;
+ const auto *TII = MF.getSubtarget<WebAssemblySubtarget>().getInstrInfo();
auto &MRI = MF.getRegInfo();
- unsigned OffsetReg = MRI.createVirtualRegister(&WebAssembly::I32RegClass);
auto InsertPt = MBB.getFirstTerminator();
DebugLoc DL;
- if (InsertPt != MBB.end()) {
+ if (InsertPt != MBB.end())
DL = InsertPt->getDebugLoc();
+
+ // Restore the stack pointer. If we had fixed-size locals, add the offset
+ // subtracted in the prolog.
+ unsigned SPReg = 0;
+ MachineBasicBlock::iterator InsertAddr = InsertPt;
+ if (StackSize) {
+ const TargetRegisterClass *PtrRC =
+ MRI.getTargetRegisterInfo()->getPointerRegClass(MF);
+ unsigned OffsetReg = MRI.createVirtualRegister(PtrRC);
+ InsertAddr =
+ BuildMI(MBB, InsertPt, DL, TII->get(WebAssembly::CONST_I32), OffsetReg)
+ .addImm(StackSize);
+ // In the epilog we don't need to write the result back to the SP32 physreg
+ // because it won't be used again. We can use a stackified register instead.
+ SPReg = MRI.createVirtualRegister(PtrRC);
+ BuildMI(MBB, InsertPt, DL, TII->get(WebAssembly::ADD_I32), SPReg)
+ .addReg(hasFP(MF) ? WebAssembly::FP32 : WebAssembly::SP32)
+ .addReg(OffsetReg);
+ } else {
+ SPReg = hasFP(MF) ? WebAssembly::FP32 : WebAssembly::SP32;
}
- // Restore the stack pointer. Without FP its value is just SP32 - stacksize
- BuildMI(MBB, InsertPt, DL, TII->get(WebAssembly::CONST_I32), OffsetReg)
- .addImm(StackSize);
- auto *SPSymbol = MF.createExternalSymbolName("__stack_pointer");
- BuildMI(MBB, InsertPt, DL, TII->get(WebAssembly::ADD_I32), WebAssembly::SP32)
- .addReg(WebAssembly::SP32)
- .addReg(OffsetReg);
- // Re-use OffsetReg to hold the address of the stacktop
- BuildMI(MBB, InsertPt, DL, TII->get(WebAssembly::CONST_I32), OffsetReg)
- .addExternalSymbol(SPSymbol);
- auto *MMO = new MachineMemOperand(MachinePointerInfo(),
- MachineMemOperand::MOStore, 4, 4);
- BuildMI(MBB, InsertPt, DL, TII->get(WebAssembly::STORE_I32), WebAssembly::SP32)
- .addImm(0)
- .addReg(OffsetReg)
- .addReg(WebAssembly::SP32)
- .addMemOperand(MMO);
+ writeSPToMemory(SPReg, MF, MBB, InsertAddr, InsertPt, DL);
}
diff --git a/lib/Target/WebAssembly/WebAssemblyFrameLowering.h b/lib/Target/WebAssembly/WebAssemblyFrameLowering.h
index 5f4708fe77ed..e20fc5df7443 100644
--- a/lib/Target/WebAssembly/WebAssemblyFrameLowering.h
+++ b/lib/Target/WebAssembly/WebAssemblyFrameLowering.h
@@ -19,18 +19,24 @@
#include "llvm/Target/TargetFrameLowering.h"
namespace llvm {
+class MachineFrameInfo;
class WebAssemblyFrameLowering final : public TargetFrameLowering {
-public:
+ public:
+ /// Size of the red zone for the user stack (leaf functions can use this much
+ /// space below the stack pointer without writing it back to memory).
+ // TODO: (ABI) Revisit and decide how large it should be.
+ static const size_t RedZoneSize = 128;
+
WebAssemblyFrameLowering()
: TargetFrameLowering(StackGrowsDown, /*StackAlignment=*/16,
/*LocalAreaOffset=*/0,
/*TransientStackAlignment=*/16,
/*StackRealignable=*/true) {}
- void
- eliminateCallFramePseudoInstr(MachineFunction &MF, MachineBasicBlock &MBB,
- MachineBasicBlock::iterator I) const override;
+ MachineBasicBlock::iterator eliminateCallFramePseudoInstr(
+ MachineFunction &MF, MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator I) const override;
/// These methods insert prolog and epilog code into the function.
void emitPrologue(MachineFunction &MF, MachineBasicBlock &MBB) const override;
@@ -38,8 +44,13 @@ public:
bool hasFP(const MachineFunction &MF) const override;
bool hasReservedCallFrame(const MachineFunction &MF) const override;
+
+ private:
+ bool needsSP(const MachineFunction &MF, const MachineFrameInfo &MFI) const;
+ bool needsSPWriteback(const MachineFunction &MF,
+ const MachineFrameInfo &MFI) const;
};
-} // end namespace llvm
+} // end namespace llvm
#endif
diff --git a/lib/Target/WebAssembly/WebAssemblyISD.def b/lib/Target/WebAssembly/WebAssemblyISD.def
index 3a03fa55b220..2f0f106ef5b7 100644
--- a/lib/Target/WebAssembly/WebAssemblyISD.def
+++ b/lib/Target/WebAssembly/WebAssemblyISD.def
@@ -20,6 +20,6 @@ HANDLE_NODETYPE(RETURN)
HANDLE_NODETYPE(ARGUMENT)
HANDLE_NODETYPE(Wrapper)
HANDLE_NODETYPE(BR_IF)
-HANDLE_NODETYPE(TABLESWITCH)
+HANDLE_NODETYPE(BR_TABLE)
// add memory opcodes starting at ISD::FIRST_TARGET_MEMORY_OPCODE here...
diff --git a/lib/Target/WebAssembly/WebAssemblyISelDAGToDAG.cpp b/lib/Target/WebAssembly/WebAssemblyISelDAGToDAG.cpp
index 8390f797c43e..88c38b3602b9 100644
--- a/lib/Target/WebAssembly/WebAssemblyISelDAGToDAG.cpp
+++ b/lib/Target/WebAssembly/WebAssemblyISelDAGToDAG.cpp
@@ -54,7 +54,7 @@ public:
return SelectionDAGISel::runOnMachineFunction(MF);
}
- SDNode *Select(SDNode *Node) override;
+ void Select(SDNode *Node) override;
bool SelectInlineAsmMemoryOperand(const SDValue &Op, unsigned ConstraintID,
std::vector<SDValue> &OutOps) override;
@@ -67,7 +67,7 @@ private:
};
} // end anonymous namespace
-SDNode *WebAssemblyDAGToDAGISel::Select(SDNode *Node) {
+void WebAssemblyDAGToDAGISel::Select(SDNode *Node) {
// Dump information about the Node being selected.
DEBUG(errs() << "Selecting: ");
DEBUG(Node->dump(CurDAG));
@@ -77,11 +77,10 @@ SDNode *WebAssemblyDAGToDAGISel::Select(SDNode *Node) {
if (Node->isMachineOpcode()) {
DEBUG(errs() << "== "; Node->dump(CurDAG); errs() << "\n");
Node->setNodeId(-1);
- return nullptr;
+ return;
}
// Few custom selection stuff.
- SDNode *ResNode = nullptr;
EVT VT = Node->getValueType(0);
switch (Node->getOpcode()) {
@@ -92,16 +91,7 @@ SDNode *WebAssemblyDAGToDAGISel::Select(SDNode *Node) {
}
// Select the default instruction.
- ResNode = SelectCode(Node);
-
- DEBUG(errs() << "=> ");
- if (ResNode == nullptr || ResNode == Node)
- DEBUG(Node->dump(CurDAG));
- else
- DEBUG(ResNode->dump(CurDAG));
- DEBUG(errs() << "\n");
-
- return ResNode;
+ SelectCode(Node);
}
bool WebAssemblyDAGToDAGISel::SelectInlineAsmMemoryOperand(
diff --git a/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp b/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp
index e9933b092988..9e7731997d58 100644
--- a/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp
+++ b/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp
@@ -26,7 +26,6 @@
#include "llvm/IR/DiagnosticPrinter.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/Intrinsics.h"
-#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/raw_ostream.h"
@@ -35,61 +34,6 @@ using namespace llvm;
#define DEBUG_TYPE "wasm-lower"
-namespace {
-// Diagnostic information for unimplemented or unsupported feature reporting.
-// TODO: This code is copied from BPF and AMDGPU; consider factoring it out
-// and sharing code.
-class DiagnosticInfoUnsupported final : public DiagnosticInfo {
-private:
- // Debug location where this diagnostic is triggered.
- DebugLoc DLoc;
- const Twine &Description;
- const Function &Fn;
- SDValue Value;
-
- static int KindID;
-
- static int getKindID() {
- if (KindID == 0)
- KindID = llvm::getNextAvailablePluginDiagnosticKind();
- return KindID;
- }
-
-public:
- DiagnosticInfoUnsupported(SDLoc DLoc, const Function &Fn, const Twine &Desc,
- SDValue Value)
- : DiagnosticInfo(getKindID(), DS_Error), DLoc(DLoc.getDebugLoc()),
- Description(Desc), Fn(Fn), Value(Value) {}
-
- void print(DiagnosticPrinter &DP) const override {
- std::string Str;
- raw_string_ostream OS(Str);
-
- if (DLoc) {
- auto DIL = DLoc.get();
- StringRef Filename = DIL->getFilename();
- unsigned Line = DIL->getLine();
- unsigned Column = DIL->getColumn();
- OS << Filename << ':' << Line << ':' << Column << ' ';
- }
-
- OS << "in function " << Fn.getName() << ' ' << *Fn.getFunctionType() << '\n'
- << Description;
- if (Value)
- Value->print(OS);
- OS << '\n';
- OS.flush();
- DP << Str;
- }
-
- static bool classof(const DiagnosticInfo *DI) {
- return DI->getKind() == getKindID();
- }
-};
-
-int DiagnosticInfoUnsupported::KindID = 0;
-} // end anonymous namespace
-
WebAssemblyTargetLowering::WebAssemblyTargetLowering(
const TargetMachine &TM, const WebAssemblySubtarget &STI)
: TargetLowering(TM), Subtarget(&STI) {
@@ -116,6 +60,8 @@ WebAssemblyTargetLowering::WebAssemblyTargetLowering(
setOperationAction(ISD::GlobalAddress, MVTPtr, Custom);
setOperationAction(ISD::ExternalSymbol, MVTPtr, Custom);
setOperationAction(ISD::JumpTable, MVTPtr, Custom);
+ setOperationAction(ISD::BlockAddress, MVTPtr, Custom);
+ setOperationAction(ISD::BRIND, MVT::Other, Custom);
// Take the default expansion for va_arg, va_copy, and va_end. There is no
// default action for va_start, so we do that custom.
@@ -148,7 +94,7 @@ WebAssemblyTargetLowering::WebAssemblyTargetLowering(
for (auto T : {MVT::i32, MVT::i64}) {
// Expand unavailable integer operations.
for (auto Op :
- {ISD::BSWAP, ISD::ROTL, ISD::ROTR, ISD::SMUL_LOHI, ISD::UMUL_LOHI,
+ {ISD::BSWAP, ISD::SMUL_LOHI, ISD::UMUL_LOHI,
ISD::MULHS, ISD::MULHU, ISD::SDIVREM, ISD::UDIVREM, ISD::SHL_PARTS,
ISD::SRA_PARTS, ISD::SRL_PARTS, ISD::ADDC, ISD::ADDE, ISD::SUBC,
ISD::SUBE}) {
@@ -167,6 +113,7 @@ WebAssemblyTargetLowering::WebAssemblyTargetLowering(
setOperationAction(ISD::DYNAMIC_STACKALLOC, MVTPtr, Expand);
setOperationAction(ISD::FrameIndex, MVT::i32, Custom);
+ setOperationAction(ISD::CopyToReg, MVT::Other, Custom);
// Expand these forms; we pattern-match the forms that we can handle in isel.
for (auto T : {MVT::i32, MVT::i64, MVT::f32, MVT::f64})
@@ -204,13 +151,14 @@ bool WebAssemblyTargetLowering::isOffsetFoldingLegal(
MVT WebAssemblyTargetLowering::getScalarShiftAmountTy(const DataLayout & /*DL*/,
EVT VT) const {
unsigned BitWidth = NextPowerOf2(VT.getSizeInBits() - 1);
- if (BitWidth > 1 && BitWidth < 8)
- BitWidth = 8;
+ if (BitWidth > 1 && BitWidth < 8) BitWidth = 8;
if (BitWidth > 64) {
- BitWidth = 64;
+ // The shift will be lowered to a libcall, and compiler-rt libcalls expect
+ // the count to be an i32.
+ BitWidth = 32;
assert(BitWidth >= Log2_32_Ceil(VT.getSizeInBits()) &&
- "64-bit shift counts ought to be enough for anyone");
+ "32-bit shift counts ought to be enough for anyone");
}
MVT Result = MVT::getIntegerVT(BitWidth);
@@ -219,13 +167,13 @@ MVT WebAssemblyTargetLowering::getScalarShiftAmountTy(const DataLayout & /*DL*/,
return Result;
}
-const char *
-WebAssemblyTargetLowering::getTargetNodeName(unsigned Opcode) const {
+const char *WebAssemblyTargetLowering::getTargetNodeName(
+ unsigned Opcode) const {
switch (static_cast<WebAssemblyISD::NodeType>(Opcode)) {
- case WebAssemblyISD::FIRST_NUMBER:
- break;
-#define HANDLE_NODETYPE(NODE) \
- case WebAssemblyISD::NODE: \
+ case WebAssemblyISD::FIRST_NUMBER:
+ break;
+#define HANDLE_NODETYPE(NODE) \
+ case WebAssemblyISD::NODE: \
return "WebAssemblyISD::" #NODE;
#include "WebAssemblyISD.def"
#undef HANDLE_NODETYPE
@@ -240,17 +188,17 @@ WebAssemblyTargetLowering::getRegForInlineAsmConstraint(
// WebAssembly register class.
if (Constraint.size() == 1) {
switch (Constraint[0]) {
- case 'r':
- assert(VT != MVT::iPTR && "Pointer MVT not expected here");
- if (VT.isInteger() && !VT.isVector()) {
- if (VT.getSizeInBits() <= 32)
- return std::make_pair(0U, &WebAssembly::I32RegClass);
- if (VT.getSizeInBits() <= 64)
- return std::make_pair(0U, &WebAssembly::I64RegClass);
- }
- break;
- default:
- break;
+ case 'r':
+ assert(VT != MVT::iPTR && "Pointer MVT not expected here");
+ if (VT.isInteger() && !VT.isVector()) {
+ if (VT.getSizeInBits() <= 32)
+ return std::make_pair(0U, &WebAssembly::I32RegClass);
+ if (VT.getSizeInBits() <= 64)
+ return std::make_pair(0U, &WebAssembly::I64RegClass);
+ }
+ break;
+ default:
+ break;
}
}
@@ -274,17 +222,33 @@ bool WebAssemblyTargetLowering::isLegalAddressingMode(const DataLayout &DL,
// WebAssembly offsets are added as unsigned without wrapping. The
// isLegalAddressingMode gives us no way to determine if wrapping could be
// happening, so we approximate this by accepting only non-negative offsets.
- if (AM.BaseOffs < 0)
- return false;
+ if (AM.BaseOffs < 0) return false;
// WebAssembly has no scale register operands.
- if (AM.Scale != 0)
- return false;
+ if (AM.Scale != 0) return false;
// Everything else is legal.
return true;
}
+bool WebAssemblyTargetLowering::allowsMisalignedMemoryAccesses(
+ EVT /*VT*/, unsigned /*AddrSpace*/, unsigned /*Align*/, bool *Fast) const {
+ // WebAssembly supports unaligned accesses, though loads and stores which
+ // perform them should declare it with the p2align attribute, and there may
+ // be a performance impact. We tell LLVM they're "fast" because for the
+ // kinds of things LLVM uses this for (merging adjacent stores of
+ // constants, etc.), WebAssembly implementations will either want the
+ // unaligned access or will split it anyway.
+ if (Fast) *Fast = true;
+ return true;
+}
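+
+// For example (illustrative wasm text, assuming a pointer only known to be
+// 1-byte aligned):
+//   i32.load $r=, 0($ptr):p2align=0
+// remains a single load annotated with its alignment rather than being
+// expanded into byte loads as some targets would do.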
+
+bool WebAssemblyTargetLowering::isIntDivCheap(EVT VT, AttributeSet Attr) const {
+ // The current thinking is that wasm engines will perform this optimization,
+ // so we can save on code size.
+ return true;
+}
+
//===----------------------------------------------------------------------===//
// WebAssembly Lowering private implementation.
//===----------------------------------------------------------------------===//
@@ -293,10 +257,10 @@ bool WebAssemblyTargetLowering::isLegalAddressingMode(const DataLayout &DL,
// Lowering Code
//===----------------------------------------------------------------------===//
-static void fail(SDLoc DL, SelectionDAG &DAG, const char *msg) {
+static void fail(const SDLoc &DL, SelectionDAG &DAG, const char *msg) {
MachineFunction &MF = DAG.getMachineFunction();
DAG.getContext()->diagnose(
- DiagnosticInfoUnsupported(DL, *MF.getFunction(), msg, SDValue()));
+ DiagnosticInfoUnsupported(*MF.getFunction(), msg, DL.getDebugLoc()));
}
// Test whether the given calling convention is supported.
@@ -312,14 +276,14 @@ static bool CallingConvSupported(CallingConv::ID CallConv) {
CallConv == CallingConv::CXX_FAST_TLS;
}
-SDValue
-WebAssemblyTargetLowering::LowerCall(CallLoweringInfo &CLI,
- SmallVectorImpl<SDValue> &InVals) const {
+SDValue WebAssemblyTargetLowering::LowerCall(
+ CallLoweringInfo &CLI, SmallVectorImpl<SDValue> &InVals) const {
SelectionDAG &DAG = CLI.DAG;
SDLoc DL = CLI.DL;
SDValue Chain = CLI.Chain;
SDValue Callee = CLI.Callee;
MachineFunction &MF = DAG.getMachineFunction();
+ auto Layout = MF.getDataLayout();
CallingConv::ID CallConv = CLI.CallConv;
if (!CallingConvSupported(CallConv))
@@ -337,16 +301,15 @@ WebAssemblyTargetLowering::LowerCall(CallLoweringInfo &CLI,
fail(DL, DAG, "WebAssembly doesn't support tail call yet");
CLI.IsTailCall = false;
- SmallVectorImpl<SDValue> &OutVals = CLI.OutVals;
-
SmallVectorImpl<ISD::InputArg> &Ins = CLI.Ins;
if (Ins.size() > 1)
fail(DL, DAG, "WebAssembly doesn't support more than 1 returned value yet");
SmallVectorImpl<ISD::OutputArg> &Outs = CLI.Outs;
- for (const ISD::OutputArg &Out : Outs) {
- if (Out.Flags.isByVal())
- fail(DL, DAG, "WebAssembly hasn't implemented byval arguments");
+ SmallVectorImpl<SDValue> &OutVals = CLI.OutVals;
+ for (unsigned i = 0; i < Outs.size(); ++i) {
+ const ISD::OutputArg &Out = Outs[i];
+ SDValue &OutVal = OutVals[i];
if (Out.Flags.isNest())
fail(DL, DAG, "WebAssembly hasn't implemented nest arguments");
if (Out.Flags.isInAlloca())
@@ -355,28 +318,41 @@ WebAssemblyTargetLowering::LowerCall(CallLoweringInfo &CLI,
fail(DL, DAG, "WebAssembly hasn't implemented cons regs arguments");
if (Out.Flags.isInConsecutiveRegsLast())
fail(DL, DAG, "WebAssembly hasn't implemented cons regs last arguments");
+ if (Out.Flags.isByVal() && Out.Flags.getByValSize() != 0) {
+ auto *MFI = MF.getFrameInfo();
+ int FI = MFI->CreateStackObject(Out.Flags.getByValSize(),
+ Out.Flags.getByValAlign(),
+ /*isSS=*/false);
+ SDValue SizeNode =
+ DAG.getConstant(Out.Flags.getByValSize(), DL, MVT::i32);
+ SDValue FINode = DAG.getFrameIndex(FI, getPointerTy(Layout));
+ Chain = DAG.getMemcpy(
+ Chain, DL, FINode, OutVal, SizeNode, Out.Flags.getByValAlign(),
+ /*isVolatile*/ false, /*AlwaysInline=*/false,
+ /*isTailCall*/ false, MachinePointerInfo(), MachinePointerInfo());
+ OutVal = FINode;
+ }
}
bool IsVarArg = CLI.IsVarArg;
unsigned NumFixedArgs = CLI.NumFixedArgs;
- auto PtrVT = getPointerTy(MF.getDataLayout());
+
+ auto PtrVT = getPointerTy(Layout);
// Analyze operands of the call, assigning locations to each operand.
SmallVector<CCValAssign, 16> ArgLocs;
CCState CCInfo(CallConv, IsVarArg, MF, ArgLocs, *DAG.getContext());
if (IsVarArg) {
- // Outgoing non-fixed arguments are placed at the top of the stack. First
- // compute their offsets and the total amount of argument stack space
- // needed.
+ // Outgoing non-fixed arguments are placed in a buffer. First
+ // compute their offsets and the total amount of buffer space needed.
for (SDValue Arg :
make_range(OutVals.begin() + NumFixedArgs, OutVals.end())) {
EVT VT = Arg.getValueType();
assert(VT != MVT::iPTR && "Legalized args should be concrete");
Type *Ty = VT.getTypeForEVT(*DAG.getContext());
- unsigned Offset =
- CCInfo.AllocateStack(MF.getDataLayout().getTypeAllocSize(Ty),
- MF.getDataLayout().getABITypeAlignment(Ty));
+ unsigned Offset = CCInfo.AllocateStack(Layout.getTypeAllocSize(Ty),
+ Layout.getABITypeAlignment(Ty));
CCInfo.addLoc(CCValAssign::getMem(ArgLocs.size(), VT.getSimpleVT(),
Offset, VT.getSimpleVT(),
CCValAssign::Full));
@@ -385,17 +361,13 @@ WebAssemblyTargetLowering::LowerCall(CallLoweringInfo &CLI,
unsigned NumBytes = CCInfo.getAlignedCallFrameSize();
- SDValue NB;
- if (NumBytes) {
- NB = DAG.getConstant(NumBytes, DL, PtrVT, true);
- Chain = DAG.getCALLSEQ_START(Chain, NB, DL);
- }
-
- if (IsVarArg) {
+ SDValue FINode;
+ if (IsVarArg && NumBytes) {
// For non-fixed arguments, next emit stores to store the argument values
- // to the stack at the offsets computed above.
- SDValue SP = DAG.getCopyFromReg(
- Chain, DL, getStackPointerRegisterToSaveRestore(), PtrVT);
+ // to the stack buffer at the offsets computed above.
+ int FI = MF.getFrameInfo()->CreateStackObject(NumBytes,
+ Layout.getStackAlignment(),
+ /*isSS=*/false);
unsigned ValNo = 0;
SmallVector<SDValue, 8> Chains;
for (SDValue Arg :
@@ -403,14 +375,17 @@ WebAssemblyTargetLowering::LowerCall(CallLoweringInfo &CLI,
assert(ArgLocs[ValNo].getValNo() == ValNo &&
"ArgLocs should remain in order and only hold varargs args");
unsigned Offset = ArgLocs[ValNo++].getLocMemOffset();
- SDValue Add = DAG.getNode(ISD::ADD, DL, PtrVT, SP,
+ FINode = DAG.getFrameIndex(FI, getPointerTy(Layout));
+ SDValue Add = DAG.getNode(ISD::ADD, DL, PtrVT, FINode,
DAG.getConstant(Offset, DL, PtrVT));
- Chains.push_back(DAG.getStore(Chain, DL, Arg, Add,
- MachinePointerInfo::getStack(MF, Offset),
- false, false, 0));
+ Chains.push_back(DAG.getStore(
+ Chain, DL, Arg, Add,
+ MachinePointerInfo::getFixedStack(MF, FI, Offset), 0));
}
if (!Chains.empty())
Chain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Chains);
+ } else if (IsVarArg) {
+ FINode = DAG.getIntPtrConstant(0, DL);
}
// Compute the operands for the CALLn node.
@@ -422,8 +397,10 @@ WebAssemblyTargetLowering::LowerCall(CallLoweringInfo &CLI,
// isn't reliable.
Ops.append(OutVals.begin(),
IsVarArg ? OutVals.begin() + NumFixedArgs : OutVals.end());
+ // Add a pointer to the vararg buffer.
+ if (IsVarArg) Ops.push_back(FINode);
- SmallVector<EVT, 8> Tys;
+ SmallVector<EVT, 8> InTys;
for (const auto &In : Ins) {
assert(!In.Flags.isByVal() && "byval is not valid for return values");
assert(!In.Flags.isNest() && "nest is not valid for return values");
@@ -436,13 +413,13 @@ WebAssemblyTargetLowering::LowerCall(CallLoweringInfo &CLI,
"WebAssembly hasn't implemented cons regs last return values");
// Ignore In.getOrigAlign() because all our arguments are passed in
// registers.
- Tys.push_back(In.VT);
+ InTys.push_back(In.VT);
}
- Tys.push_back(MVT::Other);
- SDVTList TyList = DAG.getVTList(Tys);
+ InTys.push_back(MVT::Other);
+ SDVTList InTyList = DAG.getVTList(InTys);
SDValue Res =
DAG.getNode(Ins.empty() ? WebAssemblyISD::CALL0 : WebAssemblyISD::CALL1,
- DL, TyList, Ops);
+ DL, InTyList, Ops);
if (Ins.empty()) {
Chain = Res;
} else {
@@ -450,11 +427,6 @@ WebAssemblyTargetLowering::LowerCall(CallLoweringInfo &CLI,
Chain = Res.getValue(1);
}
- if (NumBytes) {
- SDValue Unused = DAG.getTargetConstant(0, DL, PtrVT);
- Chain = DAG.getCALLSEQ_END(Chain, NB, Unused, SDValue(), DL);
- }
-
return Chain;
}
@@ -469,7 +441,7 @@ bool WebAssemblyTargetLowering::CanLowerReturn(
SDValue WebAssemblyTargetLowering::LowerReturn(
SDValue Chain, CallingConv::ID CallConv, bool /*IsVarArg*/,
const SmallVectorImpl<ISD::OutputArg> &Outs,
- const SmallVectorImpl<SDValue> &OutVals, SDLoc DL,
+ const SmallVectorImpl<SDValue> &OutVals, const SDLoc &DL,
SelectionDAG &DAG) const {
assert(Outs.size() <= 1 && "WebAssembly can only return up to one value");
if (!CallingConvSupported(CallConv))
@@ -496,10 +468,11 @@ SDValue WebAssemblyTargetLowering::LowerReturn(
}
SDValue WebAssemblyTargetLowering::LowerFormalArguments(
- SDValue Chain, CallingConv::ID CallConv, bool /*IsVarArg*/,
- const SmallVectorImpl<ISD::InputArg> &Ins, SDLoc DL, SelectionDAG &DAG,
- SmallVectorImpl<SDValue> &InVals) const {
+ SDValue Chain, CallingConv::ID CallConv, bool IsVarArg,
+ const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &DL,
+ SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals) const {
MachineFunction &MF = DAG.getMachineFunction();
+ auto *MFI = MF.getInfo<WebAssemblyFunctionInfo>();
if (!CallingConvSupported(CallConv))
fail(DL, DAG, "WebAssembly doesn't support non-C calling conventions");
@@ -509,8 +482,6 @@ SDValue WebAssemblyTargetLowering::LowerFormalArguments(
MF.getRegInfo().addLiveIn(WebAssembly::ARGUMENTS);
for (const ISD::InputArg &In : Ins) {
- if (In.Flags.isByVal())
- fail(DL, DAG, "WebAssembly hasn't implemented byval arguments");
if (In.Flags.isInAlloca())
fail(DL, DAG, "WebAssembly hasn't implemented inalloca arguments");
if (In.Flags.isNest())
@@ -528,11 +499,22 @@ SDValue WebAssemblyTargetLowering::LowerFormalArguments(
: DAG.getUNDEF(In.VT));
// Record the number and types of arguments.
- MF.getInfo<WebAssemblyFunctionInfo>()->addParam(In.VT);
+ MFI->addParam(In.VT);
}
- // Incoming varargs arguments are on the stack and will be accessed through
- // va_arg, so we don't need to do anything for them here.
+ // Varargs are copied into a buffer allocated by the caller, and a pointer to
+ // the buffer is passed as an argument.
+ if (IsVarArg) {
+ MVT PtrVT = getPointerTy(MF.getDataLayout());
+ unsigned VarargVreg =
+ MF.getRegInfo().createVirtualRegister(getRegClassFor(PtrVT));
+ MFI->setVarargBufferVreg(VarargVreg);
+ Chain = DAG.getCopyToReg(
+ Chain, DL, VarargVreg,
+ DAG.getNode(WebAssemblyISD::ARGUMENT, DL, PtrVT,
+ DAG.getTargetConstant(Ins.size(), DL, MVT::i32)));
+ MFI->addParam(PtrVT);
+ }
return Chain;
}
@@ -543,31 +525,85 @@ SDValue WebAssemblyTargetLowering::LowerFormalArguments(
SDValue WebAssemblyTargetLowering::LowerOperation(SDValue Op,
SelectionDAG &DAG) const {
+ SDLoc DL(Op);
switch (Op.getOpcode()) {
- default:
- llvm_unreachable("unimplemented operation lowering");
- return SDValue();
- case ISD::FrameIndex:
- return LowerFrameIndex(Op, DAG);
- case ISD::GlobalAddress:
- return LowerGlobalAddress(Op, DAG);
- case ISD::ExternalSymbol:
- return LowerExternalSymbol(Op, DAG);
- case ISD::JumpTable:
- return LowerJumpTable(Op, DAG);
- case ISD::BR_JT:
- return LowerBR_JT(Op, DAG);
- case ISD::VASTART:
- return LowerVASTART(Op, DAG);
+ default:
+ llvm_unreachable("unimplemented operation lowering");
+ return SDValue();
+ case ISD::FrameIndex:
+ return LowerFrameIndex(Op, DAG);
+ case ISD::GlobalAddress:
+ return LowerGlobalAddress(Op, DAG);
+ case ISD::ExternalSymbol:
+ return LowerExternalSymbol(Op, DAG);
+ case ISD::JumpTable:
+ return LowerJumpTable(Op, DAG);
+ case ISD::BR_JT:
+ return LowerBR_JT(Op, DAG);
+ case ISD::VASTART:
+ return LowerVASTART(Op, DAG);
+ case ISD::BlockAddress:
+ case ISD::BRIND:
+ fail(DL, DAG, "WebAssembly hasn't implemented computed gotos");
+ return SDValue();
+ case ISD::RETURNADDR: // Probably nothing meaningful can be returned here.
+ fail(DL, DAG, "WebAssembly hasn't implemented __builtin_return_address");
+ return SDValue();
+ case ISD::FRAMEADDR:
+ return LowerFRAMEADDR(Op, DAG);
+ case ISD::CopyToReg:
+ return LowerCopyToReg(Op, DAG);
}
}
+SDValue WebAssemblyTargetLowering::LowerCopyToReg(SDValue Op,
+ SelectionDAG &DAG) const {
+ SDValue Src = Op.getOperand(2);
+ if (isa<FrameIndexSDNode>(Src.getNode())) {
+ // CopyToReg nodes don't support FrameIndex operands. Other targets select
+ // the FI to some LEA-like instruction, but since we don't have that, we
+ // need to insert some kind of instruction that can take an FI operand and
+ // produces a value usable by CopyToReg (i.e. in a vreg). So insert a dummy
+ // copy_local between Op and its FI operand.
+ SDValue Chain = Op.getOperand(0);
+ SDLoc DL(Op);
+ unsigned Reg = cast<RegisterSDNode>(Op.getOperand(1))->getReg();
+ EVT VT = Src.getValueType();
+ SDValue Copy(
+ DAG.getMachineNode(VT == MVT::i32 ? WebAssembly::COPY_LOCAL_I32
+ : WebAssembly::COPY_LOCAL_I64,
+ DL, VT, Src),
+ 0);
+ return Op.getNode()->getNumValues() == 1
+ ? DAG.getCopyToReg(Chain, DL, Reg, Copy)
+ : DAG.getCopyToReg(Chain, DL, Reg, Copy, Op.getNumOperands() == 4
+ ? Op.getOperand(3)
+ : SDValue());
+ }
+ return SDValue();
+}
+
SDValue WebAssemblyTargetLowering::LowerFrameIndex(SDValue Op,
SelectionDAG &DAG) const {
int FI = cast<FrameIndexSDNode>(Op)->getIndex();
return DAG.getTargetFrameIndex(FI, Op.getValueType());
}
+SDValue WebAssemblyTargetLowering::LowerFRAMEADDR(SDValue Op,
+ SelectionDAG &DAG) const {
+ // Non-zero depths are not supported by WebAssembly currently. Use the
+ // legalizer's default expansion, which is to return 0 (what this function is
+ // documented to do).
+ if (Op.getConstantOperandVal(0) > 0)
+ return SDValue();
+
+ DAG.getMachineFunction().getFrameInfo()->setFrameAddressIsTaken(true);
+ EVT VT = Op.getValueType();
+ unsigned FP =
+ Subtarget->getRegisterInfo()->getFrameRegister(DAG.getMachineFunction());
+ return DAG.getCopyFromReg(DAG.getEntryNode(), SDLoc(Op), FP, VT);
+}
+
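For reference, the shape of source that exercises LowerFRAMEADDR (the function name below is made up): __builtin_frame_address(0) is lowered to a copy from the frame register, while any non-zero depth is left to the legalizer's default expansion, which yields 0.

    // Depth 0 reads the frame register; non-zero depths are not supported and
    // fall back to the default expansion, i.e. a null result.
    void *CurrentFrameAddress() {
      return __builtin_frame_address(0);
    }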
SDValue WebAssemblyTargetLowering::LowerGlobalAddress(SDValue Op,
SelectionDAG &DAG) const {
SDLoc DL(Op);
@@ -582,9 +618,8 @@ SDValue WebAssemblyTargetLowering::LowerGlobalAddress(SDValue Op,
DAG.getTargetGlobalAddress(GA->getGlobal(), DL, VT, GA->getOffset()));
}
-SDValue
-WebAssemblyTargetLowering::LowerExternalSymbol(SDValue Op,
- SelectionDAG &DAG) const {
+SDValue WebAssemblyTargetLowering::LowerExternalSymbol(
+ SDValue Op, SelectionDAG &DAG) const {
SDLoc DL(Op);
const auto *ES = cast<ExternalSymbolSDNode>(Op);
EVT VT = Op.getValueType();
@@ -603,7 +638,7 @@ WebAssemblyTargetLowering::LowerExternalSymbol(SDValue Op,
SDValue WebAssemblyTargetLowering::LowerJumpTable(SDValue Op,
SelectionDAG &DAG) const {
// There's no need for a Wrapper node because we always incorporate a jump
- // table operand into a TABLESWITCH instruction, rather than ever
+ // table operand into a BR_TABLE instruction, rather than ever
// materializing it in a register.
const JumpTableSDNode *JT = cast<JumpTableSDNode>(Op);
return DAG.getTargetJumpTable(JT->getIndex(), Op.getValueType(),
@@ -625,16 +660,15 @@ SDValue WebAssemblyTargetLowering::LowerBR_JT(SDValue Op,
MachineJumpTableInfo *MJTI = DAG.getMachineFunction().getJumpTableInfo();
const auto &MBBs = MJTI->getJumpTables()[JT->getIndex()].MBBs;
+ // Add an operand for each case.
+ for (auto MBB : MBBs) Ops.push_back(DAG.getBasicBlock(MBB));
+
  // TODO: For now, we just pick something arbitrary for a default case.
// We really want to sniff out the guard and put in the real default case (and
// delete the guard).
Ops.push_back(DAG.getBasicBlock(MBBs[0]));
- // Add an operand for each case.
- for (auto MBB : MBBs)
- Ops.push_back(DAG.getBasicBlock(MBB));
-
- return DAG.getNode(WebAssemblyISD::TABLESWITCH, DL, MVT::Other, Ops);
+ return DAG.getNode(WebAssemblyISD::BR_TABLE, DL, MVT::Other, Ops);
}
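The operand list assembled here follows the br_table instruction itself: the index, one label per case, and the default label last (for the moment reusing the first case, as the TODO notes). In source terms, the kind of dense switch that becomes a single br_table looks like the sketch below (Classify is an illustrative name only):

    // A dense, zero-based switch lowers to one br_table; the default case maps
    // onto the instruction's trailing label operand.
    int Classify(int X) {
      switch (X) {
      case 0: return 10;
      case 1: return 20;
      case 2: return 30;
      default: return -1;
      }
    }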
SDValue WebAssemblyTargetLowering::LowerVASTART(SDValue Op,
@@ -642,16 +676,13 @@ SDValue WebAssemblyTargetLowering::LowerVASTART(SDValue Op,
SDLoc DL(Op);
EVT PtrVT = getPointerTy(DAG.getMachineFunction().getDataLayout());
- // The incoming non-fixed arguments are placed on the top of the stack, with
- // natural alignment, at the point of the call, so the base pointer is just
- // the current frame pointer.
- DAG.getMachineFunction().getFrameInfo()->setFrameAddressIsTaken(true);
- unsigned FP =
- Subtarget->getRegisterInfo()->getFrameRegister(DAG.getMachineFunction());
- SDValue FrameAddr = DAG.getCopyFromReg(DAG.getEntryNode(), DL, FP, PtrVT);
+ auto *MFI = DAG.getMachineFunction().getInfo<WebAssemblyFunctionInfo>();
const Value *SV = cast<SrcValueSDNode>(Op.getOperand(2))->getValue();
- return DAG.getStore(Op.getOperand(0), DL, FrameAddr, Op.getOperand(1),
- MachinePointerInfo(SV), false, false, 0);
+
+ SDValue ArgN = DAG.getCopyFromReg(DAG.getEntryNode(), DL,
+ MFI->getVarargBufferVreg(), PtrVT);
+ return DAG.getStore(Op.getOperand(0), DL, ArgN, Op.getOperand(1),
+ MachinePointerInfo(SV), 0);
}
//===----------------------------------------------------------------------===//
diff --git a/lib/Target/WebAssembly/WebAssemblyISelLowering.h b/lib/Target/WebAssembly/WebAssemblyISelLowering.h
index e7232a042e12..5bc723028e63 100644
--- a/lib/Target/WebAssembly/WebAssemblyISelLowering.h
+++ b/lib/Target/WebAssembly/WebAssemblyISelLowering.h
@@ -29,17 +29,17 @@ enum NodeType : unsigned {
#undef HANDLE_NODETYPE
};
-} // end namespace WebAssemblyISD
+} // end namespace WebAssemblyISD
class WebAssemblySubtarget;
class WebAssemblyTargetMachine;
class WebAssemblyTargetLowering final : public TargetLowering {
-public:
+ public:
WebAssemblyTargetLowering(const TargetMachine &TM,
const WebAssemblySubtarget &STI);
-private:
+ private:
/// Keep a pointer to the WebAssemblySubtarget around so that we can make the
/// right decision when generating code for different targets.
const WebAssemblySubtarget *Subtarget;
@@ -49,13 +49,16 @@ private:
bool isOffsetFoldingLegal(const GlobalAddressSDNode *GA) const override;
MVT getScalarShiftAmountTy(const DataLayout &DL, EVT) const override;
const char *getTargetNodeName(unsigned Opcode) const override;
- std::pair<unsigned, const TargetRegisterClass *>
- getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI,
- StringRef Constraint, MVT VT) const override;
+ std::pair<unsigned, const TargetRegisterClass *> getRegForInlineAsmConstraint(
+ const TargetRegisterInfo *TRI, StringRef Constraint,
+ MVT VT) const override;
bool isCheapToSpeculateCttz() const override;
bool isCheapToSpeculateCtlz() const override;
bool isLegalAddressingMode(const DataLayout &DL, const AddrMode &AM, Type *Ty,
unsigned AS) const override;
+ bool allowsMisalignedMemoryAccesses(EVT, unsigned AddrSpace, unsigned Align,
+ bool *Fast) const override;
+ bool isIntDivCheap(EVT VT, AttributeSet Attr) const override;
SDValue LowerCall(CallLoweringInfo &CLI,
SmallVectorImpl<SDValue> &InVals) const override;
@@ -65,29 +68,31 @@ private:
LLVMContext &Context) const override;
SDValue LowerReturn(SDValue Chain, CallingConv::ID CallConv, bool isVarArg,
const SmallVectorImpl<ISD::OutputArg> &Outs,
- const SmallVectorImpl<SDValue> &OutVals, SDLoc dl,
+ const SmallVectorImpl<SDValue> &OutVals, const SDLoc &dl,
SelectionDAG &DAG) const override;
SDValue LowerFormalArguments(SDValue Chain, CallingConv::ID CallConv,
bool IsVarArg,
const SmallVectorImpl<ISD::InputArg> &Ins,
- SDLoc DL, SelectionDAG &DAG,
+ const SDLoc &DL, SelectionDAG &DAG,
SmallVectorImpl<SDValue> &InVals) const override;
// Custom lowering hooks.
SDValue LowerOperation(SDValue Op, SelectionDAG &DAG) const override;
SDValue LowerFrameIndex(SDValue Op, SelectionDAG &DAG) const;
+ SDValue LowerFRAMEADDR(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerGlobalAddress(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerExternalSymbol(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerBR_JT(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerJumpTable(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerVASTART(SDValue Op, SelectionDAG &DAG) const;
+ SDValue LowerCopyToReg(SDValue Op, SelectionDAG &DAG) const;
};
namespace WebAssembly {
FastISel *createFastISel(FunctionLoweringInfo &funcInfo,
const TargetLibraryInfo *libInfo);
-} // end namespace WebAssembly
+} // end namespace WebAssembly
-} // end namespace llvm
+} // end namespace llvm
#endif
diff --git a/lib/Target/WebAssembly/WebAssemblyInstrControl.td b/lib/Target/WebAssembly/WebAssemblyInstrControl.td
index fda95953db81..444e275c6ebf 100644
--- a/lib/Target/WebAssembly/WebAssemblyInstrControl.td
+++ b/lib/Target/WebAssembly/WebAssemblyInstrControl.td
@@ -16,12 +16,12 @@ let Defs = [ARGUMENTS] in {
let isBranch = 1, isTerminator = 1, hasCtrlDep = 1 in {
// The condition operand is a boolean value which WebAssembly represents as i32.
-def BR_IF : I<(outs), (ins I32:$cond, bb_op:$dst),
+def BR_IF : I<(outs), (ins bb_op:$dst, I32:$cond),
[(brcond I32:$cond, bb:$dst)],
- "br_if \t$cond, $dst">;
+ "br_if \t$dst, $cond">;
let isCodeGenOnly = 1 in
-def BR_UNLESS : I<(outs), (ins I32:$cond, bb_op:$dst), [],
- "br_unless\t$cond, $dst">;
+def BR_UNLESS : I<(outs), (ins bb_op:$dst, I32:$cond), [],
+ "br_unless\t$dst, $cond">;
let isBarrier = 1 in {
def BR : I<(outs), (ins bb_op:$dst),
[(br bb:$dst)],
@@ -32,27 +32,27 @@ def BR : I<(outs), (ins bb_op:$dst),
} // Defs = [ARGUMENTS]
def : Pat<(brcond (i32 (setne I32:$cond, 0)), bb:$dst),
- (BR_IF I32:$cond, bb_op:$dst)>;
+ (BR_IF bb_op:$dst, I32:$cond)>;
def : Pat<(brcond (i32 (seteq I32:$cond, 0)), bb:$dst),
- (BR_UNLESS I32:$cond, bb_op:$dst)>;
+ (BR_UNLESS bb_op:$dst, I32:$cond)>;
let Defs = [ARGUMENTS] in {
// TODO: SelectionDAG's lowering insists on using a pointer as the index for
-// jump tables, so in practice we don't ever use TABLESWITCH_I64 in wasm32 mode
+// jump tables, so in practice we don't ever use BR_TABLE_I64 in wasm32 mode
// currently.
// Set TSFlags{0} to 1 to indicate that the variable_ops are immediates.
// Set TSFlags{1} to 1 to indicate that the immediates represent labels.
let isTerminator = 1, hasCtrlDep = 1, isBarrier = 1 in {
-def TABLESWITCH_I32 : I<(outs), (ins I32:$index, bb_op:$default, variable_ops),
- [(WebAssemblytableswitch I32:$index, bb:$default)],
- "tableswitch\t$index, $default"> {
+def BR_TABLE_I32 : I<(outs), (ins I32:$index, variable_ops),
+ [(WebAssemblybr_table I32:$index)],
+ "br_table \t$index"> {
let TSFlags{0} = 1;
let TSFlags{1} = 1;
}
-def TABLESWITCH_I64 : I<(outs), (ins I64:$index, bb_op:$default, variable_ops),
- [(WebAssemblytableswitch I64:$index, bb:$default)],
- "tableswitch\t$index, $default"> {
+def BR_TABLE_I64 : I<(outs), (ins I64:$index, variable_ops),
+ [(WebAssemblybr_table I64:$index)],
+ "br_table \t$index"> {
let TSFlags{0} = 1;
let TSFlags{1} = 1;
}
@@ -71,6 +71,10 @@ def END_LOOP : I<(outs), (ins), [], "end_loop">;
multiclass RETURN<WebAssemblyRegClass vt> {
def RETURN_#vt : I<(outs), (ins vt:$val), [(WebAssemblyreturn vt:$val)],
"return \t$val">;
+ // Equivalent to RETURN_#vt, for use at the end of a function when wasm
+ // semantics return by falling off the end of the block.
+ let isCodeGenOnly = 1 in
+ def FALLTHROUGH_RETURN_#vt : I<(outs), (ins vt:$val), []>;
}
let isTerminator = 1, hasCtrlDep = 1, isBarrier = 1 in {
@@ -80,6 +84,10 @@ let isReturn = 1 in {
defm : RETURN<F32>;
defm : RETURN<F64>;
def RETURN_VOID : I<(outs), (ins), [(WebAssemblyreturn)], "return">;
+
+ // This is to RETURN_VOID what FALLTHROUGH_RETURN_#vt is to RETURN_#vt.
+ let isCodeGenOnly = 1 in
+ def FALLTHROUGH_RETURN_VOID : I<(outs), (ins), []>;
} // isReturn = 1
def UNREACHABLE : I<(outs), (ins), [(trap)], "unreachable">;
} // isTerminator = 1, hasCtrlDep = 1, isBarrier = 1
diff --git a/lib/Target/WebAssembly/WebAssemblyInstrFloat.td b/lib/Target/WebAssembly/WebAssemblyInstrFloat.td
index 5520c6de6732..64569720375c 100644
--- a/lib/Target/WebAssembly/WebAssemblyInstrFloat.td
+++ b/lib/Target/WebAssembly/WebAssemblyInstrFloat.td
@@ -77,12 +77,12 @@ def : Pat<(setge f64:$lhs, f64:$rhs), (GE_F64 f64:$lhs, f64:$rhs)>;
let Defs = [ARGUMENTS] in {
-def SELECT_F32 : I<(outs F32:$dst), (ins I32:$cond, F32:$lhs, F32:$rhs),
+def SELECT_F32 : I<(outs F32:$dst), (ins F32:$lhs, F32:$rhs, I32:$cond),
[(set F32:$dst, (select I32:$cond, F32:$lhs, F32:$rhs))],
- "f32.select\t$dst, $cond, $lhs, $rhs">;
-def SELECT_F64 : I<(outs F64:$dst), (ins I32:$cond, F64:$lhs, F64:$rhs),
+ "f32.select\t$dst, $lhs, $rhs, $cond">;
+def SELECT_F64 : I<(outs F64:$dst), (ins F64:$lhs, F64:$rhs, I32:$cond),
[(set F64:$dst, (select I32:$cond, F64:$lhs, F64:$rhs))],
- "f64.select\t$dst, $cond, $lhs, $rhs">;
+ "f64.select\t$dst, $lhs, $rhs, $cond">;
} // Defs = [ARGUMENTS]
@@ -90,12 +90,12 @@ def SELECT_F64 : I<(outs F64:$dst), (ins I32:$cond, F64:$lhs, F64:$rhs),
// WebAssembly's select interprets any non-zero value as true, so we can fold
// a setne with 0 into a select.
def : Pat<(select (i32 (setne I32:$cond, 0)), F32:$lhs, F32:$rhs),
- (SELECT_F32 I32:$cond, F32:$lhs, F32:$rhs)>;
+ (SELECT_F32 F32:$lhs, F32:$rhs, I32:$cond)>;
def : Pat<(select (i32 (setne I32:$cond, 0)), F64:$lhs, F64:$rhs),
- (SELECT_F64 I32:$cond, F64:$lhs, F64:$rhs)>;
+ (SELECT_F64 F64:$lhs, F64:$rhs, I32:$cond)>;
// And again, this time with seteq instead of setne and the arms reversed.
def : Pat<(select (i32 (seteq I32:$cond, 0)), F32:$lhs, F32:$rhs),
- (SELECT_F32 I32:$cond, F32:$rhs, F32:$lhs)>;
+ (SELECT_F32 F32:$rhs, F32:$lhs, I32:$cond)>;
def : Pat<(select (i32 (seteq I32:$cond, 0)), F64:$lhs, F64:$rhs),
- (SELECT_F64 I32:$cond, F64:$rhs, F64:$lhs)>;
+ (SELECT_F64 F64:$rhs, F64:$lhs, I32:$cond)>;
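These patterns encode the same fold that the integer file applies further down: wasm's select treats any non-zero condition as true, so a comparison against zero can be dropped, and for the == 0 form the arms swap. As a rough source-level picture (PickNonZero and PickZero are made-up names), both bodies below can map onto a single f32.select with no separate compare:

    float PickNonZero(float A, float B, int Cond) {
      return Cond != 0 ? A : B; // (select A, B, Cond)
    }

    float PickZero(float A, float B, int Cond) {
      return Cond == 0 ? A : B; // arms reversed: (select B, A, Cond)
    }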
diff --git a/lib/Target/WebAssembly/WebAssemblyInstrInfo.cpp b/lib/Target/WebAssembly/WebAssemblyInstrInfo.cpp
index 028e9af0834f..2fd3eab99d78 100644
--- a/lib/Target/WebAssembly/WebAssemblyInstrInfo.cpp
+++ b/lib/Target/WebAssembly/WebAssemblyInstrInfo.cpp
@@ -15,6 +15,7 @@
#include "WebAssemblyInstrInfo.h"
#include "MCTargetDesc/WebAssemblyMCTargetDesc.h"
+#include "WebAssemblyMachineFunctionInfo.h"
#include "WebAssemblySubtarget.h"
#include "llvm/CodeGen/MachineFrameInfo.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
@@ -32,16 +33,32 @@ WebAssemblyInstrInfo::WebAssemblyInstrInfo(const WebAssemblySubtarget &STI)
WebAssembly::ADJCALLSTACKUP),
RI(STI.getTargetTriple()) {}
+bool WebAssemblyInstrInfo::isReallyTriviallyReMaterializable(
+ const MachineInstr &MI, AliasAnalysis *AA) const {
+ switch (MI.getOpcode()) {
+ case WebAssembly::CONST_I32:
+ case WebAssembly::CONST_I64:
+ case WebAssembly::CONST_F32:
+ case WebAssembly::CONST_F64:
+ // isReallyTriviallyReMaterializableGeneric misses these because of the
+  // ARGUMENTS implicit def, so we manually override it here.
+ return true;
+ default:
+ return false;
+ }
+}
+
void WebAssemblyInstrInfo::copyPhysReg(MachineBasicBlock &MBB,
MachineBasicBlock::iterator I,
- DebugLoc DL, unsigned DestReg,
+ const DebugLoc &DL, unsigned DestReg,
unsigned SrcReg, bool KillSrc) const {
// This method is called by post-RA expansion, which expects only pregs to
// exist. However, we need to handle both here.
auto &MRI = MBB.getParent()->getRegInfo();
- const TargetRegisterClass *RC = TargetRegisterInfo::isVirtualRegister(DestReg) ?
- MRI.getRegClass(DestReg) :
- MRI.getTargetRegisterInfo()->getMinimalPhysRegClass(SrcReg);
+ const TargetRegisterClass *RC =
+ TargetRegisterInfo::isVirtualRegister(DestReg)
+ ? MRI.getRegClass(DestReg)
+ : MRI.getTargetRegisterInfo()->getMinimalPhysRegClass(DestReg);
unsigned CopyLocalOpcode;
if (RC == &WebAssembly::I32RegClass)
@@ -59,8 +76,23 @@ void WebAssemblyInstrInfo::copyPhysReg(MachineBasicBlock &MBB,
.addReg(SrcReg, KillSrc ? RegState::Kill : 0);
}
+MachineInstr *
+WebAssemblyInstrInfo::commuteInstructionImpl(MachineInstr &MI, bool NewMI,
+ unsigned OpIdx1,
+ unsigned OpIdx2) const {
+ // If the operands are stackified, we can't reorder them.
+ WebAssemblyFunctionInfo &MFI =
+ *MI.getParent()->getParent()->getInfo<WebAssemblyFunctionInfo>();
+ if (MFI.isVRegStackified(MI.getOperand(OpIdx1).getReg()) ||
+ MFI.isVRegStackified(MI.getOperand(OpIdx2).getReg()))
+ return nullptr;
+
+ // Otherwise use the default implementation.
+ return TargetInstrInfo::commuteInstructionImpl(MI, NewMI, OpIdx1, OpIdx2);
+}
+
// Branch analysis.
-bool WebAssemblyInstrInfo::AnalyzeBranch(MachineBasicBlock &MBB,
+bool WebAssemblyInstrInfo::analyzeBranch(MachineBasicBlock &MBB,
MachineBasicBlock *&TBB,
MachineBasicBlock *&FBB,
SmallVectorImpl<MachineOperand> &Cond,
@@ -75,22 +107,22 @@ bool WebAssemblyInstrInfo::AnalyzeBranch(MachineBasicBlock &MBB,
if (HaveCond)
return true;
// If we're running after CFGStackify, we can't optimize further.
- if (!MI.getOperand(1).isMBB())
+ if (!MI.getOperand(0).isMBB())
return true;
Cond.push_back(MachineOperand::CreateImm(true));
- Cond.push_back(MI.getOperand(0));
- TBB = MI.getOperand(1).getMBB();
+ Cond.push_back(MI.getOperand(1));
+ TBB = MI.getOperand(0).getMBB();
HaveCond = true;
break;
case WebAssembly::BR_UNLESS:
if (HaveCond)
return true;
// If we're running after CFGStackify, we can't optimize further.
- if (!MI.getOperand(1).isMBB())
+ if (!MI.getOperand(0).isMBB())
return true;
Cond.push_back(MachineOperand::CreateImm(false));
- Cond.push_back(MI.getOperand(0));
- TBB = MI.getOperand(1).getMBB();
+ Cond.push_back(MI.getOperand(1));
+ TBB = MI.getOperand(0).getMBB();
HaveCond = true;
break;
case WebAssembly::BR:
@@ -133,7 +165,7 @@ unsigned WebAssemblyInstrInfo::InsertBranch(MachineBasicBlock &MBB,
MachineBasicBlock *TBB,
MachineBasicBlock *FBB,
ArrayRef<MachineOperand> Cond,
- DebugLoc DL) const {
+ const DebugLoc &DL) const {
if (Cond.empty()) {
if (!TBB)
return 0;
@@ -145,13 +177,11 @@ unsigned WebAssemblyInstrInfo::InsertBranch(MachineBasicBlock &MBB,
assert(Cond.size() == 2 && "Expected a flag and a successor block");
if (Cond[0].getImm()) {
- BuildMI(&MBB, DL, get(WebAssembly::BR_IF))
- .addOperand(Cond[1])
- .addMBB(TBB);
+ BuildMI(&MBB, DL, get(WebAssembly::BR_IF)).addMBB(TBB).addOperand(Cond[1]);
} else {
BuildMI(&MBB, DL, get(WebAssembly::BR_UNLESS))
- .addOperand(Cond[1])
- .addMBB(TBB);
+ .addMBB(TBB)
+ .addOperand(Cond[1]);
}
if (!FBB)
return 1;
diff --git a/lib/Target/WebAssembly/WebAssemblyInstrInfo.h b/lib/Target/WebAssembly/WebAssemblyInstrInfo.h
index 5ddd9b36f243..d93f958ca4cd 100644
--- a/lib/Target/WebAssembly/WebAssemblyInstrInfo.h
+++ b/lib/Target/WebAssembly/WebAssemblyInstrInfo.h
@@ -34,18 +34,24 @@ public:
const WebAssemblyRegisterInfo &getRegisterInfo() const { return RI; }
+ bool isReallyTriviallyReMaterializable(const MachineInstr &MI,
+ AliasAnalysis *AA) const override;
+
void copyPhysReg(MachineBasicBlock &MBB, MachineBasicBlock::iterator MI,
- DebugLoc DL, unsigned DestReg, unsigned SrcReg,
+ const DebugLoc &DL, unsigned DestReg, unsigned SrcReg,
bool KillSrc) const override;
+ MachineInstr *commuteInstructionImpl(MachineInstr &MI, bool NewMI,
+ unsigned OpIdx1,
+ unsigned OpIdx2) const override;
- bool AnalyzeBranch(MachineBasicBlock &MBB, MachineBasicBlock *&TBB,
+ bool analyzeBranch(MachineBasicBlock &MBB, MachineBasicBlock *&TBB,
MachineBasicBlock *&FBB,
SmallVectorImpl<MachineOperand> &Cond,
bool AllowModify = false) const override;
unsigned RemoveBranch(MachineBasicBlock &MBB) const override;
unsigned InsertBranch(MachineBasicBlock &MBB, MachineBasicBlock *TBB,
MachineBasicBlock *FBB, ArrayRef<MachineOperand> Cond,
- DebugLoc DL) const override;
+ const DebugLoc &DL) const override;
bool
ReverseBranchCondition(SmallVectorImpl<MachineOperand> &Cond) const override;
};
diff --git a/lib/Target/WebAssembly/WebAssemblyInstrInfo.td b/lib/Target/WebAssembly/WebAssemblyInstrInfo.td
index 2e682a475471..4b319871cf16 100644
--- a/lib/Target/WebAssembly/WebAssemblyInstrInfo.td
+++ b/lib/Target/WebAssembly/WebAssemblyInstrInfo.td
@@ -30,7 +30,7 @@ def SDT_WebAssemblyCallSeqEnd :
SDCallSeqEnd<[SDTCisVT<0, iPTR>, SDTCisVT<1, iPTR>]>;
def SDT_WebAssemblyCall0 : SDTypeProfile<0, -1, [SDTCisPtrTy<0>]>;
def SDT_WebAssemblyCall1 : SDTypeProfile<1, -1, [SDTCisPtrTy<1>]>;
-def SDT_WebAssemblyTableswitch : SDTypeProfile<0, -1, [SDTCisPtrTy<0>]>;
+def SDT_WebAssemblyBrTable : SDTypeProfile<0, -1, [SDTCisPtrTy<0>]>;
def SDT_WebAssemblyArgument : SDTypeProfile<1, 1, [SDTCisVT<1, i32>]>;
def SDT_WebAssemblyReturn : SDTypeProfile<0, -1, []>;
def SDT_WebAssemblyWrapper : SDTypeProfile<1, 1, [SDTCisSameAs<0, 1>,
@@ -52,9 +52,9 @@ def WebAssemblycall0 : SDNode<"WebAssemblyISD::CALL0",
def WebAssemblycall1 : SDNode<"WebAssemblyISD::CALL1",
SDT_WebAssemblyCall1,
[SDNPHasChain, SDNPVariadic]>;
-def WebAssemblytableswitch : SDNode<"WebAssemblyISD::TABLESWITCH",
- SDT_WebAssemblyTableswitch,
- [SDNPHasChain, SDNPVariadic]>;
+def WebAssemblybr_table : SDNode<"WebAssemblyISD::BR_TABLE",
+ SDT_WebAssemblyBrTable,
+ [SDNPHasChain, SDNPVariadic]>;
def WebAssemblyargument : SDNode<"WebAssemblyISD::ARGUMENT",
SDT_WebAssemblyArgument>;
def WebAssemblyreturn : SDNode<"WebAssemblyISD::RETURN",
@@ -71,10 +71,17 @@ let OperandNamespace = "WebAssembly" in {
let OperandType = "OPERAND_BASIC_BLOCK" in
def bb_op : Operand<OtherVT>;
-let OperandType = "OPERAND_FPIMM" in {
+let OperandType = "OPERAND_FP32IMM" in
def f32imm_op : Operand<f32>;
+
+let OperandType = "OPERAND_FP64IMM" in
def f64imm_op : Operand<f64>;
-} // OperandType = "OPERAND_FPIMM"
+
+let OperandType = "OPERAND_P2ALIGN" in {
+def P2Align : Operand<i32> {
+ let PrintMethod = "printWebAssemblyP2AlignOperand";
+}
+} // OperandType = "OPERAND_P2ALIGN"
} // OperandNamespace = "WebAssembly"
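The new P2Align operand carries a memory access's alignment as its log2 (alignment 4 is p2align 2); the selection patterns below simply seed it with 0. A minimal sketch of that encoding, assuming the alignment is already a power of two (GetP2Align is a made-up helper name):

    #include <cstdint>

    // Log2 of a power-of-two alignment: 1 -> 0, 2 -> 1, 4 -> 2, 8 -> 3, ...
    unsigned GetP2Align(uint64_t Align) {
      unsigned P2 = 0;
      while ((uint64_t(1) << P2) < Align)
        ++P2;
      return P2;
    }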
@@ -101,15 +108,9 @@ defm : ARGUMENT<F64>;
let Defs = [ARGUMENTS] in {
// get_local and set_local are not generated by instruction selection; they
-// are implied by virtual register uses and defs in most contexts. However,
-// they are explicitly emitted for special purposes.
+// are implied by virtual register uses and defs.
multiclass LOCAL<WebAssemblyRegClass vt> {
- def GET_LOCAL_#vt : I<(outs vt:$res), (ins i32imm:$regno), [],
- "get_local\t$res, $regno">;
- // TODO: set_local returns its operand value
- def SET_LOCAL_#vt : I<(outs), (ins i32imm:$regno, vt:$src), [],
- "set_local\t$regno, $src">;
-
+let hasSideEffects = 0 in {
// COPY_LOCAL is not an actual instruction in wasm, but since we allow
// get_local and set_local to be implicit, we can have a COPY_LOCAL which
// is actually a no-op because all the work is done in the implied
@@ -117,13 +118,21 @@ multiclass LOCAL<WebAssemblyRegClass vt> {
let isAsCheapAsAMove = 1 in
def COPY_LOCAL_#vt : I<(outs vt:$res), (ins vt:$src), [],
"copy_local\t$res, $src">;
+
+ // TEE_LOCAL is similar to COPY_LOCAL, but writes two copies of its result.
+ // Typically this would be used to stackify one result and write the other
+ // result to a local.
+ let isAsCheapAsAMove = 1 in
+ def TEE_LOCAL_#vt : I<(outs vt:$res, vt:$also), (ins vt:$src), [],
+ "tee_local\t$res, $also, $src">;
+} // hasSideEffects = 0
}
defm : LOCAL<I32>;
defm : LOCAL<I64>;
defm : LOCAL<F32>;
defm : LOCAL<F64>;
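TEE_LOCAL's "writes two copies of its result" behaviour is roughly the C++ idiom of using an assignment as a value: one copy lands in the local, and the same value stays available to the surrounding expression. A loose analogy only (UseTwice is a made-up name), not a description of the emitted code:

    int UseTwice(int V) {
      int Local;
      // The inner assignment both stores V and yields it, so the addition
      // consumes the value without re-reading it from Local.
      int OnStack = (Local = V) + 1;
      return OnStack + Local;
    }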
-let isMoveImm = 1 in {
+let isMoveImm = 1, isAsCheapAsAMove = 1, isReMaterializable = 1 in {
def CONST_I32 : I<(outs I32:$res), (ins i32imm:$imm),
[(set I32:$res, imm:$imm)],
"i32.const\t$res, $imm">;
@@ -136,7 +145,7 @@ def CONST_F32 : I<(outs F32:$res), (ins f32imm_op:$imm),
def CONST_F64 : I<(outs F64:$res), (ins f64imm_op:$imm),
[(set F64:$res, fpimm:$imm)],
"f64.const\t$res, $imm">;
-} // isMoveImm = 1
+} // isMoveImm = 1, isAsCheapAsAMove = 1, isReMaterializable = 1
} // Defs = [ARGUMENTS]
diff --git a/lib/Target/WebAssembly/WebAssemblyInstrInteger.td b/lib/Target/WebAssembly/WebAssemblyInstrInteger.td
index 09e5eafb85e9..7eaa57bb217e 100644
--- a/lib/Target/WebAssembly/WebAssemblyInstrInteger.td
+++ b/lib/Target/WebAssembly/WebAssemblyInstrInteger.td
@@ -36,6 +36,8 @@ defm XOR : BinaryInt<xor, "xor ">;
defm SHL : BinaryInt<shl, "shl ">;
defm SHR_U : BinaryInt<srl, "shr_u">;
defm SHR_S : BinaryInt<sra, "shr_s">;
+defm ROTL : BinaryInt<rotl, "rotl">;
+defm ROTR : BinaryInt<rotr, "rotr">;
let isCommutable = 1 in {
defm EQ : ComparisonInt<SETEQ, "eq ">;
@@ -54,22 +56,29 @@ defm CLZ : UnaryInt<ctlz, "clz ">;
defm CTZ : UnaryInt<cttz, "ctz ">;
defm POPCNT : UnaryInt<ctpop, "popcnt">;
+def EQZ_I32 : I<(outs I32:$dst), (ins I32:$src),
+ [(set I32:$dst, (setcc I32:$src, 0, SETEQ))],
+ "i32.eqz \t$dst, $src">;
+def EQZ_I64 : I<(outs I32:$dst), (ins I64:$src),
+ [(set I32:$dst, (setcc I64:$src, 0, SETEQ))],
+ "i64.eqz \t$dst, $src">;
+
} // Defs = [ARGUMENTS]
-// Expand the "don't care" operations to supported operations.
-def : Pat<(ctlz_zero_undef I32:$src), (CLZ_I32 I32:$src)>;
-def : Pat<(ctlz_zero_undef I64:$src), (CLZ_I64 I64:$src)>;
-def : Pat<(cttz_zero_undef I32:$src), (CTZ_I32 I32:$src)>;
-def : Pat<(cttz_zero_undef I64:$src), (CTZ_I64 I64:$src)>;
+// Optimize away an explicit mask on a rotate count.
+def : Pat<(rotl I32:$lhs, (and I32:$rhs, 31)), (ROTL_I32 I32:$lhs, I32:$rhs)>;
+def : Pat<(rotr I32:$lhs, (and I32:$rhs, 31)), (ROTR_I32 I32:$lhs, I32:$rhs)>;
+def : Pat<(rotl I64:$lhs, (and I64:$rhs, 63)), (ROTL_I64 I64:$lhs, I64:$rhs)>;
+def : Pat<(rotr I64:$lhs, (and I64:$rhs, 63)), (ROTR_I64 I64:$lhs, I64:$rhs)>;
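These four patterns are justified by wasm's rotates taking their count modulo the bit width, which makes an explicit '& 31' (or '& 63') on the count redundant. A standalone check of that equivalence for the 32-bit case (RotL32 models the implicit masking):

    #include <cassert>
    #include <cstdint>

    // Rotate left with the count masking that wasm performs implicitly.
    uint32_t RotL32(uint32_t X, uint32_t N) {
      N &= 31;
      return (X << N) | (X >> ((32 - N) & 31));
    }

    int main() {
      for (uint32_t N = 0; N < 256; ++N)
        assert(RotL32(0x12345678u, N) == RotL32(0x12345678u, N & 31));
    }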
let Defs = [ARGUMENTS] in {
-def SELECT_I32 : I<(outs I32:$dst), (ins I32:$cond, I32:$lhs, I32:$rhs),
+def SELECT_I32 : I<(outs I32:$dst), (ins I32:$lhs, I32:$rhs, I32:$cond),
[(set I32:$dst, (select I32:$cond, I32:$lhs, I32:$rhs))],
- "i32.select\t$dst, $cond, $lhs, $rhs">;
-def SELECT_I64 : I<(outs I64:$dst), (ins I32:$cond, I64:$lhs, I64:$rhs),
+ "i32.select\t$dst, $lhs, $rhs, $cond">;
+def SELECT_I64 : I<(outs I64:$dst), (ins I64:$lhs, I64:$rhs, I32:$cond),
[(set I64:$dst, (select I32:$cond, I64:$lhs, I64:$rhs))],
- "i64.select\t$dst, $cond, $lhs, $rhs">;
+ "i64.select\t$dst, $lhs, $rhs, $cond">;
} // Defs = [ARGUMENTS]
@@ -77,12 +86,12 @@ def SELECT_I64 : I<(outs I64:$dst), (ins I32:$cond, I64:$lhs, I64:$rhs),
// WebAssembly's select interprets any non-zero value as true, so we can fold
// a setne with 0 into a select.
def : Pat<(select (i32 (setne I32:$cond, 0)), I32:$lhs, I32:$rhs),
- (SELECT_I32 I32:$cond, I32:$lhs, I32:$rhs)>;
+ (SELECT_I32 I32:$lhs, I32:$rhs, I32:$cond)>;
def : Pat<(select (i32 (setne I32:$cond, 0)), I64:$lhs, I64:$rhs),
- (SELECT_I64 I32:$cond, I64:$lhs, I64:$rhs)>;
+ (SELECT_I64 I64:$lhs, I64:$rhs, I32:$cond)>;
// And again, this time with seteq instead of setne and the arms reversed.
def : Pat<(select (i32 (seteq I32:$cond, 0)), I32:$lhs, I32:$rhs),
- (SELECT_I32 I32:$cond, I32:$rhs, I32:$lhs)>;
+ (SELECT_I32 I32:$rhs, I32:$lhs, I32:$cond)>;
def : Pat<(select (i32 (seteq I32:$cond, 0)), I64:$lhs, I64:$rhs),
- (SELECT_I64 I32:$cond, I64:$rhs, I64:$lhs)>;
+ (SELECT_I64 I64:$rhs, I64:$lhs, I32:$cond)>;
diff --git a/lib/Target/WebAssembly/WebAssemblyInstrMemory.td b/lib/Target/WebAssembly/WebAssemblyInstrMemory.td
index b39ac5212f87..521c664ca4ab 100644
--- a/lib/Target/WebAssembly/WebAssemblyInstrMemory.td
+++ b/lib/Target/WebAssembly/WebAssemblyInstrMemory.td
@@ -28,6 +28,18 @@ def regPlusImm : PatFrag<(ops node:$addr, node:$off),
(add node:$addr, node:$off),
[{ return N->getFlags()->hasNoUnsignedWrap(); }]>;
+// Treat an 'or' node as an 'add' if the operands have no set bits in common.
+def or_is_add : PatFrag<(ops node:$lhs, node:$rhs), (or node:$lhs, node:$rhs),[{
+ if (ConstantSDNode *CN = dyn_cast<ConstantSDNode>(N->getOperand(1)))
+ return CurDAG->MaskedValueIsZero(N->getOperand(0), CN->getAPIntValue());
+
+ APInt KnownZero0, KnownOne0;
+ CurDAG->computeKnownBits(N->getOperand(0), KnownZero0, KnownOne0, 0);
+ APInt KnownZero1, KnownOne1;
+ CurDAG->computeKnownBits(N->getOperand(1), KnownZero1, KnownOne1, 0);
+ return (~KnownZero0 & ~KnownZero1) == 0;
+}]>;
+
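The fragment above is sound because when no bit position is set in both operands, the addition produces no carries, so 'or' and 'add' compute the same value; in practice that happens when an aligned base address is or'ed with a small offset that fits in its known-zero low bits. A standalone illustration:

    #include <cassert>
    #include <cstdint>

    int main() {
      uint32_t Base = 0x1000; // low bits known to be zero (an aligned base)
      uint32_t Off = 0x8;     // offset fits entirely within those zero bits
      assert((Base & Off) == 0);            // no bit set in both operands
      assert((Base | Off) == (Base + Off)); // hence no carries: or == add
    }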
// GlobalAddresses are conceptually unsigned values, so we can also fold them
// into immediate values as long as their offsets are non-negative.
def regPlusGA : PatFrag<(ops node:$addr, node:$off),
@@ -46,325 +58,392 @@ def regPlusGA : PatFrag<(ops node:$addr, node:$off),
let Defs = [ARGUMENTS] in {
// Basic load.
-def LOAD_I32 : I<(outs I32:$dst), (ins i32imm:$off, I32:$addr), [],
- "i32.load\t$dst, ${off}(${addr})">;
-def LOAD_I64 : I<(outs I64:$dst), (ins i32imm:$off, I32:$addr), [],
- "i64.load\t$dst, ${off}(${addr})">;
-def LOAD_F32 : I<(outs F32:$dst), (ins i32imm:$off, I32:$addr), [],
- "f32.load\t$dst, ${off}(${addr})">;
-def LOAD_F64 : I<(outs F64:$dst), (ins i32imm:$off, I32:$addr), [],
- "f64.load\t$dst, ${off}(${addr})">;
+def LOAD_I32 : I<(outs I32:$dst), (ins i32imm:$off, I32:$addr,
+ P2Align:$p2align), [],
+ "i32.load\t$dst, ${off}(${addr})${p2align}">;
+def LOAD_I64 : I<(outs I64:$dst), (ins i32imm:$off, I32:$addr,
+ P2Align:$p2align), [],
+ "i64.load\t$dst, ${off}(${addr})${p2align}">;
+def LOAD_F32 : I<(outs F32:$dst), (ins i32imm:$off, I32:$addr,
+ P2Align:$p2align), [],
+ "f32.load\t$dst, ${off}(${addr})${p2align}">;
+def LOAD_F64 : I<(outs F64:$dst), (ins i32imm:$off, I32:$addr,
+ P2Align:$p2align), [],
+ "f64.load\t$dst, ${off}(${addr})${p2align}">;
} // Defs = [ARGUMENTS]
// Select loads with no constant offset.
-def : Pat<(i32 (load I32:$addr)), (LOAD_I32 0, $addr)>;
-def : Pat<(i64 (load I32:$addr)), (LOAD_I64 0, $addr)>;
-def : Pat<(f32 (load I32:$addr)), (LOAD_F32 0, $addr)>;
-def : Pat<(f64 (load I32:$addr)), (LOAD_F64 0, $addr)>;
+def : Pat<(i32 (load I32:$addr)), (LOAD_I32 0, $addr, 0)>;
+def : Pat<(i64 (load I32:$addr)), (LOAD_I64 0, $addr, 0)>;
+def : Pat<(f32 (load I32:$addr)), (LOAD_F32 0, $addr, 0)>;
+def : Pat<(f64 (load I32:$addr)), (LOAD_F64 0, $addr, 0)>;
// Select loads with a constant offset.
def : Pat<(i32 (load (regPlusImm I32:$addr, imm:$off))),
- (LOAD_I32 imm:$off, $addr)>;
+ (LOAD_I32 imm:$off, $addr, 0)>;
def : Pat<(i64 (load (regPlusImm I32:$addr, imm:$off))),
- (LOAD_I64 imm:$off, $addr)>;
+ (LOAD_I64 imm:$off, $addr, 0)>;
def : Pat<(f32 (load (regPlusImm I32:$addr, imm:$off))),
- (LOAD_F32 imm:$off, $addr)>;
+ (LOAD_F32 imm:$off, $addr, 0)>;
def : Pat<(f64 (load (regPlusImm I32:$addr, imm:$off))),
- (LOAD_F64 imm:$off, $addr)>;
+ (LOAD_F64 imm:$off, $addr, 0)>;
+def : Pat<(i32 (load (or_is_add I32:$addr, imm:$off))),
+ (LOAD_I32 imm:$off, $addr, 0)>;
+def : Pat<(i64 (load (or_is_add I32:$addr, imm:$off))),
+ (LOAD_I64 imm:$off, $addr, 0)>;
+def : Pat<(f32 (load (or_is_add I32:$addr, imm:$off))),
+ (LOAD_F32 imm:$off, $addr, 0)>;
+def : Pat<(f64 (load (or_is_add I32:$addr, imm:$off))),
+ (LOAD_F64 imm:$off, $addr, 0)>;
def : Pat<(i32 (load (regPlusGA I32:$addr,
(WebAssemblywrapper tglobaladdr:$off)))),
- (LOAD_I32 tglobaladdr:$off, $addr)>;
+ (LOAD_I32 tglobaladdr:$off, $addr, 0)>;
def : Pat<(i64 (load (regPlusGA I32:$addr,
(WebAssemblywrapper tglobaladdr:$off)))),
- (LOAD_I64 tglobaladdr:$off, $addr)>;
+ (LOAD_I64 tglobaladdr:$off, $addr, 0)>;
def : Pat<(f32 (load (regPlusGA I32:$addr,
(WebAssemblywrapper tglobaladdr:$off)))),
- (LOAD_F32 tglobaladdr:$off, $addr)>;
+ (LOAD_F32 tglobaladdr:$off, $addr, 0)>;
def : Pat<(f64 (load (regPlusGA I32:$addr,
(WebAssemblywrapper tglobaladdr:$off)))),
- (LOAD_F64 tglobaladdr:$off, $addr)>;
+ (LOAD_F64 tglobaladdr:$off, $addr, 0)>;
def : Pat<(i32 (load (add I32:$addr, (WebAssemblywrapper texternalsym:$off)))),
- (LOAD_I32 texternalsym:$off, $addr)>;
+ (LOAD_I32 texternalsym:$off, $addr, 0)>;
def : Pat<(i64 (load (add I32:$addr, (WebAssemblywrapper texternalsym:$off)))),
- (LOAD_I64 texternalsym:$off, $addr)>;
+ (LOAD_I64 texternalsym:$off, $addr, 0)>;
def : Pat<(f32 (load (add I32:$addr, (WebAssemblywrapper texternalsym:$off)))),
- (LOAD_F32 texternalsym:$off, $addr)>;
+ (LOAD_F32 texternalsym:$off, $addr, 0)>;
def : Pat<(f64 (load (add I32:$addr, (WebAssemblywrapper texternalsym:$off)))),
- (LOAD_F64 texternalsym:$off, $addr)>;
+ (LOAD_F64 texternalsym:$off, $addr, 0)>;
// Select loads with just a constant offset.
-def : Pat<(i32 (load imm:$off)), (LOAD_I32 imm:$off, (CONST_I32 0))>;
-def : Pat<(i64 (load imm:$off)), (LOAD_I64 imm:$off, (CONST_I32 0))>;
-def : Pat<(f32 (load imm:$off)), (LOAD_F32 imm:$off, (CONST_I32 0))>;
-def : Pat<(f64 (load imm:$off)), (LOAD_F64 imm:$off, (CONST_I32 0))>;
+def : Pat<(i32 (load imm:$off)), (LOAD_I32 imm:$off, (CONST_I32 0), 0)>;
+def : Pat<(i64 (load imm:$off)), (LOAD_I64 imm:$off, (CONST_I32 0), 0)>;
+def : Pat<(f32 (load imm:$off)), (LOAD_F32 imm:$off, (CONST_I32 0), 0)>;
+def : Pat<(f64 (load imm:$off)), (LOAD_F64 imm:$off, (CONST_I32 0), 0)>;
def : Pat<(i32 (load (WebAssemblywrapper tglobaladdr:$off))),
- (LOAD_I32 tglobaladdr:$off, (CONST_I32 0))>;
+ (LOAD_I32 tglobaladdr:$off, (CONST_I32 0), 0)>;
def : Pat<(i64 (load (WebAssemblywrapper tglobaladdr:$off))),
- (LOAD_I64 tglobaladdr:$off, (CONST_I32 0))>;
+ (LOAD_I64 tglobaladdr:$off, (CONST_I32 0), 0)>;
def : Pat<(f32 (load (WebAssemblywrapper tglobaladdr:$off))),
- (LOAD_F32 tglobaladdr:$off, (CONST_I32 0))>;
+ (LOAD_F32 tglobaladdr:$off, (CONST_I32 0), 0)>;
def : Pat<(f64 (load (WebAssemblywrapper tglobaladdr:$off))),
- (LOAD_F64 tglobaladdr:$off, (CONST_I32 0))>;
+ (LOAD_F64 tglobaladdr:$off, (CONST_I32 0), 0)>;
def : Pat<(i32 (load (WebAssemblywrapper texternalsym:$off))),
- (LOAD_I32 texternalsym:$off, (CONST_I32 0))>;
+ (LOAD_I32 texternalsym:$off, (CONST_I32 0), 0)>;
def : Pat<(i64 (load (WebAssemblywrapper texternalsym:$off))),
- (LOAD_I64 texternalsym:$off, (CONST_I32 0))>;
+ (LOAD_I64 texternalsym:$off, (CONST_I32 0), 0)>;
def : Pat<(f32 (load (WebAssemblywrapper texternalsym:$off))),
- (LOAD_F32 texternalsym:$off, (CONST_I32 0))>;
+ (LOAD_F32 texternalsym:$off, (CONST_I32 0), 0)>;
def : Pat<(f64 (load (WebAssemblywrapper texternalsym:$off))),
- (LOAD_F64 texternalsym:$off, (CONST_I32 0))>;
+ (LOAD_F64 texternalsym:$off, (CONST_I32 0), 0)>;
let Defs = [ARGUMENTS] in {
// Extending load.
-def LOAD8_S_I32 : I<(outs I32:$dst), (ins i32imm:$off, I32:$addr), [],
- "i32.load8_s\t$dst, ${off}(${addr})">;
-def LOAD8_U_I32 : I<(outs I32:$dst), (ins i32imm:$off, I32:$addr), [],
- "i32.load8_u\t$dst, ${off}(${addr})">;
-def LOAD16_S_I32 : I<(outs I32:$dst), (ins i32imm:$off, I32:$addr), [],
- "i32.load16_s\t$dst, ${off}(${addr})">;
-def LOAD16_U_I32 : I<(outs I32:$dst), (ins i32imm:$off, I32:$addr), [],
- "i32.load16_u\t$dst, ${off}(${addr})">;
-def LOAD8_S_I64 : I<(outs I64:$dst), (ins i32imm:$off, I32:$addr), [],
- "i64.load8_s\t$dst, ${off}(${addr})">;
-def LOAD8_U_I64 : I<(outs I64:$dst), (ins i32imm:$off, I32:$addr), [],
- "i64.load8_u\t$dst, ${off}(${addr})">;
-def LOAD16_S_I64 : I<(outs I64:$dst), (ins i32imm:$off, I32:$addr), [],
- "i64.load16_s\t$dst, ${off}(${addr})">;
-def LOAD16_U_I64 : I<(outs I64:$dst), (ins i32imm:$off, I32:$addr), [],
- "i64.load16_u\t$dst, ${off}(${addr})">;
-def LOAD32_S_I64 : I<(outs I64:$dst), (ins i32imm:$off, I32:$addr), [],
- "i64.load32_s\t$dst, ${off}(${addr})">;
-def LOAD32_U_I64 : I<(outs I64:$dst), (ins i32imm:$off, I32:$addr), [],
- "i64.load32_u\t$dst, ${off}(${addr})">;
+def LOAD8_S_I32 : I<(outs I32:$dst), (ins i32imm:$off, I32:$addr,
+ P2Align:$p2align), [],
+ "i32.load8_s\t$dst, ${off}(${addr})${p2align}">;
+def LOAD8_U_I32 : I<(outs I32:$dst), (ins i32imm:$off, I32:$addr,
+ P2Align:$p2align), [],
+ "i32.load8_u\t$dst, ${off}(${addr})${p2align}">;
+def LOAD16_S_I32 : I<(outs I32:$dst), (ins i32imm:$off, I32:$addr,
+ P2Align:$p2align), [],
+ "i32.load16_s\t$dst, ${off}(${addr})${p2align}">;
+def LOAD16_U_I32 : I<(outs I32:$dst), (ins i32imm:$off, I32:$addr,
+ P2Align:$p2align), [],
+ "i32.load16_u\t$dst, ${off}(${addr})${p2align}">;
+def LOAD8_S_I64 : I<(outs I64:$dst), (ins i32imm:$off, I32:$addr,
+ P2Align:$p2align), [],
+ "i64.load8_s\t$dst, ${off}(${addr})${p2align}">;
+def LOAD8_U_I64 : I<(outs I64:$dst), (ins i32imm:$off, I32:$addr,
+ P2Align:$p2align), [],
+ "i64.load8_u\t$dst, ${off}(${addr})${p2align}">;
+def LOAD16_S_I64 : I<(outs I64:$dst), (ins i32imm:$off, I32:$addr,
+ P2Align:$p2align), [],
+ "i64.load16_s\t$dst, ${off}(${addr})${p2align}">;
+def LOAD16_U_I64 : I<(outs I64:$dst), (ins i32imm:$off, I32:$addr,
+ P2Align:$p2align), [],
+ "i64.load16_u\t$dst, ${off}(${addr})${p2align}">;
+def LOAD32_S_I64 : I<(outs I64:$dst), (ins i32imm:$off, I32:$addr,
+ P2Align:$p2align), [],
+ "i64.load32_s\t$dst, ${off}(${addr})${p2align}">;
+def LOAD32_U_I64 : I<(outs I64:$dst), (ins i32imm:$off, I32:$addr,
+ P2Align:$p2align), [],
+ "i64.load32_u\t$dst, ${off}(${addr})${p2align}">;
} // Defs = [ARGUMENTS]
// Select extending loads with no constant offset.
-def : Pat<(i32 (sextloadi8 I32:$addr)), (LOAD8_S_I32 0, $addr)>;
-def : Pat<(i32 (zextloadi8 I32:$addr)), (LOAD8_U_I32 0, $addr)>;
-def : Pat<(i32 (sextloadi16 I32:$addr)), (LOAD16_S_I32 0, $addr)>;
-def : Pat<(i32 (zextloadi16 I32:$addr)), (LOAD16_U_I32 0, $addr)>;
-def : Pat<(i64 (sextloadi8 I32:$addr)), (LOAD8_S_I64 0, $addr)>;
-def : Pat<(i64 (zextloadi8 I32:$addr)), (LOAD8_U_I64 0, $addr)>;
-def : Pat<(i64 (sextloadi16 I32:$addr)), (LOAD16_S_I64 0, $addr)>;
-def : Pat<(i64 (zextloadi16 I32:$addr)), (LOAD16_U_I64 0, $addr)>;
-def : Pat<(i64 (sextloadi32 I32:$addr)), (LOAD32_S_I64 0, $addr)>;
-def : Pat<(i64 (zextloadi32 I32:$addr)), (LOAD32_U_I64 0, $addr)>;
+def : Pat<(i32 (sextloadi8 I32:$addr)), (LOAD8_S_I32 0, $addr, 0)>;
+def : Pat<(i32 (zextloadi8 I32:$addr)), (LOAD8_U_I32 0, $addr, 0)>;
+def : Pat<(i32 (sextloadi16 I32:$addr)), (LOAD16_S_I32 0, $addr, 0)>;
+def : Pat<(i32 (zextloadi16 I32:$addr)), (LOAD16_U_I32 0, $addr, 0)>;
+def : Pat<(i64 (sextloadi8 I32:$addr)), (LOAD8_S_I64 0, $addr, 0)>;
+def : Pat<(i64 (zextloadi8 I32:$addr)), (LOAD8_U_I64 0, $addr, 0)>;
+def : Pat<(i64 (sextloadi16 I32:$addr)), (LOAD16_S_I64 0, $addr, 0)>;
+def : Pat<(i64 (zextloadi16 I32:$addr)), (LOAD16_U_I64 0, $addr, 0)>;
+def : Pat<(i64 (sextloadi32 I32:$addr)), (LOAD32_S_I64 0, $addr, 0)>;
+def : Pat<(i64 (zextloadi32 I32:$addr)), (LOAD32_U_I64 0, $addr, 0)>;
// Select extending loads with a constant offset.
def : Pat<(i32 (sextloadi8 (regPlusImm I32:$addr, imm:$off))),
- (LOAD8_S_I32 imm:$off, $addr)>;
+ (LOAD8_S_I32 imm:$off, $addr, 0)>;
def : Pat<(i32 (zextloadi8 (regPlusImm I32:$addr, imm:$off))),
- (LOAD8_U_I32 imm:$off, $addr)>;
+ (LOAD8_U_I32 imm:$off, $addr, 0)>;
def : Pat<(i32 (sextloadi16 (regPlusImm I32:$addr, imm:$off))),
- (LOAD16_S_I32 imm:$off, $addr)>;
+ (LOAD16_S_I32 imm:$off, $addr, 0)>;
def : Pat<(i32 (zextloadi16 (regPlusImm I32:$addr, imm:$off))),
- (LOAD16_U_I32 imm:$off, $addr)>;
+ (LOAD16_U_I32 imm:$off, $addr, 0)>;
def : Pat<(i64 (sextloadi8 (regPlusImm I32:$addr, imm:$off))),
- (LOAD8_S_I64 imm:$off, $addr)>;
+ (LOAD8_S_I64 imm:$off, $addr, 0)>;
def : Pat<(i64 (zextloadi8 (regPlusImm I32:$addr, imm:$off))),
- (LOAD8_U_I64 imm:$off, $addr)>;
+ (LOAD8_U_I64 imm:$off, $addr, 0)>;
def : Pat<(i64 (sextloadi16 (regPlusImm I32:$addr, imm:$off))),
- (LOAD16_S_I64 imm:$off, $addr)>;
+ (LOAD16_S_I64 imm:$off, $addr, 0)>;
def : Pat<(i64 (zextloadi16 (regPlusImm I32:$addr, imm:$off))),
- (LOAD16_U_I64 imm:$off, $addr)>;
+ (LOAD16_U_I64 imm:$off, $addr, 0)>;
def : Pat<(i64 (sextloadi32 (regPlusImm I32:$addr, imm:$off))),
- (LOAD32_S_I64 imm:$off, $addr)>;
+ (LOAD32_S_I64 imm:$off, $addr, 0)>;
def : Pat<(i64 (zextloadi32 (regPlusImm I32:$addr, imm:$off))),
- (LOAD32_U_I64 imm:$off, $addr)>;
+ (LOAD32_U_I64 imm:$off, $addr, 0)>;
+def : Pat<(i32 (sextloadi8 (or_is_add I32:$addr, imm:$off))),
+ (LOAD8_S_I32 imm:$off, $addr, 0)>;
+def : Pat<(i32 (zextloadi8 (or_is_add I32:$addr, imm:$off))),
+ (LOAD8_U_I32 imm:$off, $addr, 0)>;
+def : Pat<(i32 (sextloadi16 (or_is_add I32:$addr, imm:$off))),
+ (LOAD16_S_I32 imm:$off, $addr, 0)>;
+def : Pat<(i32 (zextloadi16 (or_is_add I32:$addr, imm:$off))),
+ (LOAD16_U_I32 imm:$off, $addr, 0)>;
+def : Pat<(i64 (sextloadi8 (or_is_add I32:$addr, imm:$off))),
+ (LOAD8_S_I64 imm:$off, $addr, 0)>;
+def : Pat<(i64 (zextloadi8 (or_is_add I32:$addr, imm:$off))),
+ (LOAD8_U_I64 imm:$off, $addr, 0)>;
+def : Pat<(i64 (sextloadi16 (or_is_add I32:$addr, imm:$off))),
+ (LOAD16_S_I64 imm:$off, $addr, 0)>;
+def : Pat<(i64 (zextloadi16 (or_is_add I32:$addr, imm:$off))),
+ (LOAD16_U_I64 imm:$off, $addr, 0)>;
+def : Pat<(i64 (sextloadi32 (or_is_add I32:$addr, imm:$off))),
+ (LOAD32_S_I64 imm:$off, $addr, 0)>;
+def : Pat<(i64 (zextloadi32 (or_is_add I32:$addr, imm:$off))),
+ (LOAD32_U_I64 imm:$off, $addr, 0)>;
def : Pat<(i32 (sextloadi8 (regPlusGA I32:$addr,
(WebAssemblywrapper tglobaladdr:$off)))),
- (LOAD8_S_I32 tglobaladdr:$off, $addr)>;
+ (LOAD8_S_I32 tglobaladdr:$off, $addr, 0)>;
def : Pat<(i32 (zextloadi8 (regPlusGA I32:$addr,
(WebAssemblywrapper tglobaladdr:$off)))),
- (LOAD8_U_I32 tglobaladdr:$off, $addr)>;
+ (LOAD8_U_I32 tglobaladdr:$off, $addr, 0)>;
def : Pat<(i32 (sextloadi16 (regPlusGA I32:$addr,
(WebAssemblywrapper tglobaladdr:$off)))),
- (LOAD16_S_I32 tglobaladdr:$off, $addr)>;
+ (LOAD16_S_I32 tglobaladdr:$off, $addr, 0)>;
def : Pat<(i32 (zextloadi16 (regPlusGA I32:$addr,
(WebAssemblywrapper tglobaladdr:$off)))),
- (LOAD16_U_I32 tglobaladdr:$off, $addr)>;
+ (LOAD16_U_I32 tglobaladdr:$off, $addr, 0)>;
def : Pat<(i64 (sextloadi8 (regPlusGA I32:$addr,
(WebAssemblywrapper tglobaladdr:$off)))),
- (LOAD8_S_I64 tglobaladdr:$off, $addr)>;
+ (LOAD8_S_I64 tglobaladdr:$off, $addr, 0)>;
def : Pat<(i64 (zextloadi8 (regPlusGA I32:$addr,
(WebAssemblywrapper tglobaladdr:$off)))),
- (LOAD8_U_I64 tglobaladdr:$off, $addr)>;
+ (LOAD8_U_I64 tglobaladdr:$off, $addr, 0)>;
def : Pat<(i64 (sextloadi16 (regPlusGA I32:$addr,
(WebAssemblywrapper tglobaladdr:$off)))),
- (LOAD16_S_I64 tglobaladdr:$off, $addr)>;
+ (LOAD16_S_I64 tglobaladdr:$off, $addr, 0)>;
def : Pat<(i64 (zextloadi16 (regPlusGA I32:$addr,
(WebAssemblywrapper tglobaladdr:$off)))),
- (LOAD16_U_I64 tglobaladdr:$off, $addr)>;
+ (LOAD16_U_I64 tglobaladdr:$off, $addr, 0)>;
def : Pat<(i64 (sextloadi32 (regPlusGA I32:$addr,
(WebAssemblywrapper tglobaladdr:$off)))),
- (LOAD32_S_I64 tglobaladdr:$off, $addr)>;
+ (LOAD32_S_I64 tglobaladdr:$off, $addr, 0)>;
def : Pat<(i64 (zextloadi32 (regPlusGA I32:$addr,
(WebAssemblywrapper tglobaladdr:$off)))),
- (LOAD32_U_I64 tglobaladdr:$off, $addr)>;
+ (LOAD32_U_I64 tglobaladdr:$off, $addr, 0)>;
def : Pat<(i32 (sextloadi8 (add I32:$addr,
(WebAssemblywrapper texternalsym:$off)))),
- (LOAD8_S_I32 texternalsym:$off, $addr)>;
+ (LOAD8_S_I32 texternalsym:$off, $addr, 0)>;
def : Pat<(i32 (zextloadi8 (add I32:$addr,
(WebAssemblywrapper texternalsym:$off)))),
- (LOAD8_U_I32 texternalsym:$off, $addr)>;
+ (LOAD8_U_I32 texternalsym:$off, $addr, 0)>;
def : Pat<(i32 (sextloadi16 (add I32:$addr,
(WebAssemblywrapper texternalsym:$off)))),
- (LOAD16_S_I32 texternalsym:$off, $addr)>;
+ (LOAD16_S_I32 texternalsym:$off, $addr, 0)>;
def : Pat<(i32 (zextloadi16 (add I32:$addr,
(WebAssemblywrapper texternalsym:$off)))),
- (LOAD16_U_I32 texternalsym:$off, $addr)>;
+ (LOAD16_U_I32 texternalsym:$off, $addr, 0)>;
def : Pat<(i64 (sextloadi8 (add I32:$addr,
(WebAssemblywrapper texternalsym:$off)))),
- (LOAD8_S_I64 texternalsym:$off, $addr)>;
+ (LOAD8_S_I64 texternalsym:$off, $addr, 0)>;
def : Pat<(i64 (zextloadi8 (add I32:$addr,
(WebAssemblywrapper texternalsym:$off)))),
- (LOAD8_U_I64 texternalsym:$off, $addr)>;
+ (LOAD8_U_I64 texternalsym:$off, $addr, 0)>;
def : Pat<(i64 (sextloadi16 (add I32:$addr,
(WebAssemblywrapper texternalsym:$off)))),
- (LOAD16_S_I64 texternalsym:$off, $addr)>;
+ (LOAD16_S_I64 texternalsym:$off, $addr, 0)>;
def : Pat<(i64 (zextloadi16 (add I32:$addr,
(WebAssemblywrapper texternalsym:$off)))),
- (LOAD16_U_I64 texternalsym:$off, $addr)>;
+ (LOAD16_U_I64 texternalsym:$off, $addr, 0)>;
def : Pat<(i64 (sextloadi32 (add I32:$addr,
(WebAssemblywrapper texternalsym:$off)))),
- (LOAD32_S_I64 texternalsym:$off, $addr)>;
+ (LOAD32_S_I64 texternalsym:$off, $addr, 0)>;
def : Pat<(i64 (zextloadi32 (add I32:$addr,
(WebAssemblywrapper texternalsym:$off)))),
- (LOAD32_U_I64 texternalsym:$off, $addr)>;
+ (LOAD32_U_I64 texternalsym:$off, $addr, 0)>;
// Select extending loads with just a constant offset.
-def : Pat<(i32 (sextloadi8 imm:$off)), (LOAD8_S_I32 imm:$off, (CONST_I32 0))>;
-def : Pat<(i32 (zextloadi8 imm:$off)), (LOAD8_U_I32 imm:$off, (CONST_I32 0))>;
-def : Pat<(i32 (sextloadi16 imm:$off)), (LOAD16_S_I32 imm:$off, (CONST_I32 0))>;
-def : Pat<(i32 (zextloadi16 imm:$off)), (LOAD16_U_I32 imm:$off, (CONST_I32 0))>;
-def : Pat<(i64 (sextloadi8 imm:$off)), (LOAD8_S_I64 imm:$off, (CONST_I32 0))>;
-def : Pat<(i64 (zextloadi8 imm:$off)), (LOAD8_U_I64 imm:$off, (CONST_I32 0))>;
-def : Pat<(i64 (sextloadi16 imm:$off)), (LOAD16_S_I64 imm:$off, (CONST_I32 0))>;
-def : Pat<(i64 (zextloadi16 imm:$off)), (LOAD16_U_I64 imm:$off, (CONST_I32 0))>;
-def : Pat<(i64 (sextloadi32 imm:$off)), (LOAD32_S_I64 imm:$off, (CONST_I32 0))>;
-def : Pat<(i64 (zextloadi32 imm:$off)), (LOAD32_U_I64 imm:$off, (CONST_I32 0))>;
+def : Pat<(i32 (sextloadi8 imm:$off)),
+ (LOAD8_S_I32 imm:$off, (CONST_I32 0), 0)>;
+def : Pat<(i32 (zextloadi8 imm:$off)),
+ (LOAD8_U_I32 imm:$off, (CONST_I32 0), 0)>;
+def : Pat<(i32 (sextloadi16 imm:$off)),
+ (LOAD16_S_I32 imm:$off, (CONST_I32 0), 0)>;
+def : Pat<(i32 (zextloadi16 imm:$off)),
+ (LOAD16_U_I32 imm:$off, (CONST_I32 0), 0)>;
+def : Pat<(i64 (sextloadi8 imm:$off)),
+ (LOAD8_S_I64 imm:$off, (CONST_I32 0), 0)>;
+def : Pat<(i64 (zextloadi8 imm:$off)),
+ (LOAD8_U_I64 imm:$off, (CONST_I32 0), 0)>;
+def : Pat<(i64 (sextloadi16 imm:$off)),
+ (LOAD16_S_I64 imm:$off, (CONST_I32 0), 0)>;
+def : Pat<(i64 (zextloadi16 imm:$off)),
+ (LOAD16_U_I64 imm:$off, (CONST_I32 0), 0)>;
+def : Pat<(i64 (sextloadi32 imm:$off)),
+ (LOAD32_S_I64 imm:$off, (CONST_I32 0), 0)>;
+def : Pat<(i64 (zextloadi32 imm:$off)),
+ (LOAD32_U_I64 imm:$off, (CONST_I32 0), 0)>;
def : Pat<(i32 (sextloadi8 (WebAssemblywrapper tglobaladdr:$off))),
- (LOAD8_S_I32 tglobaladdr:$off, (CONST_I32 0))>;
+ (LOAD8_S_I32 tglobaladdr:$off, (CONST_I32 0), 0)>;
def : Pat<(i32 (zextloadi8 (WebAssemblywrapper tglobaladdr:$off))),
- (LOAD8_U_I32 tglobaladdr:$off, (CONST_I32 0))>;
+ (LOAD8_U_I32 tglobaladdr:$off, (CONST_I32 0), 0)>;
def : Pat<(i32 (sextloadi16 (WebAssemblywrapper tglobaladdr:$off))),
- (LOAD16_S_I32 tglobaladdr:$off, (CONST_I32 0))>;
+ (LOAD16_S_I32 tglobaladdr:$off, (CONST_I32 0), 0)>;
def : Pat<(i32 (zextloadi16 (WebAssemblywrapper tglobaladdr:$off))),
- (LOAD16_U_I32 tglobaladdr:$off, (CONST_I32 0))>;
+ (LOAD16_U_I32 tglobaladdr:$off, (CONST_I32 0), 0)>;
def : Pat<(i64 (sextloadi8 (WebAssemblywrapper tglobaladdr:$off))),
- (LOAD8_S_I64 tglobaladdr:$off, (CONST_I32 0))>;
+ (LOAD8_S_I64 tglobaladdr:$off, (CONST_I32 0), 0)>;
def : Pat<(i64 (zextloadi8 (WebAssemblywrapper tglobaladdr:$off))),
- (LOAD8_U_I64 tglobaladdr:$off, (CONST_I32 0))>;
+ (LOAD8_U_I64 tglobaladdr:$off, (CONST_I32 0), 0)>;
def : Pat<(i64 (sextloadi16 (WebAssemblywrapper tglobaladdr:$off))),
- (LOAD16_S_I64 tglobaladdr:$off, (CONST_I32 0))>;
+ (LOAD16_S_I64 tglobaladdr:$off, (CONST_I32 0), 0)>;
def : Pat<(i64 (zextloadi16 (WebAssemblywrapper tglobaladdr:$off))),
- (LOAD16_U_I64 tglobaladdr:$off, (CONST_I32 0))>;
+ (LOAD16_U_I64 tglobaladdr:$off, (CONST_I32 0), 0)>;
def : Pat<(i64 (sextloadi32 (WebAssemblywrapper tglobaladdr:$off))),
- (LOAD32_S_I64 tglobaladdr:$off, (CONST_I32 0))>;
+ (LOAD32_S_I64 tglobaladdr:$off, (CONST_I32 0), 0)>;
def : Pat<(i64 (zextloadi32 (WebAssemblywrapper tglobaladdr:$off))),
- (LOAD32_U_I64 tglobaladdr:$off, (CONST_I32 0))>;
+ (LOAD32_U_I64 tglobaladdr:$off, (CONST_I32 0), 0)>;
def : Pat<(i32 (sextloadi8 (WebAssemblywrapper texternalsym:$off))),
- (LOAD8_S_I32 texternalsym:$off, (CONST_I32 0))>;
+ (LOAD8_S_I32 texternalsym:$off, (CONST_I32 0), 0)>;
def : Pat<(i32 (zextloadi8 (WebAssemblywrapper texternalsym:$off))),
- (LOAD8_U_I32 texternalsym:$off, (CONST_I32 0))>;
+ (LOAD8_U_I32 texternalsym:$off, (CONST_I32 0), 0)>;
def : Pat<(i32 (sextloadi16 (WebAssemblywrapper texternalsym:$off))),
- (LOAD16_S_I32 texternalsym:$off, (CONST_I32 0))>;
+ (LOAD16_S_I32 texternalsym:$off, (CONST_I32 0), 0)>;
def : Pat<(i32 (zextloadi16 (WebAssemblywrapper texternalsym:$off))),
- (LOAD16_U_I32 texternalsym:$off, (CONST_I32 0))>;
+ (LOAD16_U_I32 texternalsym:$off, (CONST_I32 0), 0)>;
def : Pat<(i64 (sextloadi8 (WebAssemblywrapper texternalsym:$off))),
- (LOAD8_S_I64 texternalsym:$off, (CONST_I32 0))>;
+ (LOAD8_S_I64 texternalsym:$off, (CONST_I32 0), 0)>;
def : Pat<(i64 (zextloadi8 (WebAssemblywrapper texternalsym:$off))),
- (LOAD8_U_I64 texternalsym:$off, (CONST_I32 0))>;
+ (LOAD8_U_I64 texternalsym:$off, (CONST_I32 0), 0)>;
def : Pat<(i64 (sextloadi16 (WebAssemblywrapper texternalsym:$off))),
- (LOAD16_S_I64 texternalsym:$off, (CONST_I32 0))>;
+ (LOAD16_S_I64 texternalsym:$off, (CONST_I32 0), 0)>;
def : Pat<(i64 (zextloadi16 (WebAssemblywrapper texternalsym:$off))),
- (LOAD16_U_I64 texternalsym:$off, (CONST_I32 0))>;
+ (LOAD16_U_I64 texternalsym:$off, (CONST_I32 0), 0)>;
def : Pat<(i64 (sextloadi32 (WebAssemblywrapper texternalsym:$off))),
- (LOAD32_S_I64 texternalsym:$off, (CONST_I32 0))>;
+ (LOAD32_S_I64 texternalsym:$off, (CONST_I32 0), 0)>;
def : Pat<(i64 (zextloadi32 (WebAssemblywrapper texternalsym:$off))),
- (LOAD32_U_I64 texternalsym:$off, (CONST_I32 0))>;
+ (LOAD32_U_I64 texternalsym:$off, (CONST_I32 0), 0)>;
// Resolve "don't care" extending loads to zero-extending loads. This is
// somewhat arbitrary, but zero-extending is conceptually simpler.
// Select "don't care" extending loads with no constant offset.
-def : Pat<(i32 (extloadi8 I32:$addr)), (LOAD8_U_I32 0, $addr)>;
-def : Pat<(i32 (extloadi16 I32:$addr)), (LOAD16_U_I32 0, $addr)>;
-def : Pat<(i64 (extloadi8 I32:$addr)), (LOAD8_U_I64 0, $addr)>;
-def : Pat<(i64 (extloadi16 I32:$addr)), (LOAD16_U_I64 0, $addr)>;
-def : Pat<(i64 (extloadi32 I32:$addr)), (LOAD32_U_I64 0, $addr)>;
+def : Pat<(i32 (extloadi8 I32:$addr)), (LOAD8_U_I32 0, $addr, 0)>;
+def : Pat<(i32 (extloadi16 I32:$addr)), (LOAD16_U_I32 0, $addr, 0)>;
+def : Pat<(i64 (extloadi8 I32:$addr)), (LOAD8_U_I64 0, $addr, 0)>;
+def : Pat<(i64 (extloadi16 I32:$addr)), (LOAD16_U_I64 0, $addr, 0)>;
+def : Pat<(i64 (extloadi32 I32:$addr)), (LOAD32_U_I64 0, $addr, 0)>;
// Select "don't care" extending loads with a constant offset.
def : Pat<(i32 (extloadi8 (regPlusImm I32:$addr, imm:$off))),
- (LOAD8_U_I32 imm:$off, $addr)>;
+ (LOAD8_U_I32 imm:$off, $addr, 0)>;
def : Pat<(i32 (extloadi16 (regPlusImm I32:$addr, imm:$off))),
- (LOAD16_U_I32 imm:$off, $addr)>;
+ (LOAD16_U_I32 imm:$off, $addr, 0)>;
def : Pat<(i64 (extloadi8 (regPlusImm I32:$addr, imm:$off))),
- (LOAD8_U_I64 imm:$off, $addr)>;
+ (LOAD8_U_I64 imm:$off, $addr, 0)>;
def : Pat<(i64 (extloadi16 (regPlusImm I32:$addr, imm:$off))),
- (LOAD16_U_I64 imm:$off, $addr)>;
+ (LOAD16_U_I64 imm:$off, $addr, 0)>;
def : Pat<(i64 (extloadi32 (regPlusImm I32:$addr, imm:$off))),
- (LOAD32_U_I64 imm:$off, $addr)>;
+ (LOAD32_U_I64 imm:$off, $addr, 0)>;
+def : Pat<(i32 (extloadi8 (or_is_add I32:$addr, imm:$off))),
+ (LOAD8_U_I32 imm:$off, $addr, 0)>;
+def : Pat<(i32 (extloadi16 (or_is_add I32:$addr, imm:$off))),
+ (LOAD16_U_I32 imm:$off, $addr, 0)>;
+def : Pat<(i64 (extloadi8 (or_is_add I32:$addr, imm:$off))),
+ (LOAD8_U_I64 imm:$off, $addr, 0)>;
+def : Pat<(i64 (extloadi16 (or_is_add I32:$addr, imm:$off))),
+ (LOAD16_U_I64 imm:$off, $addr, 0)>;
+def : Pat<(i64 (extloadi32 (or_is_add I32:$addr, imm:$off))),
+ (LOAD32_U_I64 imm:$off, $addr, 0)>;
def : Pat<(i32 (extloadi8 (regPlusGA I32:$addr,
(WebAssemblywrapper tglobaladdr:$off)))),
- (LOAD8_U_I32 tglobaladdr:$off, $addr)>;
+ (LOAD8_U_I32 tglobaladdr:$off, $addr, 0)>;
def : Pat<(i32 (extloadi16 (regPlusGA I32:$addr,
(WebAssemblywrapper tglobaladdr:$off)))),
- (LOAD16_U_I32 tglobaladdr:$off, $addr)>;
+ (LOAD16_U_I32 tglobaladdr:$off, $addr, 0)>;
def : Pat<(i64 (extloadi8 (regPlusGA I32:$addr,
(WebAssemblywrapper tglobaladdr:$off)))),
- (LOAD8_U_I64 tglobaladdr:$off, $addr)>;
+ (LOAD8_U_I64 tglobaladdr:$off, $addr, 0)>;
def : Pat<(i64 (extloadi16 (regPlusGA I32:$addr,
(WebAssemblywrapper tglobaladdr:$off)))),
- (LOAD16_U_I64 tglobaladdr:$off, $addr)>;
+ (LOAD16_U_I64 tglobaladdr:$off, $addr, 0)>;
def : Pat<(i64 (extloadi32 (regPlusGA I32:$addr,
(WebAssemblywrapper tglobaladdr:$off)))),
- (LOAD32_U_I64 tglobaladdr:$off, $addr)>;
+ (LOAD32_U_I64 tglobaladdr:$off, $addr, 0)>;
def : Pat<(i32 (extloadi8 (add I32:$addr,
(WebAssemblywrapper texternalsym:$off)))),
- (LOAD8_U_I32 texternalsym:$off, $addr)>;
+ (LOAD8_U_I32 texternalsym:$off, $addr, 0)>;
def : Pat<(i32 (extloadi16 (add I32:$addr,
(WebAssemblywrapper texternalsym:$off)))),
- (LOAD16_U_I32 texternalsym:$off, $addr)>;
+ (LOAD16_U_I32 texternalsym:$off, $addr, 0)>;
def : Pat<(i64 (extloadi8 (add I32:$addr,
(WebAssemblywrapper texternalsym:$off)))),
- (LOAD8_U_I64 texternalsym:$off, $addr)>;
+ (LOAD8_U_I64 texternalsym:$off, $addr, 0)>;
def : Pat<(i64 (extloadi16 (add I32:$addr,
(WebAssemblywrapper texternalsym:$off)))),
- (LOAD16_U_I64 texternalsym:$off, $addr)>;
+ (LOAD16_U_I64 texternalsym:$off, $addr, 0)>;
def : Pat<(i64 (extloadi32 (add I32:$addr,
(WebAssemblywrapper texternalsym:$off)))),
- (LOAD32_U_I64 texternalsym:$off, $addr)>;
+ (LOAD32_U_I64 texternalsym:$off, $addr, 0)>;
// Select "don't care" extending loads with just a constant offset.
-def : Pat<(i32 (extloadi8 imm:$off)), (LOAD8_U_I32 imm:$off, (CONST_I32 0))>;
-def : Pat<(i32 (extloadi16 imm:$off)), (LOAD16_U_I32 imm:$off, (CONST_I32 0))>;
-def : Pat<(i64 (extloadi8 imm:$off)), (LOAD8_U_I64 imm:$off, (CONST_I32 0))>;
-def : Pat<(i64 (extloadi16 imm:$off)), (LOAD16_U_I64 imm:$off, (CONST_I32 0))>;
-def : Pat<(i64 (extloadi32 imm:$off)), (LOAD32_U_I64 imm:$off, (CONST_I32 0))>;
+def : Pat<(i32 (extloadi8 imm:$off)),
+ (LOAD8_U_I32 imm:$off, (CONST_I32 0), 0)>;
+def : Pat<(i32 (extloadi16 imm:$off)),
+ (LOAD16_U_I32 imm:$off, (CONST_I32 0), 0)>;
+def : Pat<(i64 (extloadi8 imm:$off)),
+ (LOAD8_U_I64 imm:$off, (CONST_I32 0), 0)>;
+def : Pat<(i64 (extloadi16 imm:$off)),
+ (LOAD16_U_I64 imm:$off, (CONST_I32 0), 0)>;
+def : Pat<(i64 (extloadi32 imm:$off)),
+ (LOAD32_U_I64 imm:$off, (CONST_I32 0), 0)>;
def : Pat<(i32 (extloadi8 (WebAssemblywrapper tglobaladdr:$off))),
- (LOAD8_U_I32 tglobaladdr:$off, (CONST_I32 0))>;
+ (LOAD8_U_I32 tglobaladdr:$off, (CONST_I32 0), 0)>;
def : Pat<(i32 (extloadi16 (WebAssemblywrapper tglobaladdr:$off))),
- (LOAD16_U_I32 tglobaladdr:$off, (CONST_I32 0))>;
+ (LOAD16_U_I32 tglobaladdr:$off, (CONST_I32 0), 0)>;
def : Pat<(i64 (extloadi8 (WebAssemblywrapper tglobaladdr:$off))),
- (LOAD8_U_I64 tglobaladdr:$off, (CONST_I32 0))>;
+ (LOAD8_U_I64 tglobaladdr:$off, (CONST_I32 0), 0)>;
def : Pat<(i64 (extloadi16 (WebAssemblywrapper tglobaladdr:$off))),
- (LOAD16_U_I64 tglobaladdr:$off, (CONST_I32 0))>;
+ (LOAD16_U_I64 tglobaladdr:$off, (CONST_I32 0), 0)>;
def : Pat<(i64 (extloadi32 (WebAssemblywrapper tglobaladdr:$off))),
- (LOAD32_U_I64 tglobaladdr:$off, (CONST_I32 0))>;
+ (LOAD32_U_I64 tglobaladdr:$off, (CONST_I32 0), 0)>;
def : Pat<(i32 (extloadi8 (WebAssemblywrapper texternalsym:$off))),
- (LOAD8_U_I32 texternalsym:$off, (CONST_I32 0))>;
+ (LOAD8_U_I32 texternalsym:$off, (CONST_I32 0), 0)>;
def : Pat<(i32 (extloadi16 (WebAssemblywrapper texternalsym:$off))),
- (LOAD16_U_I32 texternalsym:$off, (CONST_I32 0))>;
+ (LOAD16_U_I32 texternalsym:$off, (CONST_I32 0), 0)>;
def : Pat<(i64 (extloadi8 (WebAssemblywrapper texternalsym:$off))),
- (LOAD8_U_I64 texternalsym:$off, (CONST_I32 0))>;
+ (LOAD8_U_I64 texternalsym:$off, (CONST_I32 0), 0)>;
def : Pat<(i64 (extloadi16 (WebAssemblywrapper texternalsym:$off))),
- (LOAD16_U_I64 texternalsym:$off, (CONST_I32 0))>;
+ (LOAD16_U_I64 texternalsym:$off, (CONST_I32 0), 0)>;
def : Pat<(i64 (extloadi32 (WebAssemblywrapper texternalsym:$off))),
- (LOAD32_U_I64 tglobaladdr:$off, (CONST_I32 0))>;
+          (LOAD32_U_I64 texternalsym:$off, (CONST_I32 0), 0)>;
let Defs = [ARGUMENTS] in {
@@ -374,205 +453,232 @@ let Defs = [ARGUMENTS] in {
// instruction definition patterns that don't reference all of the output
// operands.
// Note: WebAssembly inverts SelectionDAG's usual operand order.
-def STORE_I32 : I<(outs I32:$dst), (ins i32imm:$off, I32:$addr, I32:$val), [],
- "i32.store\t$dst, ${off}(${addr}), $val">;
-def STORE_I64 : I<(outs I64:$dst), (ins i32imm:$off, I32:$addr, I64:$val), [],
- "i64.store\t$dst, ${off}(${addr}), $val">;
-def STORE_F32 : I<(outs F32:$dst), (ins i32imm:$off, I32:$addr, F32:$val), [],
- "f32.store\t$dst, ${off}(${addr}), $val">;
-def STORE_F64 : I<(outs F64:$dst), (ins i32imm:$off, I32:$addr, F64:$val), [],
- "f64.store\t$dst, ${off}(${addr}), $val">;
+def STORE_I32 : I<(outs I32:$dst), (ins i32imm:$off, I32:$addr,
+ P2Align:$p2align, I32:$val), [],
+ "i32.store\t$dst, ${off}(${addr})${p2align}, $val">;
+def STORE_I64 : I<(outs I64:$dst), (ins i32imm:$off, I32:$addr,
+ P2Align:$p2align, I64:$val), [],
+ "i64.store\t$dst, ${off}(${addr})${p2align}, $val">;
+def STORE_F32 : I<(outs F32:$dst), (ins i32imm:$off, I32:$addr,
+ P2Align:$p2align, F32:$val), [],
+ "f32.store\t$dst, ${off}(${addr})${p2align}, $val">;
+def STORE_F64 : I<(outs F64:$dst), (ins i32imm:$off, I32:$addr,
+ P2Align:$p2align, F64:$val), [],
+ "f64.store\t$dst, ${off}(${addr})${p2align}, $val">;
} // Defs = [ARGUMENTS]
// Select stores with no constant offset.
-def : Pat<(store I32:$val, I32:$addr), (STORE_I32 0, I32:$addr, I32:$val)>;
-def : Pat<(store I64:$val, I32:$addr), (STORE_I64 0, I32:$addr, I64:$val)>;
-def : Pat<(store F32:$val, I32:$addr), (STORE_F32 0, I32:$addr, F32:$val)>;
-def : Pat<(store F64:$val, I32:$addr), (STORE_F64 0, I32:$addr, F64:$val)>;
+def : Pat<(store I32:$val, I32:$addr), (STORE_I32 0, I32:$addr, 0, I32:$val)>;
+def : Pat<(store I64:$val, I32:$addr), (STORE_I64 0, I32:$addr, 0, I64:$val)>;
+def : Pat<(store F32:$val, I32:$addr), (STORE_F32 0, I32:$addr, 0, F32:$val)>;
+def : Pat<(store F64:$val, I32:$addr), (STORE_F64 0, I32:$addr, 0, F64:$val)>;
// Select stores with a constant offset.
def : Pat<(store I32:$val, (regPlusImm I32:$addr, imm:$off)),
- (STORE_I32 imm:$off, I32:$addr, I32:$val)>;
+ (STORE_I32 imm:$off, I32:$addr, 0, I32:$val)>;
def : Pat<(store I64:$val, (regPlusImm I32:$addr, imm:$off)),
- (STORE_I64 imm:$off, I32:$addr, I64:$val)>;
+ (STORE_I64 imm:$off, I32:$addr, 0, I64:$val)>;
def : Pat<(store F32:$val, (regPlusImm I32:$addr, imm:$off)),
- (STORE_F32 imm:$off, I32:$addr, F32:$val)>;
+ (STORE_F32 imm:$off, I32:$addr, 0, F32:$val)>;
def : Pat<(store F64:$val, (regPlusImm I32:$addr, imm:$off)),
- (STORE_F64 imm:$off, I32:$addr, F64:$val)>;
+ (STORE_F64 imm:$off, I32:$addr, 0, F64:$val)>;
+def : Pat<(store I32:$val, (or_is_add I32:$addr, imm:$off)),
+ (STORE_I32 imm:$off, I32:$addr, 0, I32:$val)>;
+def : Pat<(store I64:$val, (or_is_add I32:$addr, imm:$off)),
+ (STORE_I64 imm:$off, I32:$addr, 0, I64:$val)>;
+def : Pat<(store F32:$val, (or_is_add I32:$addr, imm:$off)),
+ (STORE_F32 imm:$off, I32:$addr, 0, F32:$val)>;
+def : Pat<(store F64:$val, (or_is_add I32:$addr, imm:$off)),
+ (STORE_F64 imm:$off, I32:$addr, 0, F64:$val)>;
def : Pat<(store I32:$val, (regPlusGA I32:$addr,
(WebAssemblywrapper tglobaladdr:$off))),
- (STORE_I32 tglobaladdr:$off, I32:$addr, I32:$val)>;
+ (STORE_I32 tglobaladdr:$off, I32:$addr, 0, I32:$val)>;
def : Pat<(store I64:$val, (regPlusGA I32:$addr,
(WebAssemblywrapper tglobaladdr:$off))),
- (STORE_I64 tglobaladdr:$off, I32:$addr, I64:$val)>;
+ (STORE_I64 tglobaladdr:$off, I32:$addr, 0, I64:$val)>;
def : Pat<(store F32:$val, (regPlusGA I32:$addr,
(WebAssemblywrapper tglobaladdr:$off))),
- (STORE_F32 tglobaladdr:$off, I32:$addr, F32:$val)>;
+ (STORE_F32 tglobaladdr:$off, I32:$addr, 0, F32:$val)>;
def : Pat<(store F64:$val, (regPlusGA I32:$addr,
(WebAssemblywrapper tglobaladdr:$off))),
- (STORE_F64 tglobaladdr:$off, I32:$addr, F64:$val)>;
+ (STORE_F64 tglobaladdr:$off, I32:$addr, 0, F64:$val)>;
def : Pat<(store I32:$val, (add I32:$addr,
(WebAssemblywrapper texternalsym:$off))),
- (STORE_I32 texternalsym:$off, I32:$addr, I32:$val)>;
+ (STORE_I32 texternalsym:$off, I32:$addr, 0, I32:$val)>;
def : Pat<(store I64:$val, (add I32:$addr,
(WebAssemblywrapper texternalsym:$off))),
- (STORE_I64 texternalsym:$off, I32:$addr, I64:$val)>;
+ (STORE_I64 texternalsym:$off, I32:$addr, 0, I64:$val)>;
def : Pat<(store F32:$val, (add I32:$addr,
(WebAssemblywrapper texternalsym:$off))),
- (STORE_F32 texternalsym:$off, I32:$addr, F32:$val)>;
+ (STORE_F32 texternalsym:$off, I32:$addr, 0, F32:$val)>;
def : Pat<(store F64:$val, (add I32:$addr,
(WebAssemblywrapper texternalsym:$off))),
- (STORE_F64 texternalsym:$off, I32:$addr, F64:$val)>;
+ (STORE_F64 texternalsym:$off, I32:$addr, 0, F64:$val)>;
// Select stores with just a constant offset.
def : Pat<(store I32:$val, imm:$off),
- (STORE_I32 imm:$off, (CONST_I32 0), I32:$val)>;
+ (STORE_I32 imm:$off, (CONST_I32 0), 0, I32:$val)>;
def : Pat<(store I64:$val, imm:$off),
- (STORE_I64 imm:$off, (CONST_I32 0), I64:$val)>;
+ (STORE_I64 imm:$off, (CONST_I32 0), 0, I64:$val)>;
def : Pat<(store F32:$val, imm:$off),
- (STORE_F32 imm:$off, (CONST_I32 0), F32:$val)>;
+ (STORE_F32 imm:$off, (CONST_I32 0), 0, F32:$val)>;
def : Pat<(store F64:$val, imm:$off),
- (STORE_F64 imm:$off, (CONST_I32 0), F64:$val)>;
+ (STORE_F64 imm:$off, (CONST_I32 0), 0, F64:$val)>;
def : Pat<(store I32:$val, (WebAssemblywrapper tglobaladdr:$off)),
- (STORE_I32 tglobaladdr:$off, (CONST_I32 0), I32:$val)>;
+ (STORE_I32 tglobaladdr:$off, (CONST_I32 0), 0, I32:$val)>;
def : Pat<(store I64:$val, (WebAssemblywrapper tglobaladdr:$off)),
- (STORE_I64 tglobaladdr:$off, (CONST_I32 0), I64:$val)>;
+ (STORE_I64 tglobaladdr:$off, (CONST_I32 0), 0, I64:$val)>;
def : Pat<(store F32:$val, (WebAssemblywrapper tglobaladdr:$off)),
- (STORE_F32 tglobaladdr:$off, (CONST_I32 0), F32:$val)>;
+ (STORE_F32 tglobaladdr:$off, (CONST_I32 0), 0, F32:$val)>;
def : Pat<(store F64:$val, (WebAssemblywrapper tglobaladdr:$off)),
- (STORE_F64 tglobaladdr:$off, (CONST_I32 0), F64:$val)>;
+ (STORE_F64 tglobaladdr:$off, (CONST_I32 0), 0, F64:$val)>;
def : Pat<(store I32:$val, (WebAssemblywrapper texternalsym:$off)),
- (STORE_I32 texternalsym:$off, (CONST_I32 0), I32:$val)>;
+ (STORE_I32 texternalsym:$off, (CONST_I32 0), 0, I32:$val)>;
def : Pat<(store I64:$val, (WebAssemblywrapper texternalsym:$off)),
- (STORE_I64 texternalsym:$off, (CONST_I32 0), I64:$val)>;
+ (STORE_I64 texternalsym:$off, (CONST_I32 0), 0, I64:$val)>;
def : Pat<(store F32:$val, (WebAssemblywrapper texternalsym:$off)),
- (STORE_F32 texternalsym:$off, (CONST_I32 0), F32:$val)>;
+ (STORE_F32 texternalsym:$off, (CONST_I32 0), 0, F32:$val)>;
def : Pat<(store F64:$val, (WebAssemblywrapper texternalsym:$off)),
- (STORE_F64 texternalsym:$off, (CONST_I32 0), F64:$val)>;
+ (STORE_F64 texternalsym:$off, (CONST_I32 0), 0, F64:$val)>;
let Defs = [ARGUMENTS] in {
// Truncating store.
-def STORE8_I32 : I<(outs I32:$dst), (ins i32imm:$off, I32:$addr, I32:$val), [],
- "i32.store8\t$dst, ${off}(${addr}), $val">;
-def STORE16_I32 : I<(outs I32:$dst), (ins i32imm:$off, I32:$addr, I32:$val), [],
- "i32.store16\t$dst, ${off}(${addr}), $val">;
-def STORE8_I64 : I<(outs I64:$dst), (ins i32imm:$off, I32:$addr, I64:$val), [],
- "i64.store8\t$dst, ${off}(${addr}), $val">;
-def STORE16_I64 : I<(outs I64:$dst), (ins i32imm:$off, I32:$addr, I64:$val), [],
- "i64.store16\t$dst, ${off}(${addr}), $val">;
-def STORE32_I64 : I<(outs I64:$dst), (ins i32imm:$off, I32:$addr, I64:$val), [],
- "i64.store32\t$dst, ${off}(${addr}), $val">;
+def STORE8_I32 : I<(outs I32:$dst), (ins i32imm:$off, I32:$addr,
+ P2Align:$p2align, I32:$val), [],
+ "i32.store8\t$dst, ${off}(${addr})${p2align}, $val">;
+def STORE16_I32 : I<(outs I32:$dst), (ins i32imm:$off, I32:$addr,
+ P2Align:$p2align, I32:$val), [],
+ "i32.store16\t$dst, ${off}(${addr})${p2align}, $val">;
+def STORE8_I64 : I<(outs I64:$dst), (ins i32imm:$off, I32:$addr,
+ P2Align:$p2align, I64:$val), [],
+ "i64.store8\t$dst, ${off}(${addr})${p2align}, $val">;
+def STORE16_I64 : I<(outs I64:$dst), (ins i32imm:$off, I32:$addr,
+ P2Align:$p2align, I64:$val), [],
+ "i64.store16\t$dst, ${off}(${addr})${p2align}, $val">;
+def STORE32_I64 : I<(outs I64:$dst), (ins i32imm:$off, I32:$addr,
+ P2Align:$p2align, I64:$val), [],
+ "i64.store32\t$dst, ${off}(${addr})${p2align}, $val">;
} // Defs = [ARGUMENTS]
// Select truncating stores with no constant offset.
def : Pat<(truncstorei8 I32:$val, I32:$addr),
- (STORE8_I32 0, I32:$addr, I32:$val)>;
+ (STORE8_I32 0, I32:$addr, 0, I32:$val)>;
def : Pat<(truncstorei16 I32:$val, I32:$addr),
- (STORE16_I32 0, I32:$addr, I32:$val)>;
+ (STORE16_I32 0, I32:$addr, 0, I32:$val)>;
def : Pat<(truncstorei8 I64:$val, I32:$addr),
- (STORE8_I64 0, I32:$addr, I64:$val)>;
+ (STORE8_I64 0, I32:$addr, 0, I64:$val)>;
def : Pat<(truncstorei16 I64:$val, I32:$addr),
- (STORE16_I64 0, I32:$addr, I64:$val)>;
+ (STORE16_I64 0, I32:$addr, 0, I64:$val)>;
def : Pat<(truncstorei32 I64:$val, I32:$addr),
- (STORE32_I64 0, I32:$addr, I64:$val)>;
+ (STORE32_I64 0, I32:$addr, 0, I64:$val)>;
// Select truncating stores with a constant offset.
def : Pat<(truncstorei8 I32:$val, (regPlusImm I32:$addr, imm:$off)),
- (STORE8_I32 imm:$off, I32:$addr, I32:$val)>;
+ (STORE8_I32 imm:$off, I32:$addr, 0, I32:$val)>;
def : Pat<(truncstorei16 I32:$val, (regPlusImm I32:$addr, imm:$off)),
- (STORE16_I32 imm:$off, I32:$addr, I32:$val)>;
+ (STORE16_I32 imm:$off, I32:$addr, 0, I32:$val)>;
def : Pat<(truncstorei8 I64:$val, (regPlusImm I32:$addr, imm:$off)),
- (STORE8_I64 imm:$off, I32:$addr, I64:$val)>;
+ (STORE8_I64 imm:$off, I32:$addr, 0, I64:$val)>;
def : Pat<(truncstorei16 I64:$val, (regPlusImm I32:$addr, imm:$off)),
- (STORE16_I64 imm:$off, I32:$addr, I64:$val)>;
+ (STORE16_I64 imm:$off, I32:$addr, 0, I64:$val)>;
def : Pat<(truncstorei32 I64:$val, (regPlusImm I32:$addr, imm:$off)),
- (STORE32_I64 imm:$off, I32:$addr, I64:$val)>;
+ (STORE32_I64 imm:$off, I32:$addr, 0, I64:$val)>;
+def : Pat<(truncstorei8 I32:$val, (or_is_add I32:$addr, imm:$off)),
+ (STORE8_I32 imm:$off, I32:$addr, 0, I32:$val)>;
+def : Pat<(truncstorei16 I32:$val, (or_is_add I32:$addr, imm:$off)),
+ (STORE16_I32 imm:$off, I32:$addr, 0, I32:$val)>;
+def : Pat<(truncstorei8 I64:$val, (or_is_add I32:$addr, imm:$off)),
+ (STORE8_I64 imm:$off, I32:$addr, 0, I64:$val)>;
+def : Pat<(truncstorei16 I64:$val, (or_is_add I32:$addr, imm:$off)),
+ (STORE16_I64 imm:$off, I32:$addr, 0, I64:$val)>;
+def : Pat<(truncstorei32 I64:$val, (or_is_add I32:$addr, imm:$off)),
+ (STORE32_I64 imm:$off, I32:$addr, 0, I64:$val)>;
def : Pat<(truncstorei8 I32:$val,
(regPlusGA I32:$addr,
(WebAssemblywrapper tglobaladdr:$off))),
- (STORE8_I32 tglobaladdr:$off, I32:$addr, I32:$val)>;
+ (STORE8_I32 tglobaladdr:$off, I32:$addr, 0, I32:$val)>;
def : Pat<(truncstorei16 I32:$val,
(regPlusGA I32:$addr,
(WebAssemblywrapper tglobaladdr:$off))),
- (STORE16_I32 tglobaladdr:$off, I32:$addr, I32:$val)>;
+ (STORE16_I32 tglobaladdr:$off, I32:$addr, 0, I32:$val)>;
def : Pat<(truncstorei8 I64:$val,
(regPlusGA I32:$addr,
(WebAssemblywrapper tglobaladdr:$off))),
- (STORE8_I64 tglobaladdr:$off, I32:$addr, I64:$val)>;
+ (STORE8_I64 tglobaladdr:$off, I32:$addr, 0, I64:$val)>;
def : Pat<(truncstorei16 I64:$val,
(regPlusGA I32:$addr,
(WebAssemblywrapper tglobaladdr:$off))),
- (STORE16_I64 tglobaladdr:$off, I32:$addr, I64:$val)>;
+ (STORE16_I64 tglobaladdr:$off, I32:$addr, 0, I64:$val)>;
def : Pat<(truncstorei32 I64:$val,
(regPlusGA I32:$addr,
(WebAssemblywrapper tglobaladdr:$off))),
- (STORE32_I64 tglobaladdr:$off, I32:$addr, I64:$val)>;
+ (STORE32_I64 tglobaladdr:$off, I32:$addr, 0, I64:$val)>;
def : Pat<(truncstorei8 I32:$val, (add I32:$addr,
(WebAssemblywrapper texternalsym:$off))),
- (STORE8_I32 texternalsym:$off, I32:$addr, I32:$val)>;
+ (STORE8_I32 texternalsym:$off, I32:$addr, 0, I32:$val)>;
def : Pat<(truncstorei16 I32:$val,
(add I32:$addr,
(WebAssemblywrapper texternalsym:$off))),
- (STORE16_I32 texternalsym:$off, I32:$addr, I32:$val)>;
+ (STORE16_I32 texternalsym:$off, I32:$addr, 0, I32:$val)>;
def : Pat<(truncstorei8 I64:$val,
(add I32:$addr,
(WebAssemblywrapper texternalsym:$off))),
- (STORE8_I64 texternalsym:$off, I32:$addr, I64:$val)>;
+ (STORE8_I64 texternalsym:$off, I32:$addr, 0, I64:$val)>;
def : Pat<(truncstorei16 I64:$val,
(add I32:$addr,
(WebAssemblywrapper texternalsym:$off))),
- (STORE16_I64 texternalsym:$off, I32:$addr, I64:$val)>;
+ (STORE16_I64 texternalsym:$off, I32:$addr, 0, I64:$val)>;
def : Pat<(truncstorei32 I64:$val,
(add I32:$addr,
(WebAssemblywrapper texternalsym:$off))),
- (STORE32_I64 texternalsym:$off, I32:$addr, I64:$val)>;
+ (STORE32_I64 texternalsym:$off, I32:$addr, 0, I64:$val)>;
// Select truncating stores with just a constant offset.
def : Pat<(truncstorei8 I32:$val, imm:$off),
- (STORE8_I32 imm:$off, (CONST_I32 0), I32:$val)>;
+ (STORE8_I32 imm:$off, (CONST_I32 0), 0, I32:$val)>;
def : Pat<(truncstorei16 I32:$val, imm:$off),
- (STORE16_I32 imm:$off, (CONST_I32 0), I32:$val)>;
+ (STORE16_I32 imm:$off, (CONST_I32 0), 0, I32:$val)>;
def : Pat<(truncstorei8 I64:$val, imm:$off),
- (STORE8_I64 imm:$off, (CONST_I32 0), I64:$val)>;
+ (STORE8_I64 imm:$off, (CONST_I32 0), 0, I64:$val)>;
def : Pat<(truncstorei16 I64:$val, imm:$off),
- (STORE16_I64 imm:$off, (CONST_I32 0), I64:$val)>;
+ (STORE16_I64 imm:$off, (CONST_I32 0), 0, I64:$val)>;
def : Pat<(truncstorei32 I64:$val, imm:$off),
- (STORE32_I64 imm:$off, (CONST_I32 0), I64:$val)>;
+ (STORE32_I64 imm:$off, (CONST_I32 0), 0, I64:$val)>;
def : Pat<(truncstorei8 I32:$val, (WebAssemblywrapper tglobaladdr:$off)),
- (STORE8_I32 tglobaladdr:$off, (CONST_I32 0), I32:$val)>;
+ (STORE8_I32 tglobaladdr:$off, (CONST_I32 0), 0, I32:$val)>;
def : Pat<(truncstorei16 I32:$val, (WebAssemblywrapper tglobaladdr:$off)),
- (STORE16_I32 tglobaladdr:$off, (CONST_I32 0), I32:$val)>;
+ (STORE16_I32 tglobaladdr:$off, (CONST_I32 0), 0, I32:$val)>;
def : Pat<(truncstorei8 I64:$val, (WebAssemblywrapper tglobaladdr:$off)),
- (STORE8_I64 tglobaladdr:$off, (CONST_I32 0), I64:$val)>;
+ (STORE8_I64 tglobaladdr:$off, (CONST_I32 0), 0, I64:$val)>;
def : Pat<(truncstorei16 I64:$val, (WebAssemblywrapper tglobaladdr:$off)),
- (STORE16_I64 tglobaladdr:$off, (CONST_I32 0), I64:$val)>;
+ (STORE16_I64 tglobaladdr:$off, (CONST_I32 0), 0, I64:$val)>;
def : Pat<(truncstorei32 I64:$val, (WebAssemblywrapper tglobaladdr:$off)),
- (STORE32_I64 tglobaladdr:$off, (CONST_I32 0), I64:$val)>;
+ (STORE32_I64 tglobaladdr:$off, (CONST_I32 0), 0, I64:$val)>;
def : Pat<(truncstorei8 I32:$val, (WebAssemblywrapper texternalsym:$off)),
- (STORE8_I32 texternalsym:$off, (CONST_I32 0), I32:$val)>;
+ (STORE8_I32 texternalsym:$off, (CONST_I32 0), 0, I32:$val)>;
def : Pat<(truncstorei16 I32:$val, (WebAssemblywrapper texternalsym:$off)),
- (STORE16_I32 texternalsym:$off, (CONST_I32 0), I32:$val)>;
+ (STORE16_I32 texternalsym:$off, (CONST_I32 0), 0, I32:$val)>;
def : Pat<(truncstorei8 I64:$val, (WebAssemblywrapper texternalsym:$off)),
- (STORE8_I64 texternalsym:$off, (CONST_I32 0), I64:$val)>;
+ (STORE8_I64 texternalsym:$off, (CONST_I32 0), 0, I64:$val)>;
def : Pat<(truncstorei16 I64:$val, (WebAssemblywrapper texternalsym:$off)),
- (STORE16_I64 texternalsym:$off, (CONST_I32 0), I64:$val)>;
+ (STORE16_I64 texternalsym:$off, (CONST_I32 0), 0, I64:$val)>;
def : Pat<(truncstorei32 I64:$val, (WebAssemblywrapper texternalsym:$off)),
- (STORE32_I64 texternalsym:$off, (CONST_I32 0), I64:$val)>;
+ (STORE32_I64 texternalsym:$off, (CONST_I32 0), 0, I64:$val)>;
let Defs = [ARGUMENTS] in {
-// Memory size.
-def MEMORY_SIZE_I32 : I<(outs I32:$dst), (ins),
- [(set I32:$dst, (int_wasm_memory_size))],
- "memory_size\t$dst">,
- Requires<[HasAddr32]>;
-def MEMORY_SIZE_I64 : I<(outs I64:$dst), (ins),
- [(set I64:$dst, (int_wasm_memory_size))],
- "memory_size\t$dst">,
- Requires<[HasAddr64]>;
+// Current memory size.
+def CURRENT_MEMORY_I32 : I<(outs I32:$dst), (ins),
+ [(set I32:$dst, (int_wasm_current_memory))],
+ "current_memory\t$dst">,
+ Requires<[HasAddr32]>;
+def CURRENT_MEMORY_I64 : I<(outs I64:$dst), (ins),
+ [(set I64:$dst, (int_wasm_current_memory))],
+ "current_memory\t$dst">,
+ Requires<[HasAddr64]>;
// Grow memory.
def GROW_MEMORY_I32 : I<(outs), (ins I32:$delta),
diff --git a/lib/Target/WebAssembly/WebAssemblyLowerBrUnless.cpp b/lib/Target/WebAssembly/WebAssemblyLowerBrUnless.cpp
index b009a4e054cc..af53f3db967b 100644
--- a/lib/Target/WebAssembly/WebAssemblyLowerBrUnless.cpp
+++ b/lib/Target/WebAssembly/WebAssemblyLowerBrUnless.cpp
@@ -16,9 +16,9 @@
//===----------------------------------------------------------------------===//
#include "WebAssembly.h"
+#include "MCTargetDesc/WebAssemblyMCTargetDesc.h"
#include "WebAssemblyMachineFunctionInfo.h"
#include "WebAssemblySubtarget.h"
-#include "MCTargetDesc/WebAssemblyMCTargetDesc.h"
#include "llvm/CodeGen/MachineFunctionPass.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/Support/Debug.h"
@@ -61,12 +61,12 @@ bool WebAssemblyLowerBrUnless::runOnMachineFunction(MachineFunction &MF) {
auto &MRI = MF.getRegInfo();
for (auto &MBB : MF) {
- for (auto MII = MBB.begin(); MII != MBB.end(); ) {
+ for (auto MII = MBB.begin(); MII != MBB.end();) {
MachineInstr *MI = &*MII++;
if (MI->getOpcode() != WebAssembly::BR_UNLESS)
continue;
- unsigned Cond = MI->getOperand(0).getReg();
+ unsigned Cond = MI->getOperand(1).getReg();
bool Inverted = false;
// Attempt to invert the condition in place.
@@ -74,7 +74,7 @@ bool WebAssemblyLowerBrUnless::runOnMachineFunction(MachineFunction &MF) {
assert(MRI.hasOneDef(Cond));
MachineInstr *Def = MRI.getVRegDef(Cond);
switch (Def->getOpcode()) {
- using namespace WebAssembly;
+ using namespace WebAssembly;
case EQ_I32: Def->setDesc(TII.get(NE_I32)); Inverted = true; break;
case NE_I32: Def->setDesc(TII.get(EQ_I32)); Inverted = true; break;
case GT_S_I32: Def->setDesc(TII.get(LE_S_I32)); Inverted = true; break;
@@ -106,15 +106,10 @@ bool WebAssemblyLowerBrUnless::runOnMachineFunction(MachineFunction &MF) {
    // If we weren't able to invert the condition in place, insert an
// expression to invert it.
if (!Inverted) {
- unsigned ZeroReg = MRI.createVirtualRegister(&WebAssembly::I32RegClass);
- MFI.stackifyVReg(ZeroReg);
- BuildMI(MBB, MI, MI->getDebugLoc(), TII.get(WebAssembly::CONST_I32), ZeroReg)
- .addImm(0);
unsigned Tmp = MRI.createVirtualRegister(&WebAssembly::I32RegClass);
MFI.stackifyVReg(Tmp);
- BuildMI(MBB, MI, MI->getDebugLoc(), TII.get(WebAssembly::EQ_I32), Tmp)
- .addReg(Cond)
- .addReg(ZeroReg);
+ BuildMI(MBB, MI, MI->getDebugLoc(), TII.get(WebAssembly::EQZ_I32), Tmp)
+ .addReg(Cond);
Cond = Tmp;
Inverted = true;
}
@@ -123,8 +118,8 @@ bool WebAssemblyLowerBrUnless::runOnMachineFunction(MachineFunction &MF) {
// delete the br_unless.
assert(Inverted);
BuildMI(MBB, MI, MI->getDebugLoc(), TII.get(WebAssembly::BR_IF))
- .addReg(Cond)
- .addOperand(MI->getOperand(1));
+ .addOperand(MI->getOperand(0))
+ .addReg(Cond);
MBB.erase(MI);
}
}
diff --git a/lib/Target/WebAssembly/WebAssemblyMachineFunctionInfo.h b/lib/Target/WebAssembly/WebAssemblyMachineFunctionInfo.h
index 6a60280900a9..89f607d84b71 100644
--- a/lib/Target/WebAssembly/WebAssemblyMachineFunctionInfo.h
+++ b/lib/Target/WebAssembly/WebAssemblyMachineFunctionInfo.h
@@ -39,18 +39,24 @@ class WebAssemblyFunctionInfo final : public MachineFunctionInfo {
/// - defined and used in LIFO order with other stack registers
BitVector VRegStackified;
- // One entry for each possible target reg. we expect it to be small.
- std::vector<unsigned> PhysRegs;
+ // A virtual register holding the pointer to the vararg buffer for vararg
+ // functions. It is created and set in TLI::LowerFormalArguments and read by
+  // TLI::LowerVASTART.
+ unsigned VarargVreg = -1U;
-public:
- explicit WebAssemblyFunctionInfo(MachineFunction &MF) : MF(MF) {
- PhysRegs.resize(WebAssembly::NUM_TARGET_REGS, -1U);
- }
+ public:
+ explicit WebAssemblyFunctionInfo(MachineFunction &MF) : MF(MF) {}
~WebAssemblyFunctionInfo() override;
void addParam(MVT VT) { Params.push_back(VT); }
const std::vector<MVT> &getParams() const { return Params; }
+ unsigned getVarargBufferVreg() const {
+ assert(VarargVreg != -1U && "Vararg vreg hasn't been set");
+ return VarargVreg;
+ }
+ void setVarargBufferVreg(unsigned Reg) { VarargVreg = Reg; }
+
static const unsigned UnusedReg = -1u;
void stackifyVReg(unsigned VReg) {
@@ -71,25 +77,15 @@ public:
WARegs[TargetRegisterInfo::virtReg2Index(VReg)] = WAReg;
}
unsigned getWAReg(unsigned Reg) const {
- if (TargetRegisterInfo::isVirtualRegister(Reg)) {
- assert(TargetRegisterInfo::virtReg2Index(Reg) < WARegs.size());
- return WARegs[TargetRegisterInfo::virtReg2Index(Reg)];
- }
- return PhysRegs[Reg];
- }
- // If new virtual registers are created after initWARegs has been called,
- // this function can be used to add WebAssembly register mappings for them.
- void addWAReg(unsigned VReg, unsigned WAReg) {
- assert(VReg = WARegs.size());
- WARegs.push_back(WAReg);
+ assert(TargetRegisterInfo::virtReg2Index(Reg) < WARegs.size());
+ return WARegs[TargetRegisterInfo::virtReg2Index(Reg)];
}
- void addPReg(unsigned PReg, unsigned WAReg) {
- assert(PReg < WebAssembly::NUM_TARGET_REGS);
- assert(WAReg < -1U);
- PhysRegs[PReg] = WAReg;
+ // For a given stackified WAReg, return the id number to print with push/pop.
+ static unsigned getWARegStackId(unsigned Reg) {
+ assert(Reg & INT32_MIN);
+ return Reg & INT32_MAX;
}
- const std::vector<unsigned> &getPhysRegs() const { return PhysRegs; }
};
} // end namespace llvm
diff --git a/lib/Target/WebAssembly/WebAssemblyOptimizeLiveIntervals.cpp b/lib/Target/WebAssembly/WebAssemblyOptimizeLiveIntervals.cpp
new file mode 100644
index 000000000000..473de7ddae7e
--- /dev/null
+++ b/lib/Target/WebAssembly/WebAssemblyOptimizeLiveIntervals.cpp
@@ -0,0 +1,105 @@
+//===--- WebAssemblyOptimizeLiveIntervals.cpp - LiveInterval processing ---===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// \brief Optimize LiveIntervals for use in a post-RA context.
+///
+/// LiveIntervals normally runs before register allocation when the code is
+/// only recently lowered out of SSA form, so it's uncommon for registers to
+/// have multiple defs, and when they do, the defs are usually closely related.
+/// Later, after coalescing, tail duplication, and other optimizations, it's
+/// more common to see registers with multiple unrelated defs. This pass
+/// updates LiveIntervalAnalysis to distribute the value numbers across separate
+/// LiveIntervals.
+///
+//===----------------------------------------------------------------------===//
+
+#include "WebAssembly.h"
+#include "WebAssemblySubtarget.h"
+#include "llvm/CodeGen/LiveIntervalAnalysis.h"
+#include "llvm/CodeGen/MachineBlockFrequencyInfo.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/Passes.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/raw_ostream.h"
+using namespace llvm;
+
+#define DEBUG_TYPE "wasm-optimize-live-intervals"
+
+namespace {
+class WebAssemblyOptimizeLiveIntervals final : public MachineFunctionPass {
+ const char *getPassName() const override {
+ return "WebAssembly Optimize Live Intervals";
+ }
+
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
+ AU.setPreservesCFG();
+ AU.addRequired<LiveIntervals>();
+ AU.addPreserved<MachineBlockFrequencyInfo>();
+ AU.addPreserved<SlotIndexes>();
+ AU.addPreserved<LiveIntervals>();
+ AU.addPreservedID(LiveVariablesID);
+ AU.addPreservedID(MachineDominatorsID);
+ MachineFunctionPass::getAnalysisUsage(AU);
+ }
+
+ bool runOnMachineFunction(MachineFunction &MF) override;
+
+public:
+ static char ID; // Pass identification, replacement for typeid
+ WebAssemblyOptimizeLiveIntervals() : MachineFunctionPass(ID) {}
+};
+} // end anonymous namespace
+
+char WebAssemblyOptimizeLiveIntervals::ID = 0;
+FunctionPass *llvm::createWebAssemblyOptimizeLiveIntervals() {
+ return new WebAssemblyOptimizeLiveIntervals();
+}
+
+bool WebAssemblyOptimizeLiveIntervals::runOnMachineFunction(MachineFunction &MF) {
+ DEBUG(dbgs() << "********** Optimize LiveIntervals **********\n"
+ "********** Function: "
+ << MF.getName() << '\n');
+
+ MachineRegisterInfo &MRI = MF.getRegInfo();
+ LiveIntervals &LIS = getAnalysis<LiveIntervals>();
+
+ // We don't preserve SSA form.
+ MRI.leaveSSA();
+
+ assert(MRI.tracksLiveness() &&
+ "OptimizeLiveIntervals expects liveness");
+
+ // Split multiple-VN LiveIntervals into multiple LiveIntervals.
+ SmallVector<LiveInterval*, 4> SplitLIs;
+ for (unsigned i = 0, e = MRI.getNumVirtRegs(); i < e; ++i) {
+ unsigned Reg = TargetRegisterInfo::index2VirtReg(i);
+ if (MRI.reg_nodbg_empty(Reg))
+ continue;
+
+ LIS.splitSeparateComponents(LIS.getInterval(Reg), SplitLIs);
+ SplitLIs.clear();
+ }
+
+ // In PrepareForLiveIntervals, we conservatively inserted IMPLICIT_DEF
+ // instructions to satisfy LiveIntervals' requirement that all uses be
+ // dominated by defs. Now that LiveIntervals has computed which of these
+ // defs are actually needed and which are dead, remove the dead ones.
+ for (auto MII = MF.begin()->begin(), MIE = MF.begin()->end(); MII != MIE; ) {
+ MachineInstr *MI = &*MII++;
+ if (MI->isImplicitDef() && MI->getOperand(0).isDead()) {
+ LiveInterval &LI = LIS.getInterval(MI->getOperand(0).getReg());
+ LIS.removeVRegDefAt(LI, LIS.getInstructionIndex(*MI).getRegSlot());
+ LIS.RemoveMachineInstrFromMaps(*MI);
+ MI->eraseFromParent();
+ }
+ }
+
+ return false;
+}
diff --git a/lib/Target/WebAssembly/WebAssemblyPEI.cpp b/lib/Target/WebAssembly/WebAssemblyPEI.cpp
deleted file mode 100644
index d570d4266110..000000000000
--- a/lib/Target/WebAssembly/WebAssemblyPEI.cpp
+++ /dev/null
@@ -1,1066 +0,0 @@
-//===-- WebAssemblyPEI.cpp - Insert Prolog/Epilog code in function --===//
-//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
-//
-//===----------------------------------------------------------------------===//
-//
-// This pass is responsible for finalizing the functions frame layout, saving
-// callee saved registers, and for emitting prolog & epilog code for the
-// function.
-//
-// This pass must be run after register allocation. After this pass is
-// executed, it is illegal to construct MO_FrameIndex operands.
-//
-// This is a copy of lib/CodeGen/PrologEpilogInserter.cpp except that it does
-// not assert that all virtual registers are gone (because WebAssembly currently
-// uses virtual rather than physical registers), and only runs
-// MRI.clearVirtRegs() if scavenging happened (which it never does). It also
-// uses a different class name so it can be registered via INITIALIZE_PASS.
-// It is otherwise unmodified, so any changes to the target-independent PEI
-// can be easily applied.
-//===----------------------------------------------------------------------===//
-
-#include "llvm/ADT/IndexedMap.h"
-#include "llvm/ADT/STLExtras.h"
-#include "llvm/ADT/SetVector.h"
-#include "llvm/ADT/SmallSet.h"
-#include "llvm/ADT/Statistic.h"
-#include "llvm/CodeGen/MachineDominators.h"
-#include "llvm/CodeGen/MachineFrameInfo.h"
-#include "llvm/CodeGen/MachineInstr.h"
-#include "llvm/CodeGen/MachineLoopInfo.h"
-#include "llvm/CodeGen/MachineModuleInfo.h"
-#include "llvm/CodeGen/MachineRegisterInfo.h"
-#include "llvm/CodeGen/Passes.h"
-#include "llvm/CodeGen/RegisterScavenging.h"
-#include "llvm/CodeGen/StackProtector.h"
-#include "llvm/CodeGen/WinEHFuncInfo.h"
-#include "llvm/IR/DiagnosticInfo.h"
-#include "llvm/IR/InlineAsm.h"
-#include "llvm/IR/LLVMContext.h"
-#include "llvm/Support/CommandLine.h"
-#include "llvm/Support/Compiler.h"
-#include "llvm/Support/Debug.h"
-#include "llvm/Support/raw_ostream.h"
-#include "llvm/Target/TargetFrameLowering.h"
-#include "llvm/Target/TargetInstrInfo.h"
-#include "llvm/Target/TargetMachine.h"
-#include "llvm/Target/TargetRegisterInfo.h"
-#include "llvm/Target/TargetSubtargetInfo.h"
-#include <climits>
-
-using namespace llvm;
-
-#define DEBUG_TYPE "pei"
-namespace llvm {
-void initializeWasmPEIPass(PassRegistry&);
-}
-namespace {
-class WasmPEI : public MachineFunctionPass {
-public:
- static char ID;
- WasmPEI() : MachineFunctionPass(ID) {
- initializeWasmPEIPass(*PassRegistry::getPassRegistry());
- }
-
- void getAnalysisUsage(AnalysisUsage &AU) const override;
-
- /// runOnMachineFunction - Insert prolog/epilog code and replace abstract
- /// frame indexes with appropriate references.
- ///
- bool runOnMachineFunction(MachineFunction &Fn) override;
-
-private:
- RegScavenger *RS;
-
- // MinCSFrameIndex, MaxCSFrameIndex - Keeps the range of callee saved
- // stack frame indexes.
- unsigned MinCSFrameIndex, MaxCSFrameIndex;
-
- // Save and Restore blocks of the current function. Typically there is a
- // single save block, unless Windows EH funclets are involved.
- SmallVector<MachineBasicBlock *, 1> SaveBlocks;
- SmallVector<MachineBasicBlock *, 4> RestoreBlocks;
-
- // Flag to control whether to use the register scavenger to resolve
- // frame index materialization registers. Set according to
- // TRI->requiresFrameIndexScavenging() for the current function.
- bool FrameIndexVirtualScavenging;
-
- void calculateSets(MachineFunction &Fn);
- void calculateCallsInformation(MachineFunction &Fn);
- void assignCalleeSavedSpillSlots(MachineFunction &Fn,
- const BitVector &SavedRegs);
- void insertCSRSpillsAndRestores(MachineFunction &Fn);
- void calculateFrameObjectOffsets(MachineFunction &Fn);
- void replaceFrameIndices(MachineFunction &Fn);
- void replaceFrameIndices(MachineBasicBlock *BB, MachineFunction &Fn,
- int &SPAdj);
- void scavengeFrameVirtualRegs(MachineFunction &Fn);
- void insertPrologEpilogCode(MachineFunction &Fn);
-};
-} // namespace
-
-char WasmPEI::ID = 0;
-
-namespace llvm {
-FunctionPass *createWebAssemblyPEI() {
- return new WasmPEI();
-}
-}
-
-static cl::opt<unsigned>
-WarnStackSize("wasm-warn-stack-size", cl::Hidden, cl::init((unsigned)-1),
- cl::desc("Warn for stack size bigger than the given"
- " number"));
-
-INITIALIZE_PASS_BEGIN(WasmPEI, "wasmprologepilog",
- "Wasm Prologue/Epilogue Insertion", false, false)
-INITIALIZE_PASS_DEPENDENCY(MachineLoopInfo)
-INITIALIZE_PASS_DEPENDENCY(MachineDominatorTree)
-INITIALIZE_PASS_DEPENDENCY(StackProtector)
-INITIALIZE_PASS_DEPENDENCY(TargetPassConfig)
-INITIALIZE_PASS_END(WasmPEI, "wasmprologepilog",
- "Wasm Prologue/Epilogue Insertion & Frame Finalization",
- false, false)
-
-STATISTIC(NumScavengedRegs, "Number of frame index regs scavenged");
-STATISTIC(NumBytesStackSpace,
- "Number of bytes used for stack in all functions");
-
-void WasmPEI::getAnalysisUsage(AnalysisUsage &AU) const {
- AU.setPreservesCFG();
- AU.addPreserved<MachineLoopInfo>();
- AU.addPreserved<MachineDominatorTree>();
- AU.addRequired<StackProtector>();
- AU.addRequired<TargetPassConfig>();
- MachineFunctionPass::getAnalysisUsage(AU);
-}
-
-/// Compute the set of return blocks
-void WasmPEI::calculateSets(MachineFunction &Fn) {
- const MachineFrameInfo *MFI = Fn.getFrameInfo();
-
- // Even when we do not change any CSR, we still want to insert the
- // prologue and epilogue of the function.
- // So set the save points for those.
-
- // Use the points found by shrink-wrapping, if any.
- if (MFI->getSavePoint()) {
- SaveBlocks.push_back(MFI->getSavePoint());
- assert(MFI->getRestorePoint() && "Both restore and save must be set");
- MachineBasicBlock *RestoreBlock = MFI->getRestorePoint();
- // If RestoreBlock does not have any successor and is not a return block
- // then the end point is unreachable and we do not need to insert any
- // epilogue.
- if (!RestoreBlock->succ_empty() || RestoreBlock->isReturnBlock())
- RestoreBlocks.push_back(RestoreBlock);
- return;
- }
-
- // Save refs to entry and return blocks.
- SaveBlocks.push_back(&Fn.front());
- for (MachineBasicBlock &MBB : Fn) {
- if (MBB.isEHFuncletEntry())
- SaveBlocks.push_back(&MBB);
- if (MBB.isReturnBlock())
- RestoreBlocks.push_back(&MBB);
- }
-}
-
-/// StackObjSet - A set of stack object indexes
-typedef SmallSetVector<int, 8> StackObjSet;
-
-/// runOnMachineFunction - Insert prolog/epilog code and replace abstract
-/// frame indexes with appropriate references.
-///
-bool WasmPEI::runOnMachineFunction(MachineFunction &Fn) {
- const Function* F = Fn.getFunction();
- const TargetRegisterInfo *TRI = Fn.getSubtarget().getRegisterInfo();
- const TargetFrameLowering *TFI = Fn.getSubtarget().getFrameLowering();
-
- // LOCALMOD: assert removed from target-independent PEI
- //assert(!Fn.getRegInfo().getNumVirtRegs() && "Regalloc must assign all vregs");
-
- RS = TRI->requiresRegisterScavenging(Fn) ? new RegScavenger() : nullptr;
- FrameIndexVirtualScavenging = TRI->requiresFrameIndexScavenging(Fn);
-
- // Calculate the MaxCallFrameSize and AdjustsStack variables for the
- // function's frame information. Also eliminates call frame pseudo
- // instructions.
- calculateCallsInformation(Fn);
-
- // Determine which of the registers in the callee save list should be saved.
- BitVector SavedRegs;
- TFI->determineCalleeSaves(Fn, SavedRegs, RS);
-
- // Insert spill code for any callee saved registers that are modified.
- assignCalleeSavedSpillSlots(Fn, SavedRegs);
-
- // Determine placement of CSR spill/restore code:
- // place all spills in the entry block, all restores in return blocks.
- calculateSets(Fn);
-
- // Add the code to save and restore the callee saved registers.
- if (!F->hasFnAttribute(Attribute::Naked))
- insertCSRSpillsAndRestores(Fn);
-
- // Allow the target machine to make final modifications to the function
- // before the frame layout is finalized.
- TFI->processFunctionBeforeFrameFinalized(Fn, RS);
-
- // Calculate actual frame offsets for all abstract stack objects...
- calculateFrameObjectOffsets(Fn);
-
- // Add prolog and epilog code to the function. This function is required
- // to align the stack frame as necessary for any stack variables or
- // called functions. Because of this, calculateCalleeSavedRegisters()
- // must be called before this function in order to set the AdjustsStack
- // and MaxCallFrameSize variables.
- if (!F->hasFnAttribute(Attribute::Naked))
- insertPrologEpilogCode(Fn);
-
- // Replace all MO_FrameIndex operands with physical register references
- // and actual offsets.
- //
- replaceFrameIndices(Fn);
-
- // If register scavenging is needed, as we've enabled doing it as a
- // post-pass, scavenge the virtual registers that frame index elimination
- // inserted.
- if (TRI->requiresRegisterScavenging(Fn) && FrameIndexVirtualScavenging) {
- scavengeFrameVirtualRegs(Fn);
- // Clear any vregs created by virtual scavenging.
- // LOCALMOD: made this call conditional with scavengeFrameVirtualregs()
- Fn.getRegInfo().clearVirtRegs();
- }
-
- // Warn on stack size when we exceeds the given limit.
- MachineFrameInfo *MFI = Fn.getFrameInfo();
- uint64_t StackSize = MFI->getStackSize();
- if (WarnStackSize.getNumOccurrences() > 0 && WarnStackSize < StackSize) {
- DiagnosticInfoStackSize DiagStackSize(*F, StackSize);
- F->getContext().diagnose(DiagStackSize);
- }
-
- delete RS;
- SaveBlocks.clear();
- RestoreBlocks.clear();
- return true;
-}
-
-/// calculateCallsInformation - Calculate the MaxCallFrameSize and AdjustsStack
-/// variables for the function's frame information and eliminate call frame
-/// pseudo instructions.
-void WasmPEI::calculateCallsInformation(MachineFunction &Fn) {
- const TargetInstrInfo &TII = *Fn.getSubtarget().getInstrInfo();
- const TargetFrameLowering *TFI = Fn.getSubtarget().getFrameLowering();
- MachineFrameInfo *MFI = Fn.getFrameInfo();
-
- unsigned MaxCallFrameSize = 0;
- bool AdjustsStack = MFI->adjustsStack();
-
- // Get the function call frame set-up and tear-down instruction opcode
- unsigned FrameSetupOpcode = TII.getCallFrameSetupOpcode();
- unsigned FrameDestroyOpcode = TII.getCallFrameDestroyOpcode();
-
- // Early exit for targets which have no call frame setup/destroy pseudo
- // instructions.
- if (FrameSetupOpcode == ~0u && FrameDestroyOpcode == ~0u)
- return;
-
- std::vector<MachineBasicBlock::iterator> FrameSDOps;
- for (MachineFunction::iterator BB = Fn.begin(), E = Fn.end(); BB != E; ++BB)
- for (MachineBasicBlock::iterator I = BB->begin(); I != BB->end(); ++I)
- if (I->getOpcode() == FrameSetupOpcode ||
- I->getOpcode() == FrameDestroyOpcode) {
- assert(I->getNumOperands() >= 1 && "Call Frame Setup/Destroy Pseudo"
- " instructions should have a single immediate argument!");
- unsigned Size = I->getOperand(0).getImm();
- if (Size > MaxCallFrameSize) MaxCallFrameSize = Size;
- AdjustsStack = true;
- FrameSDOps.push_back(I);
- } else if (I->isInlineAsm()) {
- // Some inline asm's need a stack frame, as indicated by operand 1.
- unsigned ExtraInfo = I->getOperand(InlineAsm::MIOp_ExtraInfo).getImm();
- if (ExtraInfo & InlineAsm::Extra_IsAlignStack)
- AdjustsStack = true;
- }
-
- MFI->setAdjustsStack(AdjustsStack);
- MFI->setMaxCallFrameSize(MaxCallFrameSize);
-
- for (std::vector<MachineBasicBlock::iterator>::iterator
- i = FrameSDOps.begin(), e = FrameSDOps.end(); i != e; ++i) {
- MachineBasicBlock::iterator I = *i;
-
- // If call frames are not being included as part of the stack frame, and
- // the target doesn't indicate otherwise, remove the call frame pseudos
- // here. The sub/add sp instruction pairs are still inserted, but we don't
- // need to track the SP adjustment for frame index elimination.
- if (TFI->canSimplifyCallFramePseudos(Fn))
- TFI->eliminateCallFramePseudoInstr(Fn, *I->getParent(), I);
- }
-}
-
-void WasmPEI::assignCalleeSavedSpillSlots(MachineFunction &F,
- const BitVector &SavedRegs) {
- // These are used to keep track the callee-save area. Initialize them.
- MinCSFrameIndex = INT_MAX;
- MaxCSFrameIndex = 0;
-
- if (SavedRegs.empty())
- return;
-
- const TargetRegisterInfo *RegInfo = F.getSubtarget().getRegisterInfo();
- const MCPhysReg *CSRegs = RegInfo->getCalleeSavedRegs(&F);
-
- std::vector<CalleeSavedInfo> CSI;
- for (unsigned i = 0; CSRegs[i]; ++i) {
- unsigned Reg = CSRegs[i];
- if (SavedRegs.test(Reg))
- CSI.push_back(CalleeSavedInfo(Reg));
- }
-
- const TargetFrameLowering *TFI = F.getSubtarget().getFrameLowering();
- MachineFrameInfo *MFI = F.getFrameInfo();
- if (!TFI->assignCalleeSavedSpillSlots(F, RegInfo, CSI)) {
- // If target doesn't implement this, use generic code.
-
- if (CSI.empty())
- return; // Early exit if no callee saved registers are modified!
-
- unsigned NumFixedSpillSlots;
- const TargetFrameLowering::SpillSlot *FixedSpillSlots =
- TFI->getCalleeSavedSpillSlots(NumFixedSpillSlots);
-
- // Now that we know which registers need to be saved and restored, allocate
- // stack slots for them.
- for (std::vector<CalleeSavedInfo>::iterator I = CSI.begin(), E = CSI.end();
- I != E; ++I) {
- unsigned Reg = I->getReg();
- const TargetRegisterClass *RC = RegInfo->getMinimalPhysRegClass(Reg);
-
- int FrameIdx;
- if (RegInfo->hasReservedSpillSlot(F, Reg, FrameIdx)) {
- I->setFrameIdx(FrameIdx);
- continue;
- }
-
- // Check to see if this physreg must be spilled to a particular stack slot
- // on this target.
- const TargetFrameLowering::SpillSlot *FixedSlot = FixedSpillSlots;
- while (FixedSlot != FixedSpillSlots + NumFixedSpillSlots &&
- FixedSlot->Reg != Reg)
- ++FixedSlot;
-
- if (FixedSlot == FixedSpillSlots + NumFixedSpillSlots) {
- // Nope, just spill it anywhere convenient.
- unsigned Align = RC->getAlignment();
- unsigned StackAlign = TFI->getStackAlignment();
-
- // We may not be able to satisfy the desired alignment specification of
- // the TargetRegisterClass if the stack alignment is smaller. Use the
- // min.
- Align = std::min(Align, StackAlign);
- FrameIdx = MFI->CreateStackObject(RC->getSize(), Align, true);
- if ((unsigned)FrameIdx < MinCSFrameIndex) MinCSFrameIndex = FrameIdx;
- if ((unsigned)FrameIdx > MaxCSFrameIndex) MaxCSFrameIndex = FrameIdx;
- } else {
- // Spill it to the stack where we must.
- FrameIdx =
- MFI->CreateFixedSpillStackObject(RC->getSize(), FixedSlot->Offset);
- }
-
- I->setFrameIdx(FrameIdx);
- }
- }
-
- MFI->setCalleeSavedInfo(CSI);
-}
-
-/// Helper function to update the liveness information for the callee-saved
-/// registers.
-static void updateLiveness(MachineFunction &MF) {
- MachineFrameInfo *MFI = MF.getFrameInfo();
- // Visited will contain all the basic blocks that are in the region
- // where the callee saved registers are alive:
- // - Anything that is not Save or Restore -> LiveThrough.
- // - Save -> LiveIn.
- // - Restore -> LiveOut.
- // The live-out is not attached to the block, so no need to keep
- // Restore in this set.
- SmallPtrSet<MachineBasicBlock *, 8> Visited;
- SmallVector<MachineBasicBlock *, 8> WorkList;
- MachineBasicBlock *Entry = &MF.front();
- MachineBasicBlock *Save = MFI->getSavePoint();
-
- if (!Save)
- Save = Entry;
-
- if (Entry != Save) {
- WorkList.push_back(Entry);
- Visited.insert(Entry);
- }
- Visited.insert(Save);
-
- MachineBasicBlock *Restore = MFI->getRestorePoint();
- if (Restore)
- // By construction Restore cannot be visited, otherwise it
- // means there exists a path to Restore that does not go
- // through Save.
- WorkList.push_back(Restore);
-
- while (!WorkList.empty()) {
- const MachineBasicBlock *CurBB = WorkList.pop_back_val();
- // By construction, the region that is after the save point is
- // dominated by the Save and post-dominated by the Restore.
- if (CurBB == Save && Save != Restore)
- continue;
- // Enqueue all the successors not already visited.
- // Those are by construction either before Save or after Restore.
- for (MachineBasicBlock *SuccBB : CurBB->successors())
- if (Visited.insert(SuccBB).second)
- WorkList.push_back(SuccBB);
- }
-
- const std::vector<CalleeSavedInfo> &CSI = MFI->getCalleeSavedInfo();
-
- for (unsigned i = 0, e = CSI.size(); i != e; ++i) {
- for (MachineBasicBlock *MBB : Visited) {
- MCPhysReg Reg = CSI[i].getReg();
- // Add the callee-saved register as live-in.
- // It's killed at the spill.
- if (!MBB->isLiveIn(Reg))
- MBB->addLiveIn(Reg);
- }
- }
-}
-
-/// insertCSRSpillsAndRestores - Insert spill and restore code for
-/// callee saved registers used in the function.
-///
-void WasmPEI::insertCSRSpillsAndRestores(MachineFunction &Fn) {
- // Get callee saved register information.
- MachineFrameInfo *MFI = Fn.getFrameInfo();
- const std::vector<CalleeSavedInfo> &CSI = MFI->getCalleeSavedInfo();
-
- MFI->setCalleeSavedInfoValid(true);
-
- // Early exit if no callee saved registers are modified!
- if (CSI.empty())
- return;
-
- const TargetInstrInfo &TII = *Fn.getSubtarget().getInstrInfo();
- const TargetFrameLowering *TFI = Fn.getSubtarget().getFrameLowering();
- const TargetRegisterInfo *TRI = Fn.getSubtarget().getRegisterInfo();
- MachineBasicBlock::iterator I;
-
- // Spill using target interface.
- for (MachineBasicBlock *SaveBlock : SaveBlocks) {
- I = SaveBlock->begin();
- if (!TFI->spillCalleeSavedRegisters(*SaveBlock, I, CSI, TRI)) {
- for (unsigned i = 0, e = CSI.size(); i != e; ++i) {
- // Insert the spill to the stack frame.
- unsigned Reg = CSI[i].getReg();
- const TargetRegisterClass *RC = TRI->getMinimalPhysRegClass(Reg);
- TII.storeRegToStackSlot(*SaveBlock, I, Reg, true, CSI[i].getFrameIdx(),
- RC, TRI);
- }
- }
- // Update the live-in information of all the blocks up to the save point.
- updateLiveness(Fn);
- }
-
- // Restore using target interface.
- for (MachineBasicBlock *MBB : RestoreBlocks) {
- I = MBB->end();
-
- // Skip over all terminator instructions, which are part of the return
- // sequence.
- MachineBasicBlock::iterator I2 = I;
- while (I2 != MBB->begin() && (--I2)->isTerminator())
- I = I2;
-
- bool AtStart = I == MBB->begin();
- MachineBasicBlock::iterator BeforeI = I;
- if (!AtStart)
- --BeforeI;
-
- // Restore all registers immediately before the return and any
- // terminators that precede it.
- if (!TFI->restoreCalleeSavedRegisters(*MBB, I, CSI, TRI)) {
- for (unsigned i = 0, e = CSI.size(); i != e; ++i) {
- unsigned Reg = CSI[i].getReg();
- const TargetRegisterClass *RC = TRI->getMinimalPhysRegClass(Reg);
- TII.loadRegFromStackSlot(*MBB, I, Reg, CSI[i].getFrameIdx(), RC, TRI);
- assert(I != MBB->begin() &&
- "loadRegFromStackSlot didn't insert any code!");
- // Insert in reverse order. loadRegFromStackSlot can insert
- // multiple instructions.
- if (AtStart)
- I = MBB->begin();
- else {
- I = BeforeI;
- ++I;
- }
- }
- }
- }
-}
-
-/// AdjustStackOffset - Helper function used to adjust the stack frame offset.
-static inline void
-AdjustStackOffset(MachineFrameInfo *MFI, int FrameIdx,
- bool StackGrowsDown, int64_t &Offset,
- unsigned &MaxAlign, unsigned Skew) {
- // If the stack grows down, add the object size to find the lowest address.
- if (StackGrowsDown)
- Offset += MFI->getObjectSize(FrameIdx);
-
- unsigned Align = MFI->getObjectAlignment(FrameIdx);
-
- // If the alignment of this object is greater than that of the stack, then
- // increase the stack alignment to match.
- MaxAlign = std::max(MaxAlign, Align);
-
- // Adjust to alignment boundary.
- Offset = RoundUpToAlignment(Offset, Align, Skew);
-
- if (StackGrowsDown) {
- DEBUG(dbgs() << "alloc FI(" << FrameIdx << ") at SP[" << -Offset << "]\n");
- MFI->setObjectOffset(FrameIdx, -Offset); // Set the computed offset
- } else {
- DEBUG(dbgs() << "alloc FI(" << FrameIdx << ") at SP[" << Offset << "]\n");
- MFI->setObjectOffset(FrameIdx, Offset);
- Offset += MFI->getObjectSize(FrameIdx);
- }
-}
-
-/// AssignProtectedObjSet - Helper function to assign large stack objects (i.e.,
-/// those required to be close to the Stack Protector) to stack offsets.
-static void
-AssignProtectedObjSet(const StackObjSet &UnassignedObjs,
- SmallSet<int, 16> &ProtectedObjs,
- MachineFrameInfo *MFI, bool StackGrowsDown,
- int64_t &Offset, unsigned &MaxAlign, unsigned Skew) {
-
- for (StackObjSet::const_iterator I = UnassignedObjs.begin(),
- E = UnassignedObjs.end(); I != E; ++I) {
- int i = *I;
- AdjustStackOffset(MFI, i, StackGrowsDown, Offset, MaxAlign, Skew);
- ProtectedObjs.insert(i);
- }
-}
-
-/// calculateFrameObjectOffsets - Calculate actual frame offsets for all of the
-/// abstract stack objects.
-///
-void WasmPEI::calculateFrameObjectOffsets(MachineFunction &Fn) {
- const TargetFrameLowering &TFI = *Fn.getSubtarget().getFrameLowering();
- StackProtector *SP = &getAnalysis<StackProtector>();
-
- bool StackGrowsDown =
- TFI.getStackGrowthDirection() == TargetFrameLowering::StackGrowsDown;
-
- // Loop over all of the stack objects, assigning sequential addresses...
- MachineFrameInfo *MFI = Fn.getFrameInfo();
-
- // Start at the beginning of the local area.
- // The Offset is the distance from the stack top in the direction
- // of stack growth -- so it's always nonnegative.
- int LocalAreaOffset = TFI.getOffsetOfLocalArea();
- if (StackGrowsDown)
- LocalAreaOffset = -LocalAreaOffset;
- assert(LocalAreaOffset >= 0
- && "Local area offset should be in direction of stack growth");
- int64_t Offset = LocalAreaOffset;
-
- // Skew to be applied to alignment.
- unsigned Skew = TFI.getStackAlignmentSkew(Fn);
-
- // If there are fixed sized objects that are preallocated in the local area,
- // non-fixed objects can't be allocated right at the start of local area.
- // We currently don't support filling in holes in between fixed sized
- // objects, so we adjust 'Offset' to point to the end of last fixed sized
- // preallocated object.
- for (int i = MFI->getObjectIndexBegin(); i != 0; ++i) {
- int64_t FixedOff;
- if (StackGrowsDown) {
- // The maximum distance from the stack pointer is at lower address of
- // the object -- which is given by offset. For down growing stack
- // the offset is negative, so we negate the offset to get the distance.
- FixedOff = -MFI->getObjectOffset(i);
- } else {
- // The maximum distance from the start pointer is at the upper
- // address of the object.
- FixedOff = MFI->getObjectOffset(i) + MFI->getObjectSize(i);
- }
- if (FixedOff > Offset) Offset = FixedOff;
- }
-
- // First assign frame offsets to stack objects that are used to spill
- // callee saved registers.
- if (StackGrowsDown) {
- for (unsigned i = MinCSFrameIndex; i <= MaxCSFrameIndex; ++i) {
- // If the stack grows down, we need to add the size to find the lowest
- // address of the object.
- Offset += MFI->getObjectSize(i);
-
- unsigned Align = MFI->getObjectAlignment(i);
- // Adjust to alignment boundary
- Offset = RoundUpToAlignment(Offset, Align, Skew);
-
- MFI->setObjectOffset(i, -Offset); // Set the computed offset
- }
- } else {
- int MaxCSFI = MaxCSFrameIndex, MinCSFI = MinCSFrameIndex;
- for (int i = MaxCSFI; i >= MinCSFI ; --i) {
- unsigned Align = MFI->getObjectAlignment(i);
- // Adjust to alignment boundary
- Offset = RoundUpToAlignment(Offset, Align, Skew);
-
- MFI->setObjectOffset(i, Offset);
- Offset += MFI->getObjectSize(i);
- }
- }
-
- unsigned MaxAlign = MFI->getMaxAlignment();
-
- // Make sure the special register scavenging spill slot is closest to the
- // incoming stack pointer if a frame pointer is required and is closer
- // to the incoming rather than the final stack pointer.
- const TargetRegisterInfo *RegInfo = Fn.getSubtarget().getRegisterInfo();
- bool EarlyScavengingSlots = (TFI.hasFP(Fn) &&
- TFI.isFPCloseToIncomingSP() &&
- RegInfo->useFPForScavengingIndex(Fn) &&
- !RegInfo->needsStackRealignment(Fn));
- if (RS && EarlyScavengingSlots) {
- SmallVector<int, 2> SFIs;
- RS->getScavengingFrameIndices(SFIs);
- for (SmallVectorImpl<int>::iterator I = SFIs.begin(),
- IE = SFIs.end(); I != IE; ++I)
- AdjustStackOffset(MFI, *I, StackGrowsDown, Offset, MaxAlign, Skew);
- }
-
- // FIXME: Once this is working, then enable flag will change to a target
- // check for whether the frame is large enough to want to use virtual
- // frame index registers. Functions which don't want/need this optimization
- // will continue to use the existing code path.
- if (MFI->getUseLocalStackAllocationBlock()) {
- unsigned Align = MFI->getLocalFrameMaxAlign();
-
- // Adjust to alignment boundary.
- Offset = RoundUpToAlignment(Offset, Align, Skew);
-
- DEBUG(dbgs() << "Local frame base offset: " << Offset << "\n");
-
- // Resolve offsets for objects in the local block.
- for (unsigned i = 0, e = MFI->getLocalFrameObjectCount(); i != e; ++i) {
- std::pair<int, int64_t> Entry = MFI->getLocalFrameObjectMap(i);
- int64_t FIOffset = (StackGrowsDown ? -Offset : Offset) + Entry.second;
- DEBUG(dbgs() << "alloc FI(" << Entry.first << ") at SP[" <<
- FIOffset << "]\n");
- MFI->setObjectOffset(Entry.first, FIOffset);
- }
- // Allocate the local block
- Offset += MFI->getLocalFrameSize();
-
- MaxAlign = std::max(Align, MaxAlign);
- }
-
- // Make sure that the stack protector comes before the local variables on the
- // stack.
- SmallSet<int, 16> ProtectedObjs;
- if (MFI->getStackProtectorIndex() >= 0) {
- StackObjSet LargeArrayObjs;
- StackObjSet SmallArrayObjs;
- StackObjSet AddrOfObjs;
-
- AdjustStackOffset(MFI, MFI->getStackProtectorIndex(), StackGrowsDown,
- Offset, MaxAlign, Skew);
-
- // Assign large stack objects first.
- for (unsigned i = 0, e = MFI->getObjectIndexEnd(); i != e; ++i) {
- if (MFI->isObjectPreAllocated(i) &&
- MFI->getUseLocalStackAllocationBlock())
- continue;
- if (i >= MinCSFrameIndex && i <= MaxCSFrameIndex)
- continue;
- if (RS && RS->isScavengingFrameIndex((int)i))
- continue;
- if (MFI->isDeadObjectIndex(i))
- continue;
- if (MFI->getStackProtectorIndex() == (int)i)
- continue;
-
- switch (SP->getSSPLayout(MFI->getObjectAllocation(i))) {
- case StackProtector::SSPLK_None:
- continue;
- case StackProtector::SSPLK_SmallArray:
- SmallArrayObjs.insert(i);
- continue;
- case StackProtector::SSPLK_AddrOf:
- AddrOfObjs.insert(i);
- continue;
- case StackProtector::SSPLK_LargeArray:
- LargeArrayObjs.insert(i);
- continue;
- }
- llvm_unreachable("Unexpected SSPLayoutKind.");
- }
-
- AssignProtectedObjSet(LargeArrayObjs, ProtectedObjs, MFI, StackGrowsDown,
- Offset, MaxAlign, Skew);
- AssignProtectedObjSet(SmallArrayObjs, ProtectedObjs, MFI, StackGrowsDown,
- Offset, MaxAlign, Skew);
- AssignProtectedObjSet(AddrOfObjs, ProtectedObjs, MFI, StackGrowsDown,
- Offset, MaxAlign, Skew);
- }
-
- // Then assign frame offsets to stack objects that are not used to spill
- // callee saved registers.
- for (unsigned i = 0, e = MFI->getObjectIndexEnd(); i != e; ++i) {
- if (MFI->isObjectPreAllocated(i) &&
- MFI->getUseLocalStackAllocationBlock())
- continue;
- if (i >= MinCSFrameIndex && i <= MaxCSFrameIndex)
- continue;
- if (RS && RS->isScavengingFrameIndex((int)i))
- continue;
- if (MFI->isDeadObjectIndex(i))
- continue;
- if (MFI->getStackProtectorIndex() == (int)i)
- continue;
- if (ProtectedObjs.count(i))
- continue;
-
- AdjustStackOffset(MFI, i, StackGrowsDown, Offset, MaxAlign, Skew);
- }
-
- // Make sure the special register scavenging spill slot is closest to the
- // stack pointer.
- if (RS && !EarlyScavengingSlots) {
- SmallVector<int, 2> SFIs;
- RS->getScavengingFrameIndices(SFIs);
- for (SmallVectorImpl<int>::iterator I = SFIs.begin(),
- IE = SFIs.end(); I != IE; ++I)
- AdjustStackOffset(MFI, *I, StackGrowsDown, Offset, MaxAlign, Skew);
- }
-
- if (!TFI.targetHandlesStackFrameRounding()) {
- // If we have reserved argument space for call sites in the function
- // immediately on entry to the current function, count it as part of the
- // overall stack size.
- if (MFI->adjustsStack() && TFI.hasReservedCallFrame(Fn))
- Offset += MFI->getMaxCallFrameSize();
-
- // Round up the size to a multiple of the alignment. If the function has
- // any calls or alloca's, align to the target's StackAlignment value to
- // ensure that the callee's frame or the alloca data is suitably aligned;
- // otherwise, for leaf functions, align to the TransientStackAlignment
- // value.
- unsigned StackAlign;
- if (MFI->adjustsStack() || MFI->hasVarSizedObjects() ||
- (RegInfo->needsStackRealignment(Fn) && MFI->getObjectIndexEnd() != 0))
- StackAlign = TFI.getStackAlignment();
- else
- StackAlign = TFI.getTransientStackAlignment();
-
- // If the frame pointer is eliminated, all frame offsets will be relative to
- // SP not FP. Align to MaxAlign so this works.
- StackAlign = std::max(StackAlign, MaxAlign);
- Offset = RoundUpToAlignment(Offset, StackAlign, Skew);
- }
-
- // Update frame info to pretend that this is part of the stack...
- int64_t StackSize = Offset - LocalAreaOffset;
- MFI->setStackSize(StackSize);
- NumBytesStackSpace += StackSize;
-}
-
-/// insertPrologEpilogCode - Scan the function for modified callee saved
-/// registers, insert spill code for these callee saved registers, then add
-/// prolog and epilog code to the function.
-///
-void WasmPEI::insertPrologEpilogCode(MachineFunction &Fn) {
- const TargetFrameLowering &TFI = *Fn.getSubtarget().getFrameLowering();
-
- // Add prologue to the function...
- for (MachineBasicBlock *SaveBlock : SaveBlocks)
- TFI.emitPrologue(Fn, *SaveBlock);
-
- // Add epilogue to restore the callee-save registers in each exiting block.
- for (MachineBasicBlock *RestoreBlock : RestoreBlocks)
- TFI.emitEpilogue(Fn, *RestoreBlock);
-
- for (MachineBasicBlock *SaveBlock : SaveBlocks)
- TFI.inlineStackProbe(Fn, *SaveBlock);
-
- // Emit additional code that is required to support segmented stacks, if
- // we've been asked for it. This, when linked with a runtime with support
- // for segmented stacks (libgcc is one), will result in allocating stack
- // space in small chunks instead of one large contiguous block.
- if (Fn.shouldSplitStack()) {
- for (MachineBasicBlock *SaveBlock : SaveBlocks)
- TFI.adjustForSegmentedStacks(Fn, *SaveBlock);
- }
-
- // Emit additional code that is required to explicitly handle the stack in
- // HiPE native code (if needed) when loaded in the Erlang/OTP runtime. The
- // approach is rather similar to that of Segmented Stacks, but it uses a
- // different conditional check and another BIF for allocating more stack
- // space.
- if (Fn.getFunction()->getCallingConv() == CallingConv::HiPE)
- for (MachineBasicBlock *SaveBlock : SaveBlocks)
- TFI.adjustForHiPEPrologue(Fn, *SaveBlock);
-}
-
-/// replaceFrameIndices - Replace all MO_FrameIndex operands with physical
-/// register references and actual offsets.
-///
-void WasmPEI::replaceFrameIndices(MachineFunction &Fn) {
- const TargetFrameLowering &TFI = *Fn.getSubtarget().getFrameLowering();
- if (!TFI.needsFrameIndexResolution(Fn)) return;
-
- // Store SPAdj at exit of a basic block.
- SmallVector<int, 8> SPState;
- SPState.resize(Fn.getNumBlockIDs());
- SmallPtrSet<MachineBasicBlock*, 8> Reachable;
-
- // Iterate over the reachable blocks in DFS order.
- for (auto DFI = df_ext_begin(&Fn, Reachable), DFE = df_ext_end(&Fn, Reachable);
- DFI != DFE; ++DFI) {
- int SPAdj = 0;
- // Check the exit state of the DFS stack predecessor.
- if (DFI.getPathLength() >= 2) {
- MachineBasicBlock *StackPred = DFI.getPath(DFI.getPathLength() - 2);
- assert(Reachable.count(StackPred) &&
- "DFS stack predecessor is already visited.\n");
- SPAdj = SPState[StackPred->getNumber()];
- }
- MachineBasicBlock *BB = *DFI;
- replaceFrameIndices(BB, Fn, SPAdj);
- SPState[BB->getNumber()] = SPAdj;
- }
-
- // Handle the unreachable blocks.
- for (auto &BB : Fn) {
- if (Reachable.count(&BB))
- // Already handled in DFS traversal.
- continue;
- int SPAdj = 0;
- replaceFrameIndices(&BB, Fn, SPAdj);
- }
-}
-
-void WasmPEI::replaceFrameIndices(MachineBasicBlock *BB, MachineFunction &Fn,
- int &SPAdj) {
- assert(Fn.getSubtarget().getRegisterInfo() &&
- "getRegisterInfo() must be implemented!");
- const TargetInstrInfo &TII = *Fn.getSubtarget().getInstrInfo();
- const TargetRegisterInfo &TRI = *Fn.getSubtarget().getRegisterInfo();
- const TargetFrameLowering *TFI = Fn.getSubtarget().getFrameLowering();
- unsigned FrameSetupOpcode = TII.getCallFrameSetupOpcode();
- unsigned FrameDestroyOpcode = TII.getCallFrameDestroyOpcode();
-
- if (RS && !FrameIndexVirtualScavenging) RS->enterBasicBlock(BB);
-
- bool InsideCallSequence = false;
-
- for (MachineBasicBlock::iterator I = BB->begin(); I != BB->end(); ) {
-
- if (I->getOpcode() == FrameSetupOpcode ||
- I->getOpcode() == FrameDestroyOpcode) {
- InsideCallSequence = (I->getOpcode() == FrameSetupOpcode);
- SPAdj += TII.getSPAdjust(I);
-
- MachineBasicBlock::iterator PrevI = BB->end();
- if (I != BB->begin()) PrevI = std::prev(I);
- TFI->eliminateCallFramePseudoInstr(Fn, *BB, I);
-
- // Visit the instructions created by eliminateCallFramePseudoInstr().
- if (PrevI == BB->end())
- I = BB->begin(); // The replaced instr was the first in the block.
- else
- I = std::next(PrevI);
- continue;
- }
-
- MachineInstr *MI = I;
- bool DoIncr = true;
- for (unsigned i = 0, e = MI->getNumOperands(); i != e; ++i) {
- if (!MI->getOperand(i).isFI())
- continue;
-
- // Frame indices in debug values are encoded in a target independent
- // way with simply the frame index and offset rather than any
- // target-specific addressing mode.
- if (MI->isDebugValue()) {
- assert(i == 0 && "Frame indices can only appear as the first "
- "operand of a DBG_VALUE machine instruction");
- unsigned Reg;
- MachineOperand &Offset = MI->getOperand(1);
- Offset.setImm(Offset.getImm() +
- TFI->getFrameIndexReference(
- Fn, MI->getOperand(0).getIndex(), Reg));
- MI->getOperand(0).ChangeToRegister(Reg, false /*isDef*/);
- continue;
- }
-
- // TODO: This code should be commoned with the code for
- // PATCHPOINT. There's no good reason for the difference in
- // implementation other than historical accident. The only
- // remaining difference is the unconditional use of the stack
- // pointer as the base register.
- if (MI->getOpcode() == TargetOpcode::STATEPOINT) {
- assert((!MI->isDebugValue() || i == 0) &&
-             "Frame indices can only appear as the first operand of a "
- "DBG_VALUE machine instruction");
- unsigned Reg;
- MachineOperand &Offset = MI->getOperand(i + 1);
- const unsigned refOffset =
- TFI->getFrameIndexReferenceFromSP(Fn, MI->getOperand(i).getIndex(),
- Reg);
-
- Offset.setImm(Offset.getImm() + refOffset);
- MI->getOperand(i).ChangeToRegister(Reg, false /*isDef*/);
- continue;
- }
-
- // Some instructions (e.g. inline asm instructions) can have
- // multiple frame indices and/or cause eliminateFrameIndex
- // to insert more than one instruction. We need the register
- // scavenger to go through all of these instructions so that
- // it can update its register information. We keep the
- // iterator at the point before insertion so that we can
- // revisit them in full.
- bool AtBeginning = (I == BB->begin());
- if (!AtBeginning) --I;
-
- // If this instruction has a FrameIndex operand, we need to
- // use that target machine register info object to eliminate
- // it.
- TRI.eliminateFrameIndex(MI, SPAdj, i,
- FrameIndexVirtualScavenging ? nullptr : RS);
-
- // Reset the iterator if we were at the beginning of the BB.
- if (AtBeginning) {
- I = BB->begin();
- DoIncr = false;
- }
-
- MI = nullptr;
- break;
- }
-
- // If we are looking at a call sequence, we need to keep track of
- // the SP adjustment made by each instruction in the sequence.
- // This includes both the frame setup/destroy pseudos (handled above),
- // as well as other instructions that have side effects w.r.t the SP.
- // Note that this must come after eliminateFrameIndex, because
- // if I itself referred to a frame index, we shouldn't count its own
- // adjustment.
- if (MI && InsideCallSequence)
- SPAdj += TII.getSPAdjust(MI);
-
- if (DoIncr && I != BB->end()) ++I;
-
- // Update register states.
- if (RS && !FrameIndexVirtualScavenging && MI) RS->forward(MI);
- }
-}
-
-/// scavengeFrameVirtualRegs - Replace all frame index virtual registers
-/// with physical registers. Use the register scavenger to find an
-/// appropriate register to use.
-///
-/// FIXME: Iterating over the instruction stream is unnecessary. We can simply
-/// iterate over the vreg use list, which at this point only contains machine
-/// operands for which eliminateFrameIndex needs a new scratch reg.
-void
-WasmPEI::scavengeFrameVirtualRegs(MachineFunction &Fn) {
- // Run through the instructions and find any virtual registers.
- for (MachineFunction::iterator BB = Fn.begin(),
- E = Fn.end(); BB != E; ++BB) {
- RS->enterBasicBlock(&*BB);
-
- int SPAdj = 0;
-
- // The instruction stream may change in the loop, so check BB->end()
- // directly.
- for (MachineBasicBlock::iterator I = BB->begin(); I != BB->end(); ) {
- // We might end up here again with a NULL iterator if we scavenged a
- // register for which we inserted spill code for definition by what was
- // originally the first instruction in BB.
- if (I == MachineBasicBlock::iterator(nullptr))
- I = BB->begin();
-
- MachineInstr *MI = I;
- MachineBasicBlock::iterator J = std::next(I);
- MachineBasicBlock::iterator P =
- I == BB->begin() ? MachineBasicBlock::iterator(nullptr)
- : std::prev(I);
-
- // RS should process this instruction before we might scavenge at this
- // location. This is because we might be replacing a virtual register
- // defined by this instruction, and if so, registers killed by this
- // instruction are available, and defined registers are not.
- RS->forward(I);
-
- for (unsigned i = 0, e = MI->getNumOperands(); i != e; ++i) {
- if (MI->getOperand(i).isReg()) {
- MachineOperand &MO = MI->getOperand(i);
- unsigned Reg = MO.getReg();
- if (Reg == 0)
- continue;
- if (!TargetRegisterInfo::isVirtualRegister(Reg))
- continue;
-
- // When we first encounter a new virtual register, it
- // must be a definition.
- assert(MI->getOperand(i).isDef() &&
- "frame index virtual missing def!");
- // Scavenge a new scratch register
- const TargetRegisterClass *RC = Fn.getRegInfo().getRegClass(Reg);
- unsigned ScratchReg = RS->scavengeRegister(RC, J, SPAdj);
-
- ++NumScavengedRegs;
-
- // Replace this reference to the virtual register with the
- // scratch register.
- assert (ScratchReg && "Missing scratch register!");
- Fn.getRegInfo().replaceRegWith(Reg, ScratchReg);
-
- // Because this instruction was processed by the RS before this
- // register was allocated, make sure that the RS now records the
- // register as being used.
- RS->setRegUsed(ScratchReg);
- }
- }
-
- // If the scavenger needed to use one of its spill slots, the
- // spill code will have been inserted in between I and J. This is a
- // problem because we need the spill code before I: Move I to just
- // prior to J.
- if (I != std::prev(J)) {
- BB->splice(J, &*BB, I);
-
- // Before we move I, we need to prepare the RS to visit I again.
- // Specifically, RS will assert if it sees uses of registers that
- // it believes are undefined. Because we have already processed
- // register kills in I, when it visits I again, it will believe that
- // those registers are undefined. To avoid this situation, unprocess
- // the instruction I.
- assert(RS->getCurrentPosition() == I &&
- "The register scavenger has an unexpected position");
- I = P;
- RS->unprocess(P);
- } else
- ++I;
- }
- }
-}
diff --git a/lib/Target/WebAssembly/WebAssemblyPeephole.cpp b/lib/Target/WebAssembly/WebAssemblyPeephole.cpp
index 4ad6eed7385b..56d44e6466eb 100644
--- a/lib/Target/WebAssembly/WebAssemblyPeephole.cpp
+++ b/lib/Target/WebAssembly/WebAssemblyPeephole.cpp
@@ -12,14 +12,23 @@
///
//===----------------------------------------------------------------------===//
-#include "WebAssembly.h"
#include "MCTargetDesc/WebAssemblyMCTargetDesc.h"
+#include "WebAssembly.h"
#include "WebAssemblyMachineFunctionInfo.h"
+#include "WebAssemblySubtarget.h"
+#include "llvm/Analysis/TargetLibraryInfo.h"
#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
using namespace llvm;
#define DEBUG_TYPE "wasm-peephole"
+static cl::opt<bool> DisableWebAssemblyFallthroughReturnOpt(
+ "disable-wasm-fallthrough-return-opt", cl::Hidden,
+ cl::desc("WebAssembly: Disable fallthrough-return optimizations."),
+ cl::init(false));
+
namespace {
class WebAssemblyPeephole final : public MachineFunctionPass {
const char *getPassName() const override {
@@ -28,6 +37,7 @@ class WebAssemblyPeephole final : public MachineFunctionPass {
void getAnalysisUsage(AnalysisUsage &AU) const override {
AU.setPreservesCFG();
+ AU.addRequired<TargetLibraryInfoWrapperPass>();
MachineFunctionPass::getAnalysisUsage(AU);
}
@@ -44,11 +54,65 @@ FunctionPass *llvm::createWebAssemblyPeephole() {
return new WebAssemblyPeephole();
}
-bool WebAssemblyPeephole::runOnMachineFunction(MachineFunction &MF) {
+/// If desirable, rewrite NewReg to a drop register.
+static bool MaybeRewriteToDrop(unsigned OldReg, unsigned NewReg,
+ MachineOperand &MO, WebAssemblyFunctionInfo &MFI,
+ MachineRegisterInfo &MRI) {
bool Changed = false;
+ if (OldReg == NewReg) {
+ Changed = true;
+ unsigned NewReg = MRI.createVirtualRegister(MRI.getRegClass(OldReg));
+ MO.setReg(NewReg);
+ MO.setIsDead();
+ MFI.stackifyVReg(NewReg);
+ }
+ return Changed;
+}
+
+static bool MaybeRewriteToFallthrough(MachineInstr &MI, MachineBasicBlock &MBB,
+ const MachineFunction &MF,
+ WebAssemblyFunctionInfo &MFI,
+ MachineRegisterInfo &MRI,
+ const WebAssemblyInstrInfo &TII,
+ unsigned FallthroughOpc,
+ unsigned CopyLocalOpc) {
+ if (DisableWebAssemblyFallthroughReturnOpt)
+ return false;
+ if (&MBB != &MF.back())
+ return false;
+ if (&MI != &MBB.back())
+ return false;
+
+ // If the operand isn't stackified, insert a COPY_LOCAL to read the operand
+ // and stackify it.
+ MachineOperand &MO = MI.getOperand(0);
+ unsigned Reg = MO.getReg();
+ if (!MFI.isVRegStackified(Reg)) {
+ unsigned NewReg = MRI.createVirtualRegister(MRI.getRegClass(Reg));
+ BuildMI(MBB, MI, MI.getDebugLoc(), TII.get(CopyLocalOpc), NewReg)
+ .addReg(Reg);
+ MO.setReg(NewReg);
+ MFI.stackifyVReg(NewReg);
+ }
+
+ // Rewrite the return.
+ MI.setDesc(TII.get(FallthroughOpc));
+ return true;
+}
+
+bool WebAssemblyPeephole::runOnMachineFunction(MachineFunction &MF) {
+ DEBUG({
+ dbgs() << "********** Peephole **********\n"
+ << "********** Function: " << MF.getName() << '\n';
+ });
MachineRegisterInfo &MRI = MF.getRegInfo();
WebAssemblyFunctionInfo &MFI = *MF.getInfo<WebAssemblyFunctionInfo>();
+ const auto &TII = *MF.getSubtarget<WebAssemblySubtarget>().getInstrInfo();
+ const WebAssemblyTargetLowering &TLI =
+ *MF.getSubtarget<WebAssemblySubtarget>().getTargetLowering();
+ auto &LibInfo = getAnalysis<TargetLibraryInfoWrapperPass>().getTLI();
+ bool Changed = false;
for (auto &MBB : MF)
for (auto &MI : MBB)
@@ -66,20 +130,67 @@ bool WebAssemblyPeephole::runOnMachineFunction(MachineFunction &MF) {
case WebAssembly::STORE_I64: {
// Store instructions return their value operand. If we ended up using
// the same register for both, replace it with a dead def so that it
- // can use $discard instead.
+ // can use $drop instead.
MachineOperand &MO = MI.getOperand(0);
unsigned OldReg = MO.getReg();
- // TODO: Handle SP/physregs
- if (OldReg == MI.getOperand(3).getReg()
- && TargetRegisterInfo::isVirtualRegister(MI.getOperand(3).getReg())) {
- Changed = true;
- unsigned NewReg = MRI.createVirtualRegister(MRI.getRegClass(OldReg));
- MO.setReg(NewReg);
- MO.setIsDead();
- MFI.stackifyVReg(NewReg);
- MFI.addWAReg(NewReg, WebAssemblyFunctionInfo::UnusedReg);
+ unsigned NewReg =
+ MI.getOperand(WebAssembly::StoreValueOperandNo).getReg();
+ Changed |= MaybeRewriteToDrop(OldReg, NewReg, MO, MFI, MRI);
+ break;
+ }
+ case WebAssembly::CALL_I32:
+ case WebAssembly::CALL_I64: {
+ MachineOperand &Op1 = MI.getOperand(1);
+ if (Op1.isSymbol()) {
+ StringRef Name(Op1.getSymbolName());
+ if (Name == TLI.getLibcallName(RTLIB::MEMCPY) ||
+ Name == TLI.getLibcallName(RTLIB::MEMMOVE) ||
+ Name == TLI.getLibcallName(RTLIB::MEMSET)) {
+ LibFunc::Func Func;
+ if (LibInfo.getLibFunc(Name, Func)) {
+ const auto &Op2 = MI.getOperand(2);
+ if (!Op2.isReg())
+ report_fatal_error("Peephole: call to builtin function with "
+ "wrong signature, not consuming reg");
+ MachineOperand &MO = MI.getOperand(0);
+ unsigned OldReg = MO.getReg();
+ unsigned NewReg = Op2.getReg();
+
+ if (MRI.getRegClass(NewReg) != MRI.getRegClass(OldReg))
+ report_fatal_error("Peephole: call to builtin function with "
+ "wrong signature, from/to mismatch");
+ Changed |= MaybeRewriteToDrop(OldReg, NewReg, MO, MFI, MRI);
+ }
+ }
}
+ break;
}
+ // Optimize away an explicit void return at the end of the function.
+ case WebAssembly::RETURN_I32:
+ Changed |= MaybeRewriteToFallthrough(
+ MI, MBB, MF, MFI, MRI, TII, WebAssembly::FALLTHROUGH_RETURN_I32,
+ WebAssembly::COPY_LOCAL_I32);
+ break;
+ case WebAssembly::RETURN_I64:
+ Changed |= MaybeRewriteToFallthrough(
+ MI, MBB, MF, MFI, MRI, TII, WebAssembly::FALLTHROUGH_RETURN_I64,
+ WebAssembly::COPY_LOCAL_I64);
+ break;
+ case WebAssembly::RETURN_F32:
+ Changed |= MaybeRewriteToFallthrough(
+ MI, MBB, MF, MFI, MRI, TII, WebAssembly::FALLTHROUGH_RETURN_F32,
+ WebAssembly::COPY_LOCAL_F32);
+ break;
+ case WebAssembly::RETURN_F64:
+ Changed |= MaybeRewriteToFallthrough(
+ MI, MBB, MF, MFI, MRI, TII, WebAssembly::FALLTHROUGH_RETURN_F64,
+ WebAssembly::COPY_LOCAL_F64);
+ break;
+ case WebAssembly::RETURN_VOID:
+ if (!DisableWebAssemblyFallthroughReturnOpt &&
+ &MBB == &MF.back() && &MI == &MBB.back())
+ MI.setDesc(TII.get(WebAssembly::FALLTHROUGH_RETURN_VOID));
+ break;
}
return Changed;
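
The two helpers above are easiest to read as a pair of local rewrites: a store whose result register equals its value operand gets a fresh dead register so it can be printed as a drop, and a return that is the very last instruction of the function becomes a fallthrough return. A rough standalone sketch of that decision logic follows; it is a toy model with invented Instr/Block/Func types and helper names, not code from this patch.

// Toy model (invented stand-ins, not LLVM types) of the MaybeRewriteToDrop
// and MaybeRewriteToFallthrough checks above.
#include <cassert>
#include <string>
#include <vector>

struct Instr { std::string Op; int DefReg; int ValReg; };
struct Block { std::vector<Instr> Instrs; };
struct Func  { std::vector<Block> Blocks; };

// A store's result aliases its value operand: give the result a fresh,
// otherwise-unused register so it can be emitted as a drop.
bool maybeRewriteToDrop(Instr &Store, int &NextVReg) {
  if (Store.DefReg != Store.ValReg)
    return false;
  Store.DefReg = NextVReg++;
  return true;
}

// Only the last instruction of the last block may become a fallthrough return.
bool maybeRewriteToFallthrough(const Func &F, const Block &B, Instr &I) {
  if (&B != &F.Blocks.back() || &I != &B.Instrs.back())
    return false;
  I.Op = "fallthrough_return";
  return true;
}

int main() {
  Func F;
  F.Blocks.push_back(Block{{Instr{"store", 5, 5}, Instr{"return", -1, -1}}});
  int NextVReg = 100;
  assert(maybeRewriteToDrop(F.Blocks[0].Instrs[0], NextVReg));
  assert(F.Blocks[0].Instrs[0].DefReg == 100);
  assert(maybeRewriteToFallthrough(F, F.Blocks[0], F.Blocks[0].Instrs[1]));
  assert(F.Blocks[0].Instrs[1].Op == "fallthrough_return");
  return 0;
}
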
diff --git a/lib/Target/WebAssembly/WebAssemblyPrepareForLiveIntervals.cpp b/lib/Target/WebAssembly/WebAssemblyPrepareForLiveIntervals.cpp
new file mode 100644
index 000000000000..30444ac598a4
--- /dev/null
+++ b/lib/Target/WebAssembly/WebAssemblyPrepareForLiveIntervals.cpp
@@ -0,0 +1,136 @@
+//===- WebAssemblyPrepareForLiveIntervals.cpp - Prepare for LiveIntervals -===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// \brief Fix up code to meet LiveInterval's requirements.
+///
+/// Some CodeGen passes don't preserve LiveInterval's requirements, because
+/// they run after register allocation and it isn't important. However,
+/// WebAssembly runs LiveIntervals in a late pass. This pass transforms code
+/// to meet LiveIntervals' requirements; primarily, it ensures that all
+/// virtual register uses have definitions (IMPLICIT_DEF definitions if
+/// nothing else).
+///
+//===----------------------------------------------------------------------===//
+
+#include "WebAssembly.h"
+#include "MCTargetDesc/WebAssemblyMCTargetDesc.h"
+#include "WebAssemblyMachineFunctionInfo.h"
+#include "WebAssemblySubtarget.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/Passes.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/raw_ostream.h"
+using namespace llvm;
+
+#define DEBUG_TYPE "wasm-prepare-for-live-intervals"
+
+namespace {
+class WebAssemblyPrepareForLiveIntervals final : public MachineFunctionPass {
+public:
+ static char ID; // Pass identification, replacement for typeid
+ WebAssemblyPrepareForLiveIntervals() : MachineFunctionPass(ID) {}
+
+private:
+ const char *getPassName() const override {
+ return "WebAssembly Prepare For LiveIntervals";
+ }
+
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
+ AU.setPreservesCFG();
+ MachineFunctionPass::getAnalysisUsage(AU);
+ }
+
+ bool runOnMachineFunction(MachineFunction &MF) override;
+};
+} // end anonymous namespace
+
+char WebAssemblyPrepareForLiveIntervals::ID = 0;
+FunctionPass *llvm::createWebAssemblyPrepareForLiveIntervals() {
+ return new WebAssemblyPrepareForLiveIntervals();
+}
+
+/// Test whether the given instruction is an ARGUMENT.
+static bool IsArgument(const MachineInstr *MI) {
+ switch (MI->getOpcode()) {
+ case WebAssembly::ARGUMENT_I32:
+ case WebAssembly::ARGUMENT_I64:
+ case WebAssembly::ARGUMENT_F32:
+ case WebAssembly::ARGUMENT_F64:
+ return true;
+ default:
+ return false;
+ }
+}
+
+// Test whether the given register has an ARGUMENT def.
+static bool HasArgumentDef(unsigned Reg, const MachineRegisterInfo &MRI) {
+ for (auto &Def : MRI.def_instructions(Reg))
+ if (IsArgument(&Def))
+ return true;
+ return false;
+}
+
+bool WebAssemblyPrepareForLiveIntervals::runOnMachineFunction(MachineFunction &MF) {
+ DEBUG({
+ dbgs() << "********** Prepare For LiveIntervals **********\n"
+ << "********** Function: " << MF.getName() << '\n';
+ });
+
+ bool Changed = false;
+ MachineRegisterInfo &MRI = MF.getRegInfo();
+ const auto &TII = *MF.getSubtarget<WebAssemblySubtarget>().getInstrInfo();
+ MachineBasicBlock &Entry = *MF.begin();
+
+ assert(!mustPreserveAnalysisID(LiveIntervalsID) &&
+ "LiveIntervals shouldn't be active yet!");
+
+ // We don't preserve SSA form.
+ MRI.leaveSSA();
+
+ // BranchFolding and perhaps other passes don't preserve IMPLICIT_DEF
+ // instructions. LiveIntervals requires that all paths to virtual register
+ // uses provide a definition. Insert IMPLICIT_DEFs in the entry block to
+ // conservatively satisfy this.
+ //
+ // TODO: This is fairly heavy-handed; find a better approach.
+ //
+ for (unsigned i = 0, e = MRI.getNumVirtRegs(); i < e; ++i) {
+ unsigned Reg = TargetRegisterInfo::index2VirtReg(i);
+
+ // Skip unused registers.
+ if (MRI.use_nodbg_empty(Reg))
+ continue;
+
+ // Skip registers that have an ARGUMENT definition.
+ if (HasArgumentDef(Reg, MRI))
+ continue;
+
+ BuildMI(Entry, Entry.begin(), DebugLoc(),
+ TII.get(WebAssembly::IMPLICIT_DEF), Reg);
+ Changed = true;
+ }
+
+ // Move ARGUMENT_* instructions to the top of the entry block, so that their
+ // liveness reflects the fact that these really are live-in values.
+ for (auto MII = Entry.begin(), MIE = Entry.end(); MII != MIE; ) {
+ MachineInstr *MI = &*MII++;
+ if (IsArgument(MI)) {
+ MI->removeFromParent();
+ Entry.insert(Entry.begin(), MI);
+ }
+ }
+
+ // Ok, we're now ready to run LiveIntervalAnalysis again.
+ MF.getProperties().set(MachineFunctionProperties::Property::TracksLiveness);
+
+ return Changed;
+}
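
In short, the pass scans the virtual registers and gives every used register that lacks an ARGUMENT definition a conservative IMPLICIT_DEF in the entry block. A minimal standalone sketch of that selection, with invented names and plain containers rather than LLVM types:

// Toy sketch (invented helper, not LLVM code): pick the registers that need a
// synthetic entry-block definition, mirroring the IMPLICIT_DEF insertion above.
#include <cassert>
#include <set>
#include <vector>

std::vector<unsigned> implicitDefsNeeded(const std::set<unsigned> &Used,
                                         const std::set<unsigned> &ArgDefed) {
  std::vector<unsigned> Needed;
  for (unsigned R : Used)
    if (!ArgDefed.count(R))   // used but never defined by an ARGUMENT
      Needed.push_back(R);
  return Needed;
}

int main() {
  std::set<unsigned> Used = {1, 2, 3};
  std::set<unsigned> ArgDefed = {1};
  std::vector<unsigned> N = implicitDefsNeeded(Used, ArgDefed);
  assert(N.size() == 2 && N[0] == 2 && N[1] == 3);
  return 0;
}
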
diff --git a/lib/Target/WebAssembly/WebAssemblyRegColoring.cpp b/lib/Target/WebAssembly/WebAssemblyRegColoring.cpp
index 9ec66595d8da..dedd9108dfd5 100644
--- a/lib/Target/WebAssembly/WebAssemblyRegColoring.cpp
+++ b/lib/Target/WebAssembly/WebAssemblyRegColoring.cpp
@@ -66,7 +66,7 @@ static float computeWeight(const MachineRegisterInfo *MRI,
float weight = 0.0f;
for (MachineOperand &MO : MRI->reg_nodbg_operands(VReg))
weight += LiveIntervals::getSpillWeight(MO.isDef(), MO.isUse(), MBFI,
- MO.getParent());
+ *MO.getParent());
return weight;
}
@@ -99,7 +99,7 @@ bool WebAssemblyRegColoring::runOnMachineFunction(MachineFunction &MF) {
unsigned VReg = TargetRegisterInfo::index2VirtReg(i);
if (MFI.isVRegStackified(VReg))
continue;
- // Skip unused registers, which can use $discard.
+ // Skip unused registers, which can use $drop.
if (MRI->use_empty(VReg))
continue;
diff --git a/lib/Target/WebAssembly/WebAssemblyRegNumbering.cpp b/lib/Target/WebAssembly/WebAssemblyRegNumbering.cpp
index f621db070b5b..4a8fd96f8324 100644
--- a/lib/Target/WebAssembly/WebAssemblyRegNumbering.cpp
+++ b/lib/Target/WebAssembly/WebAssemblyRegNumbering.cpp
@@ -18,8 +18,8 @@
#include "WebAssemblyMachineFunctionInfo.h"
#include "WebAssemblySubtarget.h"
#include "llvm/ADT/SCCIterator.h"
-#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineFrameInfo.h"
+#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/MachineLoopInfo.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
@@ -61,7 +61,6 @@ bool WebAssemblyRegNumbering::runOnMachineFunction(MachineFunction &MF) {
WebAssemblyFunctionInfo &MFI = *MF.getInfo<WebAssemblyFunctionInfo>();
MachineRegisterInfo &MRI = MF.getRegInfo();
- const MachineFrameInfo &FrameInfo = *MF.getFrameInfo();
MFI.initWARegs();
@@ -73,9 +72,13 @@ bool WebAssemblyRegNumbering::runOnMachineFunction(MachineFunction &MF) {
case WebAssembly::ARGUMENT_I32:
case WebAssembly::ARGUMENT_I64:
case WebAssembly::ARGUMENT_F32:
- case WebAssembly::ARGUMENT_F64:
- MFI.setWAReg(MI.getOperand(0).getReg(), MI.getOperand(1).getImm());
+ case WebAssembly::ARGUMENT_F64: {
+ int64_t Imm = MI.getOperand(1).getImm();
+ DEBUG(dbgs() << "Arg VReg " << MI.getOperand(0).getReg() << " -> WAReg "
+ << Imm << "\n");
+ MFI.setWAReg(MI.getOperand(0).getReg(), Imm);
break;
+ }
default:
break;
}
@@ -84,26 +87,27 @@ bool WebAssemblyRegNumbering::runOnMachineFunction(MachineFunction &MF) {
// Then assign regular WebAssembly registers for all remaining used
// virtual registers. TODO: Consider sorting the registers by frequency of
// use, to maximize usage of small immediate fields.
- unsigned NumArgRegs = MFI.getParams().size();
unsigned NumVRegs = MF.getRegInfo().getNumVirtRegs();
unsigned NumStackRegs = 0;
- unsigned CurReg = 0;
+ // Start the numbering for locals after the arg regs
+ unsigned CurReg = MFI.getParams().size();
for (unsigned VRegIdx = 0; VRegIdx < NumVRegs; ++VRegIdx) {
unsigned VReg = TargetRegisterInfo::index2VirtReg(VRegIdx);
+ // Skip unused registers.
+ if (MRI.use_empty(VReg))
+ continue;
// Handle stackified registers.
if (MFI.isVRegStackified(VReg)) {
+ DEBUG(dbgs() << "VReg " << VReg << " -> WAReg "
+ << (INT32_MIN | NumStackRegs) << "\n");
MFI.setWAReg(VReg, INT32_MIN | NumStackRegs++);
continue;
}
- // Skip unused registers.
- if (MRI.use_empty(VReg))
- continue;
- if (MFI.getWAReg(VReg) == WebAssemblyFunctionInfo::UnusedReg)
- MFI.setWAReg(VReg, NumArgRegs + CurReg++);
+ if (MFI.getWAReg(VReg) == WebAssemblyFunctionInfo::UnusedReg) {
+ DEBUG(dbgs() << "VReg " << VReg << " -> WAReg " << CurReg << "\n");
+ MFI.setWAReg(VReg, CurReg++);
+ }
}
- // Allocate locals for used physical registers
- if (FrameInfo.getStackSize() > 0)
- MFI.addPReg(WebAssembly::SP32, CurReg++);
return true;
}
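
The resulting numbering scheme: argument registers keep WebAssembly register numbers 0 through NumParams-1, the remaining used locals are numbered upward from NumParams, and stackified values are tagged with the INT32_MIN bit. A small standalone sketch of that encoding, using invented helpers rather than anything in this patch:

// Toy sketch (invented helpers, not LLVM code) of the WAReg numbering above.
#include <cassert>
#include <cstdint>

uint32_t numberLocal(unsigned NumParams, unsigned &NextLocal) {
  return NumParams + NextLocal++;              // locals start after the params
}

uint32_t numberStackified(unsigned &NextStack) {
  return UINT32_C(0x80000000) | NextStack++;   // INT32_MIN bit marks stack regs
}

int main() {
  unsigned NextLocal = 0, NextStack = 0;
  assert(numberLocal(/*NumParams=*/2, NextLocal) == 2); // first non-arg local
  assert(numberLocal(2, NextLocal) == 3);
  assert(numberStackified(NextStack) == 0x80000000u);   // first stackified value
  assert(numberStackified(NextStack) == 0x80000001u);
  return 0;
}
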
diff --git a/lib/Target/WebAssembly/WebAssemblyRegStackify.cpp b/lib/Target/WebAssembly/WebAssemblyRegStackify.cpp
index 537c147e6142..0aa3b621da32 100644
--- a/lib/Target/WebAssembly/WebAssemblyRegStackify.cpp
+++ b/lib/Target/WebAssembly/WebAssemblyRegStackify.cpp
@@ -23,9 +23,12 @@
#include "WebAssembly.h"
#include "MCTargetDesc/WebAssemblyMCTargetDesc.h" // for WebAssembly::ARGUMENT_*
#include "WebAssemblyMachineFunctionInfo.h"
+#include "WebAssemblySubtarget.h"
#include "llvm/Analysis/AliasAnalysis.h"
#include "llvm/CodeGen/LiveIntervalAnalysis.h"
#include "llvm/CodeGen/MachineBlockFrequencyInfo.h"
+#include "llvm/CodeGen/MachineDominators.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/CodeGen/Passes.h"
#include "llvm/Support/Debug.h"
@@ -43,12 +46,13 @@ class WebAssemblyRegStackify final : public MachineFunctionPass {
void getAnalysisUsage(AnalysisUsage &AU) const override {
AU.setPreservesCFG();
AU.addRequired<AAResultsWrapperPass>();
+ AU.addRequired<MachineDominatorTree>();
AU.addRequired<LiveIntervals>();
AU.addPreserved<MachineBlockFrequencyInfo>();
AU.addPreserved<SlotIndexes>();
AU.addPreserved<LiveIntervals>();
- AU.addPreservedID(MachineDominatorsID);
AU.addPreservedID(LiveVariablesID);
+ AU.addPreserved<MachineDominatorTree>();
MachineFunctionPass::getAnalysisUsage(AU);
}
@@ -82,17 +86,197 @@ static void ImposeStackOrdering(MachineInstr *MI) {
/*isImp=*/true));
}
+// Determine whether a call to the callee referenced by
+// MI->getOperand(CalleeOpNo) reads memory, writes memory, and/or has side
+// effects.
+static void QueryCallee(const MachineInstr &MI, unsigned CalleeOpNo, bool &Read,
+ bool &Write, bool &Effects, bool &StackPointer) {
+ // All calls can use the stack pointer.
+ StackPointer = true;
+
+ const MachineOperand &MO = MI.getOperand(CalleeOpNo);
+ if (MO.isGlobal()) {
+ const Constant *GV = MO.getGlobal();
+ if (const GlobalAlias *GA = dyn_cast<GlobalAlias>(GV))
+ if (!GA->isInterposable())
+ GV = GA->getAliasee();
+
+ if (const Function *F = dyn_cast<Function>(GV)) {
+ if (!F->doesNotThrow())
+ Effects = true;
+ if (F->doesNotAccessMemory())
+ return;
+ if (F->onlyReadsMemory()) {
+ Read = true;
+ return;
+ }
+ }
+ }
+
+ // Assume the worst.
+ Write = true;
+ Read = true;
+ Effects = true;
+}
+
+// Determine whether MI reads memory, writes memory, has side effects,
+// and/or uses the __stack_pointer value.
+static void Query(const MachineInstr &MI, AliasAnalysis &AA, bool &Read,
+ bool &Write, bool &Effects, bool &StackPointer) {
+ assert(!MI.isPosition());
+ assert(!MI.isTerminator());
+
+ if (MI.isDebugValue())
+ return;
+
+ // Check for loads.
+ if (MI.mayLoad() && !MI.isInvariantLoad(&AA))
+ Read = true;
+
+ // Check for stores.
+ if (MI.mayStore()) {
+ Write = true;
+
+ // Check for stores to __stack_pointer.
+ for (auto MMO : MI.memoperands()) {
+ const MachinePointerInfo &MPI = MMO->getPointerInfo();
+ if (MPI.V.is<const PseudoSourceValue *>()) {
+ auto PSV = MPI.V.get<const PseudoSourceValue *>();
+ if (const ExternalSymbolPseudoSourceValue *EPSV =
+ dyn_cast<ExternalSymbolPseudoSourceValue>(PSV))
+ if (StringRef(EPSV->getSymbol()) == "__stack_pointer")
+ StackPointer = true;
+ }
+ }
+ } else if (MI.hasOrderedMemoryRef()) {
+ switch (MI.getOpcode()) {
+ case WebAssembly::DIV_S_I32: case WebAssembly::DIV_S_I64:
+ case WebAssembly::REM_S_I32: case WebAssembly::REM_S_I64:
+ case WebAssembly::DIV_U_I32: case WebAssembly::DIV_U_I64:
+ case WebAssembly::REM_U_I32: case WebAssembly::REM_U_I64:
+ case WebAssembly::I32_TRUNC_S_F32: case WebAssembly::I64_TRUNC_S_F32:
+ case WebAssembly::I32_TRUNC_S_F64: case WebAssembly::I64_TRUNC_S_F64:
+ case WebAssembly::I32_TRUNC_U_F32: case WebAssembly::I64_TRUNC_U_F32:
+ case WebAssembly::I32_TRUNC_U_F64: case WebAssembly::I64_TRUNC_U_F64:
+      // These instructions have hasUnmodeledSideEffects() returning true
+      // because they trap on overflow and invalid input, so they can't be
+      // moved arbitrarily; however, hasOrderedMemoryRef() interprets this plus
+      // their lack of memoperands as a potential unknown memory reference.
+ break;
+ default:
+ // Record volatile accesses, unless it's a call, as calls are handled
+ // specially below.
+ if (!MI.isCall()) {
+ Write = true;
+ Effects = true;
+ }
+ break;
+ }
+ }
+
+ // Check for side effects.
+ if (MI.hasUnmodeledSideEffects()) {
+ switch (MI.getOpcode()) {
+ case WebAssembly::DIV_S_I32: case WebAssembly::DIV_S_I64:
+ case WebAssembly::REM_S_I32: case WebAssembly::REM_S_I64:
+ case WebAssembly::DIV_U_I32: case WebAssembly::DIV_U_I64:
+ case WebAssembly::REM_U_I32: case WebAssembly::REM_U_I64:
+ case WebAssembly::I32_TRUNC_S_F32: case WebAssembly::I64_TRUNC_S_F32:
+ case WebAssembly::I32_TRUNC_S_F64: case WebAssembly::I64_TRUNC_S_F64:
+ case WebAssembly::I32_TRUNC_U_F32: case WebAssembly::I64_TRUNC_U_F32:
+ case WebAssembly::I32_TRUNC_U_F64: case WebAssembly::I64_TRUNC_U_F64:
+ // These instructions have hasUnmodeledSideEffects() returning true
+      // because they trap on overflow and invalid input, so they can't be moved
+      // arbitrarily; however, for register stackification specifically it is
+      // safe to move them, because overflow and invalid input are undefined behavior.
+ break;
+ default:
+ Effects = true;
+ break;
+ }
+ }
+
+ // Analyze calls.
+ if (MI.isCall()) {
+ switch (MI.getOpcode()) {
+ case WebAssembly::CALL_VOID:
+ case WebAssembly::CALL_INDIRECT_VOID:
+ QueryCallee(MI, 0, Read, Write, Effects, StackPointer);
+ break;
+ case WebAssembly::CALL_I32: case WebAssembly::CALL_I64:
+ case WebAssembly::CALL_F32: case WebAssembly::CALL_F64:
+ case WebAssembly::CALL_INDIRECT_I32: case WebAssembly::CALL_INDIRECT_I64:
+ case WebAssembly::CALL_INDIRECT_F32: case WebAssembly::CALL_INDIRECT_F64:
+ QueryCallee(MI, 1, Read, Write, Effects, StackPointer);
+ break;
+ default:
+ llvm_unreachable("unexpected call opcode");
+ }
+ }
+}
+
+// Test whether Def is safe and profitable to rematerialize.
+static bool ShouldRematerialize(const MachineInstr &Def, AliasAnalysis &AA,
+ const WebAssemblyInstrInfo *TII) {
+ return Def.isAsCheapAsAMove() && TII->isTriviallyReMaterializable(Def, &AA);
+}
+
+// Identify the definition for this register at this point. This is a
+// generalization of MachineRegisterInfo::getUniqueVRegDef that uses
+// LiveIntervals to handle complex cases.
+static MachineInstr *GetVRegDef(unsigned Reg, const MachineInstr *Insert,
+ const MachineRegisterInfo &MRI,
+ const LiveIntervals &LIS)
+{
+ // Most registers are in SSA form here so we try a quick MRI query first.
+ if (MachineInstr *Def = MRI.getUniqueVRegDef(Reg))
+ return Def;
+
+ // MRI doesn't know what the Def is. Try asking LIS.
+ if (const VNInfo *ValNo = LIS.getInterval(Reg).getVNInfoBefore(
+ LIS.getInstructionIndex(*Insert)))
+ return LIS.getInstructionFromIndex(ValNo->def);
+
+ return nullptr;
+}
+
+// Test whether Reg, as defined at Def, has exactly one use. This is a
+// generalization of MachineRegisterInfo::hasOneUse that uses LiveIntervals
+// to handle complex cases.
+static bool HasOneUse(unsigned Reg, MachineInstr *Def,
+ MachineRegisterInfo &MRI, MachineDominatorTree &MDT,
+ LiveIntervals &LIS) {
+ // Most registers are in SSA form here so we try a quick MRI query first.
+ if (MRI.hasOneUse(Reg))
+ return true;
+
+ bool HasOne = false;
+ const LiveInterval &LI = LIS.getInterval(Reg);
+ const VNInfo *DefVNI = LI.getVNInfoAt(
+ LIS.getInstructionIndex(*Def).getRegSlot());
+ assert(DefVNI);
+ for (auto I : MRI.use_nodbg_operands(Reg)) {
+ const auto &Result = LI.Query(LIS.getInstructionIndex(*I.getParent()));
+ if (Result.valueIn() == DefVNI) {
+ if (!Result.isKill())
+ return false;
+ if (HasOne)
+ return false;
+ HasOne = true;
+ }
+ }
+ return HasOne;
+}
+
// Test whether it's safe to move Def to just before Insert.
// TODO: Compute memory dependencies in a way that doesn't require always
// walking the block.
// TODO: Compute memory dependencies in a way that uses AliasAnalysis to be
// more precise.
static bool IsSafeToMove(const MachineInstr *Def, const MachineInstr *Insert,
- AliasAnalysis &AA, LiveIntervals &LIS,
- MachineRegisterInfo &MRI) {
+ AliasAnalysis &AA, const LiveIntervals &LIS,
+ const MachineRegisterInfo &MRI) {
assert(Def->getParent() == Insert->getParent());
- bool SawStore = false, SawSideEffects = false;
- MachineBasicBlock::const_iterator D(Def), I(Insert);
// Check for register dependencies.
for (const MachineOperand &MO : Def->operands()) {
@@ -106,6 +290,10 @@ static bool IsSafeToMove(const MachineInstr *Def, const MachineInstr *Insert,
continue;
if (TargetRegisterInfo::isPhysicalRegister(Reg)) {
+ // Ignore ARGUMENTS; it's just used to keep the ARGUMENT_* instructions
+ // from moving down, and we've already checked for that.
+ if (Reg == WebAssembly::ARGUMENTS)
+ continue;
// If the physical register is never modified, ignore it.
if (!MRI.isPhysRegModified(Reg))
continue;
@@ -114,24 +302,404 @@ static bool IsSafeToMove(const MachineInstr *Def, const MachineInstr *Insert,
}
// Ask LiveIntervals whether moving this virtual register use or def to
- // Insert will change value numbers are seen.
+ // Insert will change which value numbers are seen.
+ //
+ // If the operand is a use of a register that is also defined in the same
+ // instruction, test that the newly defined value reaches the insert point,
+ // since the operand will be moving along with the def.
const LiveInterval &LI = LIS.getInterval(Reg);
- VNInfo *DefVNI = MO.isDef() ?
- LI.getVNInfoAt(LIS.getInstructionIndex(Def).getRegSlot()) :
- LI.getVNInfoBefore(LIS.getInstructionIndex(Def));
+ VNInfo *DefVNI =
+ (MO.isDef() || Def->definesRegister(Reg)) ?
+ LI.getVNInfoAt(LIS.getInstructionIndex(*Def).getRegSlot()) :
+ LI.getVNInfoBefore(LIS.getInstructionIndex(*Def));
assert(DefVNI && "Instruction input missing value number");
- VNInfo *InsVNI = LI.getVNInfoBefore(LIS.getInstructionIndex(Insert));
+ VNInfo *InsVNI = LI.getVNInfoBefore(LIS.getInstructionIndex(*Insert));
if (InsVNI && DefVNI != InsVNI)
return false;
}
- // Check for memory dependencies and side effects.
- for (--I; I != D; --I)
- SawSideEffects |= I->isSafeToMove(&AA, SawStore);
- return !(SawStore && Def->mayLoad() && !Def->isInvariantLoad(&AA)) &&
- !(SawSideEffects && !Def->isSafeToMove(&AA, SawStore));
+ bool Read = false, Write = false, Effects = false, StackPointer = false;
+ Query(*Def, AA, Read, Write, Effects, StackPointer);
+
+ // If the instruction does not access memory and has no side effects, it has
+ // no additional dependencies.
+ if (!Read && !Write && !Effects && !StackPointer)
+ return true;
+
+ // Scan through the intervening instructions between Def and Insert.
+ MachineBasicBlock::const_iterator D(Def), I(Insert);
+ for (--I; I != D; --I) {
+ bool InterveningRead = false;
+ bool InterveningWrite = false;
+ bool InterveningEffects = false;
+ bool InterveningStackPointer = false;
+ Query(*I, AA, InterveningRead, InterveningWrite, InterveningEffects,
+ InterveningStackPointer);
+ if (Effects && InterveningEffects)
+ return false;
+ if (Read && InterveningWrite)
+ return false;
+ if (Write && (InterveningRead || InterveningWrite))
+ return false;
+ if (StackPointer && InterveningStackPointer)
+ return false;
+ }
+
+ return true;
+}
+
+/// Test whether OneUse, a use of Reg, dominates all of Reg's other uses.
+static bool OneUseDominatesOtherUses(unsigned Reg, const MachineOperand &OneUse,
+ const MachineBasicBlock &MBB,
+ const MachineRegisterInfo &MRI,
+ const MachineDominatorTree &MDT,
+ LiveIntervals &LIS,
+ WebAssemblyFunctionInfo &MFI) {
+ const LiveInterval &LI = LIS.getInterval(Reg);
+
+ const MachineInstr *OneUseInst = OneUse.getParent();
+ VNInfo *OneUseVNI = LI.getVNInfoBefore(LIS.getInstructionIndex(*OneUseInst));
+
+ for (const MachineOperand &Use : MRI.use_operands(Reg)) {
+ if (&Use == &OneUse)
+ continue;
+
+ const MachineInstr *UseInst = Use.getParent();
+ VNInfo *UseVNI = LI.getVNInfoBefore(LIS.getInstructionIndex(*UseInst));
+
+ if (UseVNI != OneUseVNI)
+ continue;
+
+ const MachineInstr *OneUseInst = OneUse.getParent();
+ if (UseInst == OneUseInst) {
+ // Another use in the same instruction. We need to ensure that the one
+ // selected use happens "before" it.
+ if (&OneUse > &Use)
+ return false;
+ } else {
+ // Test that the use is dominated by the one selected use.
+ while (!MDT.dominates(OneUseInst, UseInst)) {
+ // Actually, dominating is over-conservative. Test that the use would
+ // happen after the one selected use in the stack evaluation order.
+ //
+ // This is needed as a consequence of using implicit get_locals for
+ // uses and implicit set_locals for defs.
+ if (UseInst->getDesc().getNumDefs() == 0)
+ return false;
+ const MachineOperand &MO = UseInst->getOperand(0);
+ if (!MO.isReg())
+ return false;
+ unsigned DefReg = MO.getReg();
+ if (!TargetRegisterInfo::isVirtualRegister(DefReg) ||
+ !MFI.isVRegStackified(DefReg))
+ return false;
+ assert(MRI.hasOneUse(DefReg));
+ const MachineOperand &NewUse = *MRI.use_begin(DefReg);
+ const MachineInstr *NewUseInst = NewUse.getParent();
+ if (NewUseInst == OneUseInst) {
+ if (&OneUse > &NewUse)
+ return false;
+ break;
+ }
+ UseInst = NewUseInst;
+ }
+ }
+ }
+ return true;
+}
+
+/// Get the appropriate tee_local opcode for the given register class.
+static unsigned GetTeeLocalOpcode(const TargetRegisterClass *RC) {
+ if (RC == &WebAssembly::I32RegClass)
+ return WebAssembly::TEE_LOCAL_I32;
+ if (RC == &WebAssembly::I64RegClass)
+ return WebAssembly::TEE_LOCAL_I64;
+ if (RC == &WebAssembly::F32RegClass)
+ return WebAssembly::TEE_LOCAL_F32;
+ if (RC == &WebAssembly::F64RegClass)
+ return WebAssembly::TEE_LOCAL_F64;
+ llvm_unreachable("Unexpected register class");
+}
+
+// Shrink LI to its uses, cleaning up LI.
+static void ShrinkToUses(LiveInterval &LI, LiveIntervals &LIS) {
+ if (LIS.shrinkToUses(&LI)) {
+ SmallVector<LiveInterval*, 4> SplitLIs;
+ LIS.splitSeparateComponents(LI, SplitLIs);
+ }
+}
+
+/// A single-use def in the same block with no intervening memory or register
+/// dependencies; move the def down and nest it with the current instruction.
+static MachineInstr *MoveForSingleUse(unsigned Reg, MachineOperand& Op,
+ MachineInstr *Def,
+ MachineBasicBlock &MBB,
+ MachineInstr *Insert, LiveIntervals &LIS,
+ WebAssemblyFunctionInfo &MFI,
+ MachineRegisterInfo &MRI) {
+ DEBUG(dbgs() << "Move for single use: "; Def->dump());
+
+ MBB.splice(Insert, &MBB, Def);
+ LIS.handleMove(*Def);
+
+ if (MRI.hasOneDef(Reg) && MRI.hasOneUse(Reg)) {
+ // No one else is using this register for anything so we can just stackify
+ // it in place.
+ MFI.stackifyVReg(Reg);
+ } else {
+ // The register may have unrelated uses or defs; create a new register for
+ // just our one def and use so that we can stackify it.
+ unsigned NewReg = MRI.createVirtualRegister(MRI.getRegClass(Reg));
+ Def->getOperand(0).setReg(NewReg);
+ Op.setReg(NewReg);
+
+ // Tell LiveIntervals about the new register.
+ LIS.createAndComputeVirtRegInterval(NewReg);
+
+ // Tell LiveIntervals about the changes to the old register.
+ LiveInterval &LI = LIS.getInterval(Reg);
+ LI.removeSegment(LIS.getInstructionIndex(*Def).getRegSlot(),
+ LIS.getInstructionIndex(*Op.getParent()).getRegSlot(),
+ /*RemoveDeadValNo=*/true);
+
+ MFI.stackifyVReg(NewReg);
+
+ DEBUG(dbgs() << " - Replaced register: "; Def->dump());
+ }
+
+ ImposeStackOrdering(Def);
+ return Def;
+}
+
+/// A trivially cloneable instruction; clone it and nest the new copy with the
+/// current instruction.
+static MachineInstr *RematerializeCheapDef(
+ unsigned Reg, MachineOperand &Op, MachineInstr &Def, MachineBasicBlock &MBB,
+ MachineBasicBlock::instr_iterator Insert, LiveIntervals &LIS,
+ WebAssemblyFunctionInfo &MFI, MachineRegisterInfo &MRI,
+ const WebAssemblyInstrInfo *TII, const WebAssemblyRegisterInfo *TRI) {
+ DEBUG(dbgs() << "Rematerializing cheap def: "; Def.dump());
+ DEBUG(dbgs() << " - for use in "; Op.getParent()->dump());
+
+ unsigned NewReg = MRI.createVirtualRegister(MRI.getRegClass(Reg));
+ TII->reMaterialize(MBB, Insert, NewReg, 0, Def, *TRI);
+ Op.setReg(NewReg);
+ MachineInstr *Clone = &*std::prev(Insert);
+ LIS.InsertMachineInstrInMaps(*Clone);
+ LIS.createAndComputeVirtRegInterval(NewReg);
+ MFI.stackifyVReg(NewReg);
+ ImposeStackOrdering(Clone);
+
+ DEBUG(dbgs() << " - Cloned to "; Clone->dump());
+
+ // Shrink the interval.
+ bool IsDead = MRI.use_empty(Reg);
+ if (!IsDead) {
+ LiveInterval &LI = LIS.getInterval(Reg);
+ ShrinkToUses(LI, LIS);
+ IsDead = !LI.liveAt(LIS.getInstructionIndex(Def).getDeadSlot());
+ }
+
+ // If that was the last use of the original, delete the original.
+ if (IsDead) {
+ DEBUG(dbgs() << " - Deleting original\n");
+ SlotIndex Idx = LIS.getInstructionIndex(Def).getRegSlot();
+ LIS.removePhysRegDefAt(WebAssembly::ARGUMENTS, Idx);
+ LIS.removeInterval(Reg);
+ LIS.RemoveMachineInstrFromMaps(Def);
+ Def.eraseFromParent();
+ }
+
+ return Clone;
}
+/// A multiple-use def in the same block with no intervening memory or register
+/// dependencies; move the def down, nest it with the current instruction, and
+/// insert a tee_local to satisfy the rest of the uses. As an illustration,
+/// rewrite this:
+///
+/// Reg = INST ... // Def
+/// INST ..., Reg, ... // Insert
+/// INST ..., Reg, ...
+/// INST ..., Reg, ...
+///
+/// to this:
+///
+/// DefReg = INST ... // Def (to become the new Insert)
+/// TeeReg, Reg = TEE_LOCAL_... DefReg
+/// INST ..., TeeReg, ... // Insert
+/// INST ..., Reg, ...
+/// INST ..., Reg, ...
+///
+/// with DefReg and TeeReg stackified. This eliminates a get_local from the
+/// resulting code.
+static MachineInstr *MoveAndTeeForMultiUse(
+ unsigned Reg, MachineOperand &Op, MachineInstr *Def, MachineBasicBlock &MBB,
+ MachineInstr *Insert, LiveIntervals &LIS, WebAssemblyFunctionInfo &MFI,
+ MachineRegisterInfo &MRI, const WebAssemblyInstrInfo *TII) {
+ DEBUG(dbgs() << "Move and tee for multi-use:"; Def->dump());
+
+ // Move Def into place.
+ MBB.splice(Insert, &MBB, Def);
+ LIS.handleMove(*Def);
+
+ // Create the Tee and attach the registers.
+ const auto *RegClass = MRI.getRegClass(Reg);
+ unsigned TeeReg = MRI.createVirtualRegister(RegClass);
+ unsigned DefReg = MRI.createVirtualRegister(RegClass);
+ MachineOperand &DefMO = Def->getOperand(0);
+ MachineInstr *Tee = BuildMI(MBB, Insert, Insert->getDebugLoc(),
+ TII->get(GetTeeLocalOpcode(RegClass)), TeeReg)
+ .addReg(Reg, RegState::Define)
+ .addReg(DefReg, getUndefRegState(DefMO.isDead()));
+ Op.setReg(TeeReg);
+ DefMO.setReg(DefReg);
+ SlotIndex TeeIdx = LIS.InsertMachineInstrInMaps(*Tee).getRegSlot();
+ SlotIndex DefIdx = LIS.getInstructionIndex(*Def).getRegSlot();
+
+ // Tell LiveIntervals we moved the original vreg def from Def to Tee.
+ LiveInterval &LI = LIS.getInterval(Reg);
+ LiveInterval::iterator I = LI.FindSegmentContaining(DefIdx);
+ VNInfo *ValNo = LI.getVNInfoAt(DefIdx);
+ I->start = TeeIdx;
+ ValNo->def = TeeIdx;
+ ShrinkToUses(LI, LIS);
+
+ // Finish stackifying the new regs.
+ LIS.createAndComputeVirtRegInterval(TeeReg);
+ LIS.createAndComputeVirtRegInterval(DefReg);
+ MFI.stackifyVReg(DefReg);
+ MFI.stackifyVReg(TeeReg);
+ ImposeStackOrdering(Def);
+ ImposeStackOrdering(Tee);
+
+ DEBUG(dbgs() << " - Replaced register: "; Def->dump());
+ DEBUG(dbgs() << " - Tee instruction: "; Tee->dump());
+ return Def;
+}
+
+namespace {
+/// A stack for walking the tree of instructions being built, visiting the
+/// MachineOperands in DFS order.
+class TreeWalkerState {
+ typedef MachineInstr::mop_iterator mop_iterator;
+ typedef std::reverse_iterator<mop_iterator> mop_reverse_iterator;
+ typedef iterator_range<mop_reverse_iterator> RangeTy;
+ SmallVector<RangeTy, 4> Worklist;
+
+public:
+ explicit TreeWalkerState(MachineInstr *Insert) {
+ const iterator_range<mop_iterator> &Range = Insert->explicit_uses();
+ if (Range.begin() != Range.end())
+ Worklist.push_back(reverse(Range));
+ }
+
+ bool Done() const { return Worklist.empty(); }
+
+ MachineOperand &Pop() {
+ RangeTy &Range = Worklist.back();
+ MachineOperand &Op = *Range.begin();
+ Range = drop_begin(Range, 1);
+ if (Range.begin() == Range.end())
+ Worklist.pop_back();
+ assert((Worklist.empty() ||
+ Worklist.back().begin() != Worklist.back().end()) &&
+ "Empty ranges shouldn't remain in the worklist");
+ return Op;
+ }
+
+ /// Push Instr's operands onto the stack to be visited.
+ void PushOperands(MachineInstr *Instr) {
+ const iterator_range<mop_iterator> &Range(Instr->explicit_uses());
+ if (Range.begin() != Range.end())
+ Worklist.push_back(reverse(Range));
+ }
+
+ /// Some of Instr's operands are on the top of the stack; remove them and
+ /// re-insert them starting from the beginning (because we've commuted them).
+ void ResetTopOperands(MachineInstr *Instr) {
+ assert(HasRemainingOperands(Instr) &&
+           "Resetting operands should only be done when the instruction has "
+ "an operand still on the stack");
+ Worklist.back() = reverse(Instr->explicit_uses());
+ }
+
+ /// Test whether Instr has operands remaining to be visited at the top of
+ /// the stack.
+ bool HasRemainingOperands(const MachineInstr *Instr) const {
+ if (Worklist.empty())
+ return false;
+ const RangeTy &Range = Worklist.back();
+ return Range.begin() != Range.end() && Range.begin()->getParent() == Instr;
+ }
+
+ /// Test whether the given register is present on the stack, indicating an
+ /// operand in the tree that we haven't visited yet. Moving a definition of
+ /// Reg to a point in the tree after that would change its value.
+ ///
+ /// This is needed as a consequence of using implicit get_locals for
+ /// uses and implicit set_locals for defs.
+ bool IsOnStack(unsigned Reg) const {
+ for (const RangeTy &Range : Worklist)
+ for (const MachineOperand &MO : Range)
+ if (MO.isReg() && MO.getReg() == Reg)
+ return true;
+ return false;
+ }
+};
+
+/// State to keep track of whether commuting is in flight or whether it's been
+/// tried for the current instruction and didn't work.
+class CommutingState {
+ /// There are effectively three states: the initial state where we haven't
+  /// started commuting anything and we don't know anything yet, the tentative
+  /// state where we've commuted the operands of the current instruction and
+  /// are revisiting it, and the declined state where we've reverted the operands
+ /// back to their original order and will no longer commute it further.
+ bool TentativelyCommuting;
+ bool Declined;
+
+ /// During the tentative state, these hold the operand indices of the commuted
+ /// operands.
+ unsigned Operand0, Operand1;
+
+public:
+ CommutingState() : TentativelyCommuting(false), Declined(false) {}
+
+ /// Stackification for an operand was not successful due to ordering
+ /// constraints. If possible, and if we haven't already tried it and declined
+ /// it, commute Insert's operands and prepare to revisit it.
+ void MaybeCommute(MachineInstr *Insert, TreeWalkerState &TreeWalker,
+ const WebAssemblyInstrInfo *TII) {
+ if (TentativelyCommuting) {
+ assert(!Declined &&
+ "Don't decline commuting until you've finished trying it");
+ // Commuting didn't help. Revert it.
+ TII->commuteInstruction(*Insert, /*NewMI=*/false, Operand0, Operand1);
+ TentativelyCommuting = false;
+ Declined = true;
+ } else if (!Declined && TreeWalker.HasRemainingOperands(Insert)) {
+ Operand0 = TargetInstrInfo::CommuteAnyOperandIndex;
+ Operand1 = TargetInstrInfo::CommuteAnyOperandIndex;
+ if (TII->findCommutedOpIndices(*Insert, Operand0, Operand1)) {
+ // Tentatively commute the operands and try again.
+ TII->commuteInstruction(*Insert, /*NewMI=*/false, Operand0, Operand1);
+ TreeWalker.ResetTopOperands(Insert);
+ TentativelyCommuting = true;
+ Declined = false;
+ }
+ }
+ }
+
+ /// Stackification for some operand was successful. Reset to the default
+ /// state.
+ void Reset() {
+ TentativelyCommuting = false;
+ Declined = false;
+ }
+};
+} // end anonymous namespace
+
bool WebAssemblyRegStackify::runOnMachineFunction(MachineFunction &MF) {
DEBUG(dbgs() << "********** Register Stackifying **********\n"
"********** Function: "
@@ -140,7 +708,10 @@ bool WebAssemblyRegStackify::runOnMachineFunction(MachineFunction &MF) {
bool Changed = false;
MachineRegisterInfo &MRI = MF.getRegInfo();
WebAssemblyFunctionInfo &MFI = *MF.getInfo<WebAssemblyFunctionInfo>();
+ const auto *TII = MF.getSubtarget<WebAssemblySubtarget>().getInstrInfo();
+ const auto *TRI = MF.getSubtarget<WebAssemblySubtarget>().getRegisterInfo();
AliasAnalysis &AA = getAnalysis<AAResultsWrapperPass>().getAAResults();
+ MachineDominatorTree &MDT = getAnalysis<MachineDominatorTree>();
LiveIntervals &LIS = getAnalysis<LiveIntervals>();
// Walk the instructions from the bottom up. Currently we don't look past
@@ -151,33 +722,37 @@ bool WebAssemblyRegStackify::runOnMachineFunction(MachineFunction &MF) {
// iterating over it and the end iterator may change.
for (auto MII = MBB.rbegin(); MII != MBB.rend(); ++MII) {
MachineInstr *Insert = &*MII;
- // Don't nest anything inside a phi.
- if (Insert->getOpcode() == TargetOpcode::PHI)
- break;
-
// Don't nest anything inside an inline asm, because we don't have
// constraints for $push inputs.
if (Insert->getOpcode() == TargetOpcode::INLINEASM)
- break;
+ continue;
+
+ // Ignore debugging intrinsics.
+ if (Insert->getOpcode() == TargetOpcode::DBG_VALUE)
+ continue;
// Iterate through the inputs in reverse order, since we'll be pulling
// operands off the stack in LIFO order.
- bool AnyStackified = false;
- for (MachineOperand &Op : reverse(Insert->uses())) {
+ CommutingState Commuting;
+ TreeWalkerState TreeWalker(Insert);
+ while (!TreeWalker.Done()) {
+ MachineOperand &Op = TreeWalker.Pop();
+
// We're only interested in explicit virtual register operands.
- if (!Op.isReg() || Op.isImplicit() || !Op.isUse())
+ if (!Op.isReg())
continue;
unsigned Reg = Op.getReg();
-
- // Only consider registers with a single definition.
- // TODO: Eventually we may relax this, to stackify phi transfers.
- MachineInstr *Def = MRI.getUniqueVRegDef(Reg);
- if (!Def)
+ assert(Op.isUse() && "explicit_uses() should only iterate over uses");
+ assert(!Op.isImplicit() &&
+ "explicit_uses() should only iterate over explicit operands");
+ if (TargetRegisterInfo::isPhysicalRegister(Reg))
continue;
- // There's no use in nesting implicit defs inside anything.
- if (Def->getOpcode() == TargetOpcode::IMPLICIT_DEF)
+ // Identify the definition for this register at this point. Most
+ // registers are in SSA form here so we try a quick MRI query first.
+ MachineInstr *Def = GetVRegDef(Reg, Insert, MRI, LIS);
+ if (!Def)
continue;
// Don't nest an INLINE_ASM def into anything, because we don't have
@@ -185,10 +760,6 @@ bool WebAssemblyRegStackify::runOnMachineFunction(MachineFunction &MF) {
if (Def->getOpcode() == TargetOpcode::INLINEASM)
continue;
- // Don't nest PHIs inside of anything.
- if (Def->getOpcode() == TargetOpcode::PHI)
- continue;
-
// Argument instructions represent live-in registers and not real
// instructions.
if (Def->getOpcode() == WebAssembly::ARGUMENT_I32 ||
@@ -197,38 +768,53 @@ bool WebAssemblyRegStackify::runOnMachineFunction(MachineFunction &MF) {
Def->getOpcode() == WebAssembly::ARGUMENT_F64)
continue;
- // Single-use expression trees require defs that have one use.
- // TODO: Eventually we'll relax this, to take advantage of set_local
- // returning its result.
- if (!MRI.hasOneUse(Reg))
- continue;
-
- // For now, be conservative and don't look across block boundaries.
- // TODO: Be more aggressive?
- if (Def->getParent() != &MBB)
+ // Decide which strategy to take. Prefer to move a single-use value
+ // over cloning it, and prefer cloning over introducing a tee_local.
+ // For moving, we require the def to be in the same block as the use;
+ // this makes things simpler (LiveIntervals' handleMove function only
+ // supports intra-block moves) and it's MachineSink's job to catch all
+ // the sinking opportunities anyway.
+ bool SameBlock = Def->getParent() == &MBB;
+ bool CanMove = SameBlock && IsSafeToMove(Def, Insert, AA, LIS, MRI) &&
+ !TreeWalker.IsOnStack(Reg);
+ if (CanMove && HasOneUse(Reg, Def, MRI, MDT, LIS)) {
+ Insert = MoveForSingleUse(Reg, Op, Def, MBB, Insert, LIS, MFI, MRI);
+ } else if (ShouldRematerialize(*Def, AA, TII)) {
+ Insert =
+ RematerializeCheapDef(Reg, Op, *Def, MBB, Insert->getIterator(),
+ LIS, MFI, MRI, TII, TRI);
+ } else if (CanMove &&
+ OneUseDominatesOtherUses(Reg, Op, MBB, MRI, MDT, LIS, MFI)) {
+ Insert = MoveAndTeeForMultiUse(Reg, Op, Def, MBB, Insert, LIS, MFI,
+ MRI, TII);
+ } else {
+ // We failed to stackify the operand. If the problem was ordering
+ // constraints, Commuting may be able to help.
+ if (!CanMove && SameBlock)
+ Commuting.MaybeCommute(Insert, TreeWalker, TII);
+ // Proceed to the next operand.
continue;
+ }
- // Don't move instructions that have side effects or memory dependencies
- // or other complications.
- if (!IsSafeToMove(Def, Insert, AA, LIS, MRI))
- continue;
+ // We stackified an operand. Add the defining instruction's operands to
+ // the worklist stack now to continue to build an ever deeper tree.
+ Commuting.Reset();
+ TreeWalker.PushOperands(Insert);
+ }
+ // If we stackified any operands, skip over the tree to start looking for
+ // the next instruction we can build a tree on.
+ if (Insert != &*MII) {
+ ImposeStackOrdering(&*MII);
+ MII = std::prev(
+ llvm::make_reverse_iterator(MachineBasicBlock::iterator(Insert)));
Changed = true;
- AnyStackified = true;
- // Move the def down and nest it in the current instruction.
- MBB.splice(Insert, &MBB, Def);
- LIS.handleMove(Def);
- MFI.stackifyVReg(Reg);
- ImposeStackOrdering(Def);
- Insert = Def;
}
- if (AnyStackified)
- ImposeStackOrdering(&*MII);
}
}
- // If we used EXPR_STACK anywhere, add it to the live-in sets everywhere
- // so that it never looks like a use-before-def.
+ // If we used EXPR_STACK anywhere, add it to the live-in sets everywhere so
+ // that it never looks like a use-before-def.
if (Changed) {
MF.getRegInfo().addLiveIn(WebAssembly::EXPR_STACK);
for (MachineBasicBlock &MBB : MF)
@@ -236,30 +822,30 @@ bool WebAssemblyRegStackify::runOnMachineFunction(MachineFunction &MF) {
}
#ifndef NDEBUG
- // Verify that pushes and pops are performed in FIFO order.
+ // Verify that pushes and pops are performed in LIFO order.
SmallVector<unsigned, 0> Stack;
for (MachineBasicBlock &MBB : MF) {
for (MachineInstr &MI : MBB) {
+ if (MI.isDebugValue())
+ continue;
for (MachineOperand &MO : reverse(MI.explicit_operands())) {
if (!MO.isReg())
continue;
- unsigned VReg = MO.getReg();
-
- // Don't stackify physregs like SP or FP.
- if (!TargetRegisterInfo::isVirtualRegister(VReg))
- continue;
+ unsigned Reg = MO.getReg();
- if (MFI.isVRegStackified(VReg)) {
+ if (MFI.isVRegStackified(Reg)) {
if (MO.isDef())
- Stack.push_back(VReg);
+ Stack.push_back(Reg);
else
- assert(Stack.pop_back_val() == VReg);
+ assert(Stack.pop_back_val() == Reg &&
+ "Register stack pop should be paired with a push");
}
}
}
// TODO: Generalize this code to support keeping values on the stack across
// basic block boundaries.
- assert(Stack.empty());
+ assert(Stack.empty() &&
+ "Register stack pushes and pops should be balanced");
}
#endif
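
Much of the new stackification logic hinges on IsSafeToMove's scan of the instructions between Def and Insert, rejecting the move on any memory, side-effect, or __stack_pointer conflict reported by Query(). The following standalone sketch (a toy model with invented types, not LLVM code) isolates that pairwise test:

// Toy model (invented stand-ins, not LLVM code) of the intervening-instruction
// conflict test performed by IsSafeToMove above.
#include <cassert>
#include <vector>

struct Effects {
  bool Read = false, Write = false, SideEffects = false, StackPointer = false;
};

// Moving Def down to the insert point is rejected if it conflicts with any
// instruction in between, mirroring the Query()-based checks above.
bool safeToMovePast(const Effects &Def, const std::vector<Effects> &Between) {
  if (!Def.Read && !Def.Write && !Def.SideEffects && !Def.StackPointer)
    return true;
  for (const Effects &I : Between) {
    if (Def.SideEffects && I.SideEffects) return false;
    if (Def.Read && I.Write) return false;
    if (Def.Write && (I.Read || I.Write)) return false;
    if (Def.StackPointer && I.StackPointer) return false;
  }
  return true;
}

int main() {
  Effects Load;  Load.Read = true;        // e.g. a load
  Effects Store; Store.Write = true;      // e.g. a store
  Effects Pure;                           // e.g. an i32.add
  assert(safeToMovePast(Load, {Pure}));   // nothing touches memory in between
  assert(!safeToMovePast(Load, {Store})); // a store blocks moving a load past it
  assert(safeToMovePast(Pure, {Store}));  // pure computations move freely
  return 0;
}
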
diff --git a/lib/Target/WebAssembly/WebAssemblyRegisterInfo.cpp b/lib/Target/WebAssembly/WebAssemblyRegisterInfo.cpp
index 90d8dda530ba..239fe89b7ef9 100644
--- a/lib/Target/WebAssembly/WebAssemblyRegisterInfo.cpp
+++ b/lib/Target/WebAssembly/WebAssemblyRegisterInfo.cpp
@@ -52,43 +52,74 @@ WebAssemblyRegisterInfo::getReservedRegs(const MachineFunction & /*MF*/) const {
}
void WebAssemblyRegisterInfo::eliminateFrameIndex(
- MachineBasicBlock::iterator II, int SPAdj,
- unsigned FIOperandNum, RegScavenger * /*RS*/) const {
+ MachineBasicBlock::iterator II, int SPAdj, unsigned FIOperandNum,
+ RegScavenger * /*RS*/) const {
assert(SPAdj == 0);
MachineInstr &MI = *II;
MachineBasicBlock &MBB = *MI.getParent();
MachineFunction &MF = *MBB.getParent();
+ MachineRegisterInfo &MRI = MF.getRegInfo();
int FrameIndex = MI.getOperand(FIOperandNum).getIndex();
- const MachineFrameInfo& MFI = *MF.getFrameInfo();
+ const MachineFrameInfo &MFI = *MF.getFrameInfo();
int64_t FrameOffset = MFI.getStackSize() + MFI.getObjectOffset(FrameIndex);
- if (MI.mayLoadOrStore()) {
- // If this is a load or store, make it relative to SP and fold the frame
- // offset directly in.
+ // If this is the address operand of a load or store, make it relative to SP
+ // and fold the frame offset directly in.
+ if (MI.mayLoadOrStore() && FIOperandNum == WebAssembly::MemOpAddressOperandNo) {
assert(FrameOffset >= 0 && MI.getOperand(1).getImm() >= 0);
int64_t Offset = MI.getOperand(1).getImm() + FrameOffset;
- if (static_cast<uint64_t>(Offset) > std::numeric_limits<uint32_t>::max()) {
- // If this happens the program is invalid, but better to error here than
- // generate broken code.
- report_fatal_error("Memory offset field overflow");
+ if (static_cast<uint64_t>(Offset) <= std::numeric_limits<uint32_t>::max()) {
+ MI.getOperand(FIOperandNum - 1).setImm(Offset);
+ MI.getOperand(FIOperandNum)
+ .ChangeToRegister(WebAssembly::SP32, /*IsDef=*/false);
+ return;
}
- MI.getOperand(1).setImm(Offset);
- MI.getOperand(2).ChangeToRegister(WebAssembly::SP32, /*IsDef=*/false);
- } else {
- // Otherwise create an i32.add SP, offset and make it the operand.
- auto &MRI = MF.getRegInfo();
- const auto *TII = MF.getSubtarget().getInstrInfo();
+ }
+
+ // If this is an address being added to a constant, fold the frame offset
+ // into the constant.
+ if (MI.getOpcode() == WebAssembly::ADD_I32) {
+ MachineOperand &OtherMO = MI.getOperand(3 - FIOperandNum);
+ if (OtherMO.isReg()) {
+ unsigned OtherMOReg = OtherMO.getReg();
+ if (TargetRegisterInfo::isVirtualRegister(OtherMOReg)) {
+ MachineInstr *Def = MF.getRegInfo().getUniqueVRegDef(OtherMOReg);
+ // TODO: For now we just opportunistically do this in the case where
+ // the CONST_I32 happens to have exactly one def and one use. We
+ // should generalize this to optimize in more cases.
+ if (Def && Def->getOpcode() == WebAssembly::CONST_I32 &&
+ MRI.hasOneNonDBGUse(Def->getOperand(0).getReg())) {
+ MachineOperand &ImmMO = Def->getOperand(1);
+ ImmMO.setImm(ImmMO.getImm() + uint32_t(FrameOffset));
+ MI.getOperand(FIOperandNum)
+ .ChangeToRegister(WebAssembly::SP32, /*IsDef=*/false);
+ return;
+ }
+ }
+ }
+ }
+
+ // Otherwise create an i32.add SP, offset and make it the operand.
+ const auto *TII = MF.getSubtarget<WebAssemblySubtarget>().getInstrInfo();
- unsigned OffsetReg = MRI.createVirtualRegister(&WebAssembly::I32RegClass);
- BuildMI(MBB, MI, MI.getDebugLoc(), TII->get(WebAssembly::CONST_I32), OffsetReg)
+ unsigned FIRegOperand = WebAssembly::SP32;
+ if (FrameOffset) {
+ // Create i32.add SP, offset and make it the operand.
+ const TargetRegisterClass *PtrRC =
+ MRI.getTargetRegisterInfo()->getPointerRegClass(MF);
+ unsigned OffsetOp = MRI.createVirtualRegister(PtrRC);
+ BuildMI(MBB, *II, II->getDebugLoc(), TII->get(WebAssembly::CONST_I32),
+ OffsetOp)
.addImm(FrameOffset);
- BuildMI(MBB, MI, MI.getDebugLoc(), TII->get(WebAssembly::ADD_I32), OffsetReg)
+ FIRegOperand = MRI.createVirtualRegister(PtrRC);
+ BuildMI(MBB, *II, II->getDebugLoc(), TII->get(WebAssembly::ADD_I32),
+ FIRegOperand)
.addReg(WebAssembly::SP32)
- .addReg(OffsetReg);
- MI.getOperand(FIOperandNum).ChangeToRegister(OffsetReg, /*IsDef=*/false);
+ .addReg(OffsetOp);
}
+ MI.getOperand(FIOperandNum).ChangeToRegister(FIRegOperand, /*IsDef=*/false);
}
unsigned
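
To make the three-way fallback in eliminateFrameIndex easier to follow, here is a minimal standalone sketch of the same decision logic; FIStrategy, pickStrategy, and the boolean parameters are illustrative stand-ins, not names from the patch:

    #include <cstdint>
    #include <limits>

    enum class FIStrategy { FoldIntoMemOffset, FoldIntoConst, EmitAddSP, UseSPDirectly };

    // Decide how a frame-index operand gets rewritten, mirroring the cases
    // above: fold into a load/store offset field when the folded offset still
    // fits in 32 bits, fold into a single-use CONST_I32 feeding an ADD_I32,
    // or materialize "i32.add SP, offset" (just SP when the offset is zero).
    FIStrategy pickStrategy(bool isMemAddressOperand, int64_t memOffsetImm,
                            bool addsSingleUseConst, int64_t frameOffset) {
      if (isMemAddressOperand &&
          static_cast<uint64_t>(memOffsetImm + frameOffset) <=
              std::numeric_limits<uint32_t>::max())
        return FIStrategy::FoldIntoMemOffset;
      if (addsSingleUseConst)
        return FIStrategy::FoldIntoConst;
      return frameOffset ? FIStrategy::EmitAddSP : FIStrategy::UseSPDirectly;
    }

Under these assumptions, a load whose folded offset still fits in 32 bits keeps a simple SP-plus-immediate form, and everything else falls back to materializing the address explicitly.
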
diff --git a/lib/Target/WebAssembly/WebAssemblyReplacePhysRegs.cpp b/lib/Target/WebAssembly/WebAssemblyReplacePhysRegs.cpp
new file mode 100644
index 000000000000..11bda47eac56
--- /dev/null
+++ b/lib/Target/WebAssembly/WebAssemblyReplacePhysRegs.cpp
@@ -0,0 +1,97 @@
+//===-- WebAssemblyReplacePhysRegs.cpp - Replace phys regs with virt regs -===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// \brief This file implements a pass that replaces physical registers with
+/// virtual registers.
+///
+/// LLVM expects certain physical registers, such as a stack pointer. However,
+/// WebAssembly doesn't actually have such physical registers. This pass is run
+/// once LLVM no longer needs these registers, and replaces them with virtual
+/// registers, so they can participate in register stackifying and coloring in
+/// the normal way.
+///
+//===----------------------------------------------------------------------===//
+
+#include "WebAssembly.h"
+#include "MCTargetDesc/WebAssemblyMCTargetDesc.h"
+#include "WebAssemblyMachineFunctionInfo.h"
+#include "WebAssemblySubtarget.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/Passes.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/raw_ostream.h"
+using namespace llvm;
+
+#define DEBUG_TYPE "wasm-replace-phys-regs"
+
+namespace {
+class WebAssemblyReplacePhysRegs final : public MachineFunctionPass {
+public:
+ static char ID; // Pass identification, replacement for typeid
+ WebAssemblyReplacePhysRegs() : MachineFunctionPass(ID) {}
+
+private:
+ const char *getPassName() const override {
+ return "WebAssembly Replace Physical Registers";
+ }
+
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
+ AU.setPreservesCFG();
+ MachineFunctionPass::getAnalysisUsage(AU);
+ }
+
+ bool runOnMachineFunction(MachineFunction &MF) override;
+};
+} // end anonymous namespace
+
+char WebAssemblyReplacePhysRegs::ID = 0;
+FunctionPass *llvm::createWebAssemblyReplacePhysRegs() {
+ return new WebAssemblyReplacePhysRegs();
+}
+
+bool WebAssemblyReplacePhysRegs::runOnMachineFunction(MachineFunction &MF) {
+ DEBUG({
+ dbgs() << "********** Replace Physical Registers **********\n"
+ << "********** Function: " << MF.getName() << '\n';
+ });
+
+ MachineRegisterInfo &MRI = MF.getRegInfo();
+ const auto &TRI = *MF.getSubtarget<WebAssemblySubtarget>().getRegisterInfo();
+ bool Changed = false;
+
+ assert(!mustPreserveAnalysisID(LiveIntervalsID) &&
+ "LiveIntervals shouldn't be active yet!");
+ // We don't preserve SSA or liveness.
+ MRI.leaveSSA();
+ MRI.invalidateLiveness();
+
+ for (unsigned PReg = WebAssembly::NoRegister + 1;
+ PReg < WebAssembly::NUM_TARGET_REGS; ++PReg) {
+ // Skip fake registers that are never used explicitly.
+ if (PReg == WebAssembly::EXPR_STACK || PReg == WebAssembly::ARGUMENTS)
+ continue;
+
+ // Replace explicit uses of the physical register with a virtual register.
+ const TargetRegisterClass *RC = TRI.getMinimalPhysRegClass(PReg);
+ unsigned VReg = WebAssembly::NoRegister;
+ for (auto I = MRI.reg_begin(PReg), E = MRI.reg_end(); I != E; ) {
+ MachineOperand &MO = *I++;
+ if (!MO.isImplicit()) {
+ if (VReg == WebAssembly::NoRegister)
+ VReg = MRI.createVirtualRegister(RC);
+ MO.setReg(VReg);
+ Changed = true;
+ }
+ }
+ }
+
+ return Changed;
+}
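
The core rewrite in runOnMachineFunction is small enough to model without LLVM; a rough standalone sketch, in which Operand, replacePhysReg, and the register numbering are invented for illustration only:

    #include <vector>

    struct Operand {
      unsigned Reg;
      bool IsImplicit;
    };

    // Give each explicitly-referenced physical register a fresh virtual
    // register, leaving implicit operands (which only model side effects)
    // untouched.
    bool replacePhysReg(std::vector<Operand> &Operands, unsigned PhysReg,
                        unsigned &NextVirtReg) {
      bool Changed = false;
      unsigned VReg = 0; // 0 plays the role of WebAssembly::NoRegister here.
      for (Operand &MO : Operands) {
        if (MO.Reg != PhysReg || MO.IsImplicit)
          continue;
        if (VReg == 0)
          VReg = NextVirtReg++;
        MO.Reg = VReg;
        Changed = true;
      }
      return Changed;
    }
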
diff --git a/lib/Target/WebAssembly/WebAssemblySelectionDAGInfo.h b/lib/Target/WebAssembly/WebAssemblySelectionDAGInfo.h
index 13d96671276d..533c66b7a22f 100644
--- a/lib/Target/WebAssembly/WebAssemblySelectionDAGInfo.h
+++ b/lib/Target/WebAssembly/WebAssemblySelectionDAGInfo.h
@@ -9,18 +9,18 @@
///
/// \file
/// \brief This file defines the WebAssembly subclass for
-/// TargetSelectionDAGInfo.
+/// SelectionDAGTargetInfo.
///
//===----------------------------------------------------------------------===//
#ifndef LLVM_LIB_TARGET_WEBASSEMBLY_WEBASSEMBLYSELECTIONDAGINFO_H
#define LLVM_LIB_TARGET_WEBASSEMBLY_WEBASSEMBLYSELECTIONDAGINFO_H
-#include "llvm/Target/TargetSelectionDAGInfo.h"
+#include "llvm/CodeGen/SelectionDAGTargetInfo.h"
namespace llvm {
-class WebAssemblySelectionDAGInfo final : public TargetSelectionDAGInfo {
+class WebAssemblySelectionDAGInfo final : public SelectionDAGTargetInfo {
public:
~WebAssemblySelectionDAGInfo() override;
};
diff --git a/lib/Target/WebAssembly/WebAssemblySetP2AlignOperands.cpp b/lib/Target/WebAssembly/WebAssemblySetP2AlignOperands.cpp
new file mode 100644
index 000000000000..4ebea68c58a5
--- /dev/null
+++ b/lib/Target/WebAssembly/WebAssemblySetP2AlignOperands.cpp
@@ -0,0 +1,114 @@
+//=- WebAssemblySetP2AlignOperands.cpp - Set alignments on loads and stores -=//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// \brief This file sets the p2align operands on load and store instructions.
+///
+//===----------------------------------------------------------------------===//
+
+#include "WebAssembly.h"
+#include "MCTargetDesc/WebAssemblyMCTargetDesc.h"
+#include "WebAssemblyMachineFunctionInfo.h"
+#include "llvm/CodeGen/MachineBlockFrequencyInfo.h"
+#include "llvm/CodeGen/MachineMemOperand.h"
+#include "llvm/CodeGen/Passes.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/raw_ostream.h"
+using namespace llvm;
+
+#define DEBUG_TYPE "wasm-set-p2align-operands"
+
+namespace {
+class WebAssemblySetP2AlignOperands final : public MachineFunctionPass {
+public:
+ static char ID; // Pass identification, replacement for typeid
+ WebAssemblySetP2AlignOperands() : MachineFunctionPass(ID) {}
+
+ const char *getPassName() const override {
+ return "WebAssembly Set p2align Operands";
+ }
+
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
+ AU.setPreservesCFG();
+ AU.addPreserved<MachineBlockFrequencyInfo>();
+ AU.addPreservedID(MachineDominatorsID);
+ MachineFunctionPass::getAnalysisUsage(AU);
+ }
+
+ bool runOnMachineFunction(MachineFunction &MF) override;
+};
+} // end anonymous namespace
+
+char WebAssemblySetP2AlignOperands::ID = 0;
+FunctionPass *llvm::createWebAssemblySetP2AlignOperands() {
+ return new WebAssemblySetP2AlignOperands();
+}
+
+bool WebAssemblySetP2AlignOperands::runOnMachineFunction(MachineFunction &MF) {
+ DEBUG({
+ dbgs() << "********** Set p2align Operands **********\n"
+ << "********** Function: " << MF.getName() << '\n';
+ });
+
+ bool Changed = false;
+
+ for (auto &MBB : MF) {
+ for (auto &MI : MBB) {
+ switch (MI.getOpcode()) {
+ case WebAssembly::LOAD_I32:
+ case WebAssembly::LOAD_I64:
+ case WebAssembly::LOAD_F32:
+ case WebAssembly::LOAD_F64:
+ case WebAssembly::LOAD8_S_I32:
+ case WebAssembly::LOAD8_U_I32:
+ case WebAssembly::LOAD16_S_I32:
+ case WebAssembly::LOAD16_U_I32:
+ case WebAssembly::LOAD8_S_I64:
+ case WebAssembly::LOAD8_U_I64:
+ case WebAssembly::LOAD16_S_I64:
+ case WebAssembly::LOAD16_U_I64:
+ case WebAssembly::LOAD32_S_I64:
+ case WebAssembly::LOAD32_U_I64:
+ case WebAssembly::STORE_I32:
+ case WebAssembly::STORE_I64:
+ case WebAssembly::STORE_F32:
+ case WebAssembly::STORE_F64:
+ case WebAssembly::STORE8_I32:
+ case WebAssembly::STORE16_I32:
+ case WebAssembly::STORE8_I64:
+ case WebAssembly::STORE16_I64:
+ case WebAssembly::STORE32_I64: {
+ assert(MI.getOperand(3).getImm() == 0 &&
+ "ISel should set p2align operands to 0");
+ assert(MI.hasOneMemOperand() &&
+ "Load and store instructions have exactly one mem operand");
+ assert((*MI.memoperands_begin())->getSize() ==
+ (UINT64_C(1)
+ << WebAssembly::GetDefaultP2Align(MI.getOpcode())) &&
+ "Default p2align value should be natural");
+ assert(MI.getDesc().OpInfo[3].OperandType ==
+ WebAssembly::OPERAND_P2ALIGN &&
+ "Load and store instructions should have a p2align operand");
+ uint64_t P2Align = Log2_64((*MI.memoperands_begin())->getAlignment());
+
+ // WebAssembly does not currently support supernatural alignment.
+ P2Align = std::min(
+ P2Align, uint64_t(WebAssembly::GetDefaultP2Align(MI.getOpcode())));
+
+ MI.getOperand(3).setImm(P2Align);
+ break;
+ }
+ default:
+ break;
+ }
+ }
+ }
+
+ return Changed;
+}
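
The clamping at the end of the switch amounts to p2align = min(log2(alignment), natural p2align for the access). A tiny standalone model, assuming a 4-byte access; kDefaultP2Align and clampP2Align are illustrative names, not part of the patch:

    #include <algorithm>
    #include <cstdint>

    // Natural alignment, as a power of two, for a hypothetical 4-byte load.
    constexpr uint64_t kDefaultP2Align = 2; // log2(4)

    uint64_t clampP2Align(uint64_t actualAlignment) {
      uint64_t p2 = 0;
      while ((UINT64_C(1) << (p2 + 1)) <= actualAlignment)
        ++p2; // integer log2 of the alignment
      // WebAssembly doesn't encode supernatural alignment, so clamp to natural.
      return std::min(p2, kDefaultP2Align);
    }
    // e.g. clampP2Align(1) == 0, clampP2Align(4) == 2, clampP2Align(16) == 2.
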
diff --git a/lib/Target/WebAssembly/WebAssemblyStoreResults.cpp b/lib/Target/WebAssembly/WebAssemblyStoreResults.cpp
index 4e08b2b079eb..1e9a773ae628 100644
--- a/lib/Target/WebAssembly/WebAssemblyStoreResults.cpp
+++ b/lib/Target/WebAssembly/WebAssemblyStoreResults.cpp
@@ -17,12 +17,19 @@
/// potentially also exposing the store to register stackifying. These both can
/// reduce get_local/set_local traffic.
///
+/// This pass also performs this optimization for memcpy, memmove, and memset
+/// calls, since the LLVM intrinsics for these return void and so can't use the
+/// 'returned' attribute; consequently they aren't handled by the
+/// OptimizeReturned pass.
+///
//===----------------------------------------------------------------------===//
#include "WebAssembly.h"
#include "MCTargetDesc/WebAssemblyMCTargetDesc.h"
#include "WebAssemblyMachineFunctionInfo.h"
#include "WebAssemblySubtarget.h"
+#include "llvm/Analysis/TargetLibraryInfo.h"
+#include "llvm/CodeGen/LiveIntervalAnalysis.h"
#include "llvm/CodeGen/MachineBlockFrequencyInfo.h"
#include "llvm/CodeGen/MachineDominators.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
@@ -49,6 +56,10 @@ public:
AU.addPreserved<MachineBlockFrequencyInfo>();
AU.addRequired<MachineDominatorTree>();
AU.addPreserved<MachineDominatorTree>();
+ AU.addRequired<LiveIntervals>();
+ AU.addPreserved<SlotIndexes>();
+ AU.addPreserved<LiveIntervals>();
+ AU.addRequired<TargetLibraryInfoWrapperPass>();
MachineFunctionPass::getAnalysisUsage(AU);
}
@@ -63,17 +74,127 @@ FunctionPass *llvm::createWebAssemblyStoreResults() {
return new WebAssemblyStoreResults();
}
+// Replace uses of FromReg with ToReg if they are dominated by MI.
+static bool ReplaceDominatedUses(MachineBasicBlock &MBB, MachineInstr &MI,
+ unsigned FromReg, unsigned ToReg,
+ const MachineRegisterInfo &MRI,
+ MachineDominatorTree &MDT,
+ LiveIntervals &LIS) {
+ bool Changed = false;
+
+ LiveInterval *FromLI = &LIS.getInterval(FromReg);
+ LiveInterval *ToLI = &LIS.getInterval(ToReg);
+
+ SlotIndex FromIdx = LIS.getInstructionIndex(MI).getRegSlot();
+ VNInfo *FromVNI = FromLI->getVNInfoAt(FromIdx);
+
+ SmallVector<SlotIndex, 4> Indices;
+
+ for (auto I = MRI.use_begin(FromReg), E = MRI.use_end(); I != E;) {
+ MachineOperand &O = *I++;
+ MachineInstr *Where = O.getParent();
+
+ // Check that MI dominates the instruction in the normal way.
+ if (&MI == Where || !MDT.dominates(&MI, Where))
+ continue;
+
+ // If this use gets a different value, skip it.
+ SlotIndex WhereIdx = LIS.getInstructionIndex(*Where);
+ VNInfo *WhereVNI = FromLI->getVNInfoAt(WhereIdx);
+ if (WhereVNI && WhereVNI != FromVNI)
+ continue;
+
+ // Make sure ToReg isn't clobbered before it gets there.
+ VNInfo *ToVNI = ToLI->getVNInfoAt(WhereIdx);
+ if (ToVNI && ToVNI != FromVNI)
+ continue;
+
+ Changed = true;
+ DEBUG(dbgs() << "Setting operand " << O << " in " << *Where << " from "
+ << MI << "\n");
+ O.setReg(ToReg);
+
+ // If the store's def was previously dead, it is no longer.
+ if (!O.isUndef()) {
+ MI.getOperand(0).setIsDead(false);
+
+ Indices.push_back(WhereIdx.getRegSlot());
+ }
+ }
+
+ if (Changed) {
+ // Extend ToReg's liveness.
+ LIS.extendToIndices(*ToLI, Indices);
+
+ // Shrink FromReg's liveness.
+ LIS.shrinkToUses(FromLI);
+
+ // If we replaced all dominated uses, FromReg is now killed at MI.
+ if (!FromLI->liveAt(FromIdx.getDeadSlot()))
+ MI.addRegisterKilled(FromReg,
+ MBB.getParent()->getSubtarget<WebAssemblySubtarget>()
+ .getRegisterInfo());
+ }
+
+ return Changed;
+}
+
+static bool optimizeStore(MachineBasicBlock &MBB, MachineInstr &MI,
+ const MachineRegisterInfo &MRI,
+ MachineDominatorTree &MDT,
+ LiveIntervals &LIS) {
+ unsigned ToReg = MI.getOperand(0).getReg();
+ unsigned FromReg = MI.getOperand(WebAssembly::StoreValueOperandNo).getReg();
+ return ReplaceDominatedUses(MBB, MI, FromReg, ToReg, MRI, MDT, LIS);
+}
+
+static bool optimizeCall(MachineBasicBlock &MBB, MachineInstr &MI,
+ const MachineRegisterInfo &MRI,
+ MachineDominatorTree &MDT,
+ LiveIntervals &LIS,
+ const WebAssemblyTargetLowering &TLI,
+ const TargetLibraryInfo &LibInfo) {
+ MachineOperand &Op1 = MI.getOperand(1);
+ if (!Op1.isSymbol())
+ return false;
+
+ StringRef Name(Op1.getSymbolName());
+ bool callReturnsInput = Name == TLI.getLibcallName(RTLIB::MEMCPY) ||
+ Name == TLI.getLibcallName(RTLIB::MEMMOVE) ||
+ Name == TLI.getLibcallName(RTLIB::MEMSET);
+ if (!callReturnsInput)
+ return false;
+
+ LibFunc::Func Func;
+ if (!LibInfo.getLibFunc(Name, Func))
+ return false;
+
+ unsigned FromReg = MI.getOperand(2).getReg();
+ unsigned ToReg = MI.getOperand(0).getReg();
+ if (MRI.getRegClass(FromReg) != MRI.getRegClass(ToReg))
+ report_fatal_error("Store results: call to builtin function with wrong "
+ "signature, from/to mismatch");
+ return ReplaceDominatedUses(MBB, MI, FromReg, ToReg, MRI, MDT, LIS);
+}
+
bool WebAssemblyStoreResults::runOnMachineFunction(MachineFunction &MF) {
DEBUG({
dbgs() << "********** Store Results **********\n"
<< "********** Function: " << MF.getName() << '\n';
});
- const MachineRegisterInfo &MRI = MF.getRegInfo();
+ MachineRegisterInfo &MRI = MF.getRegInfo();
MachineDominatorTree &MDT = getAnalysis<MachineDominatorTree>();
+ const WebAssemblyTargetLowering &TLI =
+ *MF.getSubtarget<WebAssemblySubtarget>().getTargetLowering();
+ const auto &LibInfo = getAnalysis<TargetLibraryInfoWrapperPass>().getTLI();
+ LiveIntervals &LIS = getAnalysis<LiveIntervals>();
bool Changed = false;
- assert(MRI.isSSA() && "StoreResults depends on SSA form");
+ // We don't preserve SSA form.
+ MRI.leaveSSA();
+
+ assert(MRI.tracksLiveness() && "StoreResults expects liveness tracking");
for (auto &MBB : MF) {
DEBUG(dbgs() << "Basic Block: " << MBB.getName() << '\n');
@@ -90,33 +211,12 @@ bool WebAssemblyStoreResults::runOnMachineFunction(MachineFunction &MF) {
case WebAssembly::STORE_F64:
case WebAssembly::STORE_I32:
case WebAssembly::STORE_I64:
- unsigned ToReg = MI.getOperand(0).getReg();
- unsigned FromReg = MI.getOperand(3).getReg();
- for (auto I = MRI.use_begin(FromReg), E = MRI.use_end(); I != E;) {
- MachineOperand &O = *I++;
- MachineInstr *Where = O.getParent();
- if (Where->getOpcode() == TargetOpcode::PHI) {
- // PHIs use their operands on their incoming CFG edges rather than
- // in their parent blocks. Get the basic block paired with this use
- // of FromReg and check that MI's block dominates it.
- MachineBasicBlock *Pred =
- Where->getOperand(&O - &Where->getOperand(0) + 1).getMBB();
- if (!MDT.dominates(&MBB, Pred))
- continue;
- } else {
- // For a non-PHI, check that MI dominates the instruction in the
- // normal way.
- if (&MI == Where || !MDT.dominates(&MI, Where))
- continue;
- }
- Changed = true;
- DEBUG(dbgs() << "Setting operand " << O << " in " << *Where
- << " from " << MI << "\n");
- O.setReg(ToReg);
- // If the store's def was previously dead, it is no longer. But the
- // dead flag shouldn't be set yet.
- assert(!MI.getOperand(0).isDead() && "Dead flag set on store result");
- }
+ Changed |= optimizeStore(MBB, MI, MRI, MDT, LIS);
+ break;
+ case WebAssembly::CALL_I32:
+ case WebAssembly::CALL_I64:
+ Changed |= optimizeCall(MBB, MI, MRI, MDT, LIS, TLI, LibInfo);
+ break;
}
}
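
The new optimizeCall path relies on the C library guarantee that memcpy, memmove, and memset return their destination argument. A source-level illustration (plain C++, not LLVM code) of the rewrite it enables; sum_after_copy is a made-up example function:

    #include <cstring>

    int sum_after_copy(int *dst, const int *src, unsigned n) {
      // Before: later code uses 'dst' directly, keeping it live across the
      // call. After the rewrite, uses dominated by the call read the returned
      // pointer instead, so the call's def feeds them and can be stackified.
      int *copied = static_cast<int *>(std::memcpy(dst, src, n * sizeof(int)));
      int total = 0;
      for (unsigned i = 0; i != n; ++i)
        total += copied[i]; // was: dst[i]
      return total;
    }
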
diff --git a/lib/Target/WebAssembly/WebAssemblySubtarget.cpp b/lib/Target/WebAssembly/WebAssemblySubtarget.cpp
index cb2d5a63a19f..ce39051b0555 100644
--- a/lib/Target/WebAssembly/WebAssemblySubtarget.cpp
+++ b/lib/Target/WebAssembly/WebAssemblySubtarget.cpp
@@ -13,9 +13,9 @@
///
//===----------------------------------------------------------------------===//
-#include "WebAssemblyInstrInfo.h"
-#include "MCTargetDesc/WebAssemblyMCTargetDesc.h"
#include "WebAssemblySubtarget.h"
+#include "MCTargetDesc/WebAssemblyMCTargetDesc.h"
+#include "WebAssemblyInstrInfo.h"
#include "llvm/Support/TargetRegistry.h"
using namespace llvm;
@@ -45,5 +45,11 @@ WebAssemblySubtarget::WebAssemblySubtarget(const Triple &TT,
InstrInfo(initializeSubtargetDependencies(FS)), TSInfo(),
TLInfo(TM, *this) {}
-bool WebAssemblySubtarget::enableMachineScheduler() const { return true; }
+bool WebAssemblySubtarget::enableMachineScheduler() const {
+ // Disable the MachineScheduler for now. Even with ShouldTrackPressure set and
+ // enableMachineSchedDefaultSched overridden, it appears to have an overall
+  // negative effect on the kinds of register optimizations we're doing.
+ return false;
+}
+
bool WebAssemblySubtarget::useAA() const { return true; }
diff --git a/lib/Target/WebAssembly/WebAssemblyTargetMachine.cpp b/lib/Target/WebAssembly/WebAssemblyTargetMachine.cpp
index b290b4bf7440..32154af3c1c2 100644
--- a/lib/Target/WebAssembly/WebAssemblyTargetMachine.cpp
+++ b/lib/Target/WebAssembly/WebAssemblyTargetMachine.cpp
@@ -20,8 +20,8 @@
#include "llvm/CodeGen/MachineFunctionPass.h"
#include "llvm/CodeGen/Passes.h"
#include "llvm/CodeGen/RegAllocRegistry.h"
+#include "llvm/CodeGen/TargetPassConfig.h"
#include "llvm/IR/Function.h"
-#include "llvm/Support/CommandLine.h"
#include "llvm/Support/TargetRegistry.h"
#include "llvm/Target/TargetOptions.h"
#include "llvm/Transforms/Scalar.h"
@@ -39,16 +39,23 @@ extern "C" void LLVMInitializeWebAssemblyTarget() {
// WebAssembly Lowering public interface.
//===----------------------------------------------------------------------===//
+static Reloc::Model getEffectiveRelocModel(Optional<Reloc::Model> RM) {
+ if (!RM.hasValue())
+ return Reloc::PIC_;
+ return *RM;
+}
+
/// Create an WebAssembly architecture model.
///
WebAssemblyTargetMachine::WebAssemblyTargetMachine(
const Target &T, const Triple &TT, StringRef CPU, StringRef FS,
- const TargetOptions &Options, Reloc::Model RM, CodeModel::Model CM,
- CodeGenOpt::Level OL)
+ const TargetOptions &Options, Optional<Reloc::Model> RM,
+ CodeModel::Model CM, CodeGenOpt::Level OL)
: LLVMTargetMachine(T,
TT.isArch64Bit() ? "e-m:e-p:64:64-i64:64-n32:64-S128"
: "e-m:e-p:32:32-i64:64-n32:64-S128",
- TT, CPU, FS, Options, RM, CM, OL),
+ TT, CPU, FS, Options, getEffectiveRelocModel(RM),
+ CM, OL),
TLOF(make_unique<WebAssemblyTargetObjectFile>()) {
// WebAssembly type-checks expressions, but a noreturn function with a return
// type that doesn't match the context will cause a check failure. So we lower
@@ -58,9 +65,9 @@ WebAssemblyTargetMachine::WebAssemblyTargetMachine(
initAsmInfo();
- // We need a reducible CFG, so disable some optimizations which tend to
- // introduce irreducibility.
- setRequiresStructuredCFG(true);
+ // Note that we don't use setRequiresStructuredCFG(true). It disables
+  // optimizations that we're OK with, and in fact want, such as critical edge
+ // splitting and tail merging.
}
WebAssemblyTargetMachine::~WebAssemblyTargetMachine() {}
@@ -103,9 +110,8 @@ public:
void addIRPasses() override;
bool addInstSelector() override;
- bool addILPOpts() override;
- void addPreRegAlloc() override;
void addPostRegAlloc() override;
+ bool addGCPasses() override { return false; }
void addPreEmitPass() override;
};
} // end anonymous namespace
@@ -140,7 +146,8 @@ void WebAssemblyPassConfig::addIRPasses() {
addPass(createAtomicExpandPass(TM));
// Optimize "returned" function attributes.
- addPass(createWebAssemblyOptimizeReturned());
+ if (getOptLevel() != CodeGenOpt::None)
+ addPass(createWebAssemblyOptimizeReturned());
TargetPassConfig::addIRPasses();
}
@@ -153,58 +160,75 @@ bool WebAssemblyPassConfig::addInstSelector() {
// so that we can fix up the ARGUMENT instructions before anything else
// sees them in the wrong place.
addPass(createWebAssemblyArgumentMove());
+ // Set the p2align operands. This information is present during ISel, however
+ // it's inconvenient to collect. Collect it now, and update the immediate
+ // operands.
+ addPass(createWebAssemblySetP2AlignOperands());
return false;
}
-bool WebAssemblyPassConfig::addILPOpts() {
- (void)TargetPassConfig::addILPOpts();
- return true;
-}
-
-void WebAssemblyPassConfig::addPreRegAlloc() {
- TargetPassConfig::addPreRegAlloc();
-
- // Prepare store instructions for register stackifying.
- addPass(createWebAssemblyStoreResults());
-}
-
void WebAssemblyPassConfig::addPostRegAlloc() {
// TODO: The following CodeGen passes don't currently support code containing
// virtual registers. Consider removing their restrictions and re-enabling
// them.
- //
- // We use our own PrologEpilogInserter which is very slightly modified to
- // tolerate virtual registers.
- disablePass(&PrologEpilogCodeInserterID);
- // Fails with: should be run after register allocation.
- disablePass(&MachineCopyPropagationID);
- // Mark registers as representing wasm's expression stack.
- addPass(createWebAssemblyRegStackify());
+ // Has no asserts of its own, but was not written to handle virtual regs.
+ disablePass(&ShrinkWrapID);
- // Run the register coloring pass to reduce the total number of registers.
- addPass(createWebAssemblyRegColoring());
+  // These passes all require the AllVRegsAllocated property.
+ disablePass(&MachineCopyPropagationID);
+ disablePass(&PostRASchedulerID);
+ disablePass(&FuncletLayoutID);
+ disablePass(&StackMapLivenessID);
+ disablePass(&LiveDebugValuesID);
+ disablePass(&PatchableFunctionID);
TargetPassConfig::addPostRegAlloc();
-
- // Run WebAssembly's version of the PrologEpilogInserter. Target-independent
- // PEI runs after PostRegAlloc and after ShrinkWrap. Putting it here will run
- // PEI before ShrinkWrap but otherwise in the same position in the order.
- addPass(createWebAssemblyPEI());
}
void WebAssemblyPassConfig::addPreEmitPass() {
TargetPassConfig::addPreEmitPass();
+ // Now that we have a prologue and epilogue and all frame indices are
+ // rewritten, eliminate SP and FP. This allows them to be stackified,
+ // colored, and numbered with the rest of the registers.
+ addPass(createWebAssemblyReplacePhysRegs());
+
+ if (getOptLevel() != CodeGenOpt::None) {
+ // LiveIntervals isn't commonly run this late. Re-establish preconditions.
+ addPass(createWebAssemblyPrepareForLiveIntervals());
+
+ // Depend on LiveIntervals and perform some optimizations on it.
+ addPass(createWebAssemblyOptimizeLiveIntervals());
+
+ // Prepare store instructions for register stackifying.
+ addPass(createWebAssemblyStoreResults());
+
+ // Mark registers as representing wasm's expression stack. This is a key
+ // code-compression technique in WebAssembly. We run this pass (and
+ // StoreResults above) very late, so that it sees as much code as possible,
+ // including code emitted by PEI and expanded by late tail duplication.
+ addPass(createWebAssemblyRegStackify());
+
+ // Run the register coloring pass to reduce the total number of registers.
+ // This runs after stackification so that it doesn't consider registers
+ // that become stackified.
+ addPass(createWebAssemblyRegColoring());
+ }
+
+ // Eliminate multiple-entry loops.
+ addPass(createWebAssemblyFixIrreducibleControlFlow());
+
// Put the CFG in structured form; insert BLOCK and LOOP markers.
addPass(createWebAssemblyCFGStackify());
// Lower br_unless into br_if.
addPass(createWebAssemblyLowerBrUnless());
+ // Perform the very last peephole optimizations on the code.
+ if (getOptLevel() != CodeGenOpt::None)
+ addPass(createWebAssemblyPeephole());
+
// Create a mapping from LLVM CodeGen virtual registers to wasm registers.
addPass(createWebAssemblyRegNumbering());
-
- // Perform the very last peephole optimizations on the code.
- addPass(createWebAssemblyPeephole());
}
diff --git a/lib/Target/WebAssembly/WebAssemblyTargetMachine.h b/lib/Target/WebAssembly/WebAssemblyTargetMachine.h
index 3226edcdc614..52a2ef78736a 100644
--- a/lib/Target/WebAssembly/WebAssemblyTargetMachine.h
+++ b/lib/Target/WebAssembly/WebAssemblyTargetMachine.h
@@ -28,7 +28,7 @@ class WebAssemblyTargetMachine final : public LLVMTargetMachine {
public:
WebAssemblyTargetMachine(const Target &T, const Triple &TT, StringRef CPU,
StringRef FS, const TargetOptions &Options,
- Reloc::Model RM, CodeModel::Model CM,
+ Optional<Reloc::Model> RM, CodeModel::Model CM,
CodeGenOpt::Level OL);
~WebAssemblyTargetMachine() override;
@@ -44,6 +44,8 @@ public:
/// \brief Get the TargetIRAnalysis for this target.
TargetIRAnalysis getTargetIRAnalysis() override;
+
+ bool usesPhysRegsForPEI() const override { return false; }
};
} // end namespace llvm
diff --git a/lib/Target/WebAssembly/WebAssemblyTargetTransformInfo.cpp b/lib/Target/WebAssembly/WebAssemblyTargetTransformInfo.cpp
index 356631711921..bf546dab5fbb 100644
--- a/lib/Target/WebAssembly/WebAssemblyTargetTransformInfo.cpp
+++ b/lib/Target/WebAssembly/WebAssemblyTargetTransformInfo.cpp
@@ -25,3 +25,59 @@ WebAssemblyTTIImpl::getPopcntSupport(unsigned TyWidth) const {
assert(isPowerOf2_32(TyWidth) && "Ty width must be power of 2");
return TargetTransformInfo::PSK_FastHardware;
}
+
+unsigned WebAssemblyTTIImpl::getNumberOfRegisters(bool Vector) {
+ unsigned Result = BaseT::getNumberOfRegisters(Vector);
+
+ // For SIMD, use at least 16 registers, as a rough guess.
+ if (Vector)
+ Result = std::max(Result, 16u);
+
+ return Result;
+}
+
+unsigned WebAssemblyTTIImpl::getRegisterBitWidth(bool Vector) {
+ if (Vector && getST()->hasSIMD128())
+ return 128;
+
+ return 64;
+}
+
+unsigned WebAssemblyTTIImpl::getArithmeticInstrCost(
+ unsigned Opcode, Type *Ty, TTI::OperandValueKind Opd1Info,
+ TTI::OperandValueKind Opd2Info, TTI::OperandValueProperties Opd1PropInfo,
+ TTI::OperandValueProperties Opd2PropInfo) {
+
+ unsigned Cost = BasicTTIImplBase<WebAssemblyTTIImpl>::getArithmeticInstrCost(
+ Opcode, Ty, Opd1Info, Opd2Info, Opd1PropInfo, Opd2PropInfo);
+
+ if (VectorType *VTy = dyn_cast<VectorType>(Ty)) {
+ switch (Opcode) {
+ case Instruction::LShr:
+ case Instruction::AShr:
+ case Instruction::Shl:
+ // SIMD128's shifts currently only accept a scalar shift count. For each
+ // element, we'll need to extract, op, insert. The following is a rough
+      // approximation.
+ if (Opd2Info != TTI::OK_UniformValue &&
+ Opd2Info != TTI::OK_UniformConstantValue)
+ Cost = VTy->getNumElements() *
+ (TargetTransformInfo::TCC_Basic +
+ getArithmeticInstrCost(Opcode, VTy->getElementType()) +
+ TargetTransformInfo::TCC_Basic);
+ break;
+ }
+ }
+ return Cost;
+}
+
+unsigned WebAssemblyTTIImpl::getVectorInstrCost(unsigned Opcode, Type *Val,
+ unsigned Index) {
+ unsigned Cost = BasicTTIImplBase::getVectorInstrCost(Opcode, Val, Index);
+
+ // SIMD128's insert/extract currently only take constant indices.
+ if (Index == -1u)
+ return Cost + 25 * TargetTransformInfo::TCC_Expensive;
+
+ return Cost;
+}
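
The shift special case above is a per-element extract/op/insert estimate that applies only when the shift amount isn't uniform. A rough standalone model of the arithmetic, with all cost constants as stand-in values rather than LLVM's:

    // Assumes a 4-element vector, TCC_Basic == 1, scalar shift cost == 1.
    unsigned vectorShiftCost(unsigned numElements, unsigned tccBasic,
                             unsigned scalarShiftCost,
                             bool shiftAmountIsUniform) {
      if (shiftAmountIsUniform)
        return tccBasic; // SIMD128 takes a scalar (splatted) count directly.
      // Otherwise: extract each lane, shift it as a scalar, insert it back.
      return numElements * (tccBasic + scalarShiftCost + tccBasic);
    }
    // e.g. vectorShiftCost(4, 1, 1, false) == 12, versus 1 when uniform.
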
diff --git a/lib/Target/WebAssembly/WebAssemblyTargetTransformInfo.h b/lib/Target/WebAssembly/WebAssemblyTargetTransformInfo.h
index 26dc388cc922..fe99e96eb3b8 100644
--- a/lib/Target/WebAssembly/WebAssemblyTargetTransformInfo.h
+++ b/lib/Target/WebAssembly/WebAssemblyTargetTransformInfo.h
@@ -61,7 +61,15 @@ public:
/// \name Vector TTI Implementations
/// @{
- // TODO: Implement Vector TTI for WebAssembly
+ unsigned getNumberOfRegisters(bool Vector);
+ unsigned getRegisterBitWidth(bool Vector);
+ unsigned getArithmeticInstrCost(
+ unsigned Opcode, Type *Ty,
+ TTI::OperandValueKind Opd1Info = TTI::OK_AnyValue,
+ TTI::OperandValueKind Opd2Info = TTI::OK_AnyValue,
+ TTI::OperandValueProperties Opd1PropInfo = TTI::OP_None,
+ TTI::OperandValueProperties Opd2PropInfo = TTI::OP_None);
+ unsigned getVectorInstrCost(unsigned Opcode, Type *Val, unsigned Index);
/// @}
};
diff --git a/lib/Target/WebAssembly/known_gcc_test_failures.txt b/lib/Target/WebAssembly/known_gcc_test_failures.txt
index 91b3fff05dca..f07400021dc1 100644
--- a/lib/Target/WebAssembly/known_gcc_test_failures.txt
+++ b/lib/Target/WebAssembly/known_gcc_test_failures.txt
@@ -1,253 +1,15 @@
# Tests which are known to fail from the GCC torture test suite.
-# Core dump.
-920908-1.c
-pr38151.c
-va-arg-22.c
-
-# TargetRegisterInfo.h:315: static unsigned int llvm::TargetRegisterInfo::virtReg2Index(unsigned int): Assertion `isVirtualRegister(Reg) && "Not a virtual register"' failed.
-struct-ret-1.c
-va-arg-11.c
-va-arg-21.c
-va-arg-24.c
-va-arg-trap-1.c
-
-# WebAssemblyCFGStackify.cpp:211: void SortBlocks(llvm::MachineFunction&, const llvm::MachineLoopInfo&): Assertion `L->contains( MLI.getLoopFor(&*prev(MachineFunction::iterator(&MBB)))) && "Loop isn't contiguous"' failed.
-20000815-1.c
-20010129-1.c
-930628-1.c
-980707-1.c
-
-# WebAssemblyISelLowering.cpp:316: virtual llvm::SDValue llvm::WebAssemblyTargetLowering::LowerCall(llvm::TargetLowering::CallLoweringInfo&, llvm::SmallVectorImpl<llvm::SDValue>&) const: Assertion `!Out.Flags.isByVal() && "byval is not valid for return values"' failed.
-20030914-2.c
-20040703-1.c
-20081117-1.c
-920625-1.c
-931004-11.c
-931004-13.c
-980223.c
-bitfld-5.c
-complex-7.c
-pr38969.c
-pr51323.c
-pr52129.c
-pr57130.c
-
-# These were previously "Cannot select FrameIndex." Now most of them fail
-# because they contain call frame pseudos (e.g. call a vararg func),
-# frame pointers, or similar. This list will be updated again soon.
-20000519-1.c
-20000706-4.c
-20000706-5.c
-20000801-2.c
-20000801-4.c
-20011126-2.c
-
-20020529-1.c
-20021024-1.c
-
-20030828-1.c
-20030914-1.c
-
+# Computed gotos are not supported (Cannot select BlockAddress/BRIND)
20040302-1.c
-20040625-1.c
-20040823-1.c
-
-20041113-1.c
-
-20041214-1.c
-
-20050826-2.c
-
-20071213-1.c
-
-20080506-2.c
-20080519-1.c
-
-20081103-1.c
-20090113-1.c
-20090113-2.c
-20090113-3.c
-
-20090623-1.c
-
-920501-6.c
-920501-8.c
-920726-1.c
-930518-1.c
-
-931004-10.c
-931004-12.c
-931004-14.c
-931004-2.c
-931004-4.c
-931004-6.c
-931004-8.c
-
-980205.c
-980608-1.c
-980709-1.c
-980716-1.c
-990127-1.c
-
-991216-2.c
-
-#cbrt.c
-complex-5.c
-complex-6.c
-
-enum-3.c
-fprintf-chk-1.c
-frame-address.c
-loop-15.c
-loop-ivopts-2.c
-mayalias-3.c
-
-multi-ix.c
-
-pr20466-1.c
-
-
-pr28778.c
-pr28982b.c
-
-pr30778.c
-pr31448-2.c
-pr31448.c
-
-pr33870-1.c
-pr33870.c
-
-pr38051.c
-
-pr39100.c
-
-pr39339.c
-
-pr43987.c
-
-pr44575.c
-
-pr44942.c
-pr46309.c
-pr47538.c
-pr47925.c
-
-pr49390.c
-pr49419.c
-
-#pr51877.c
-
-#pr52979-1.c
-#pr52979-2.c
-pr53645-2.c
-pr53645.c
-
-pr56205.c
-
-pr56866.c
-
-pr57876.c
-pr58277-1.c
-
-pr59643.c
-
-printf-chk-1.c
-pta-field-1.c
-pta-field-2.c
-
-stdarg-1.c
-stdarg-2.c
-stdarg-3.c
-stdarg-4.c
-strct-stdarg-1.c
-strct-varg-1.c
-
-va-arg-1.c
-va-arg-10.c
-va-arg-12.c
-va-arg-13.c
-va-arg-14.c
-va-arg-15.c
-va-arg-16.c
-va-arg-17.c
-va-arg-18.c
-va-arg-19.c
-va-arg-2.c
-va-arg-20.c
-va-arg-23.c
-va-arg-26.c
-va-arg-4.c
-va-arg-5.c
-va-arg-6.c
-va-arg-7.c
-va-arg-8.c
-va-arg-9.c
-va-arg-pack-1.c
-vfprintf-1.c
-vfprintf-chk-1.c
-vprintf-1.c
-vprintf-chk-1.c
-
-# Cannot select callseq_end.
-20040811-1.c
-pr43220.c
-vla-dealloc-1.c
-
-# Cannot select brind.
20071210-1.c
920501-4.c
920501-5.c
-
-# Cannot select BlockAddress.
comp-goto-1.c
980526-1.c
990208-1.c
-# WebAssembly hasn't implemented byval arguments.
-20000412-3.c
-20000419-1.c
-20000706-1.c
-20000706-2.c
-20000707-1.c
-20000717-1.c
-20000717-5.c
-20000808-1.c
-20010605-2.c
-20011113-1.c
-20020215-1.c
-20020810-1.c
-20021118-1.c
-20040707-1.c
-20040709-1.c
-20040709-2.c
-20041201-1.c
-20050713-1.c
-20070614-1.c
-920908-2.c
-921112-1.c
-921117-1.c
-921123-2.c
-921204-1.c
-930126-1.c
-930208-1.c
-931004-5.c
-931004-9.c
-931031-1.c
-950607-2.c
-960416-1.c
-990525-1.c
-991118-1.c
-bf64-1.c
-complex-1.c
-complex-2.c
-pr15262-2.c
-pr20621-1.c
-pr23135.c
-pr30185.c
-pr42248.c
-
-# unimplemented operation lowering.
+# WebAssembly hasn't implemented (and may never implement) __builtin_return_address
20010122-1.c
20030323-1.c
20030811-1.c
@@ -255,7 +17,6 @@ pr17377.c
# Error: invalid output constraint '=t' in asm.
990413-2.c
-990826-0.c
# Error: __builtin_setjmp / __builtin_longjmp is not supported for the current target.
built-in-setjmp.c
@@ -300,10 +61,9 @@ pr51447.c
20070919-1.c
align-nest.c
pr41935.c
-20050107-1.c
-20050119-1.c
-20050119-2.c
920302-1.c
920501-3.c
920728-1.c
pr28865.c
+widechar-2.c
+pr41463.c