-rw-r--r--.gitignore4
-rwxr-xr-xautoconf/config.guess3
-rw-r--r--bindings/python/llvm/object.py11
-rwxr-xr-xcmake/modules/AddLLVM.cmake31
-rw-r--r--docs/ExceptionHandling.rst12
-rw-r--r--docs/LangRef.rst827
-rw-r--r--docs/ProgrammersManual.rst2
-rw-r--r--docs/StackMaps.rst10
-rw-r--r--include/llvm-c/Core.h14
-rw-r--r--include/llvm-c/Object.h1
-rw-r--r--include/llvm-c/lto.h5
-rw-r--r--include/llvm/ADT/APFloat.h4
-rw-r--r--include/llvm/ADT/Triple.h16
-rw-r--r--include/llvm/ADT/edit_distance.h45
-rw-r--r--include/llvm/Analysis/AliasAnalysis.h12
-rw-r--r--include/llvm/Analysis/ConstantFolding.h11
-rw-r--r--include/llvm/Analysis/DominanceFrontier.h4
-rw-r--r--include/llvm/Analysis/IVUsers.h7
-rw-r--r--include/llvm/Analysis/InstructionSimplify.h20
-rw-r--r--include/llvm/Analysis/JumpInstrTableInfo.h71
-rw-r--r--include/llvm/Analysis/LibCallSemantics.h12
-rw-r--r--include/llvm/Analysis/LoopAccessAnalysis.h201
-rw-r--r--include/llvm/Analysis/LoopInfo.h8
-rw-r--r--include/llvm/Analysis/RegionInfo.h6
-rw-r--r--include/llvm/Analysis/TargetTransformInfo.h8
-rw-r--r--include/llvm/Analysis/TargetTransformInfoImpl.h48
-rw-r--r--include/llvm/Analysis/VectorUtils.h28
-rw-r--r--include/llvm/Bitcode/LLVMBitCodes.h1
-rw-r--r--include/llvm/Bitcode/ReaderWriter.h2
-rw-r--r--include/llvm/CodeGen/Analysis.h2
-rw-r--r--include/llvm/CodeGen/BasicTTIImpl.h56
-rw-r--r--include/llvm/CodeGen/CommandFlags.h4
-rw-r--r--include/llvm/CodeGen/ISDOpcodes.h13
-rw-r--r--include/llvm/CodeGen/LiveIntervalUnion.h5
-rw-r--r--include/llvm/CodeGen/LiveRegMatrix.h5
-rw-r--r--include/llvm/CodeGen/MIRYamlMapping.h163
-rw-r--r--include/llvm/CodeGen/MachineConstantPool.h9
-rw-r--r--include/llvm/CodeGen/MachineDominators.h4
-rw-r--r--include/llvm/CodeGen/MachineFrameInfo.h22
-rw-r--r--include/llvm/CodeGen/MachineFunction.h3
-rw-r--r--include/llvm/CodeGen/MachineLoopInfo.h9
-rw-r--r--include/llvm/CodeGen/MachineModuleInfo.h1
-rw-r--r--include/llvm/CodeGen/MachineRegionInfo.h7
-rw-r--r--include/llvm/CodeGen/MachineRegisterInfo.h61
-rw-r--r--include/llvm/CodeGen/Passes.h32
-rw-r--r--include/llvm/CodeGen/RegisterPressure.h2
-rw-r--r--include/llvm/CodeGen/SelectionDAG.h9
-rw-r--r--include/llvm/CodeGen/SelectionDAGNodes.h16
-rw-r--r--include/llvm/CodeGen/StackMaps.h58
-rw-r--r--include/llvm/CodeGen/WinEHFuncInfo.h4
-rw-r--r--include/llvm/ExecutionEngine/ExecutionEngine.h9
-rw-r--r--include/llvm/ExecutionEngine/RuntimeDyld.h4
-rw-r--r--include/llvm/IR/Attributes.h2
-rw-r--r--include/llvm/IR/CallSite.h9
-rw-r--r--include/llvm/IR/DIBuilder.h633
-rw-r--r--include/llvm/IR/DebugInfoMetadata.h20
-rw-r--r--include/llvm/IR/Dominators.h20
-rw-r--r--include/llvm/IR/Function.h10
-rw-r--r--include/llvm/IR/GlobalValue.h13
-rw-r--r--include/llvm/IR/IRBuilder.h77
-rw-r--r--include/llvm/IR/Instruction.h2
-rw-r--r--include/llvm/IR/Instructions.h26
-rw-r--r--include/llvm/IR/Intrinsics.td19
-rw-r--r--include/llvm/IR/IntrinsicsPowerPC.td15
-rw-r--r--include/llvm/IR/IntrinsicsWebAssembly.td16
-rw-r--r--include/llvm/IR/IntrinsicsX86.td29
-rw-r--r--include/llvm/IR/Operator.h3
-rw-r--r--include/llvm/IR/Value.h4
-rw-r--r--include/llvm/InitializePasses.h2
-rw-r--r--include/llvm/LinkAllPasses.h1
-rw-r--r--include/llvm/MC/MCContext.h2
-rw-r--r--include/llvm/MC/MCDwarf.h27
-rw-r--r--include/llvm/MC/MCInstrDesc.h5
-rw-r--r--include/llvm/MC/MCSchedule.h22
-rw-r--r--include/llvm/MC/MCSubtargetInfo.h43
-rw-r--r--include/llvm/MC/MCSymbol.h4
-rw-r--r--include/llvm/MC/MCSymbolMachO.h2
-rw-r--r--include/llvm/MC/MCTargetOptions.h2
-rw-r--r--include/llvm/Object/Archive.h11
-rw-r--r--include/llvm/Object/ArchiveWriter.h4
-rw-r--r--include/llvm/Object/COFF.h8
-rw-r--r--include/llvm/Object/ELF.h101
-rw-r--r--include/llvm/Object/ELFObjectFile.h58
-rw-r--r--include/llvm/Object/ELFTypes.h18
-rw-r--r--include/llvm/Object/ELFYAML.h16
-rw-r--r--include/llvm/Object/MachO.h9
-rw-r--r--include/llvm/Object/ObjectFile.h19
-rw-r--r--include/llvm/Object/RelocVisitor.h15
-rw-r--r--include/llvm/Object/SymbolicFile.h2
-rw-r--r--include/llvm/Support/COFF.h1
-rw-r--r--include/llvm/Support/CommandLine.h28
-rw-r--r--include/llvm/Support/Compiler.h13
-rw-r--r--include/llvm/Support/OnDiskHashTable.h16
-rw-r--r--include/llvm/Support/TargetRegistry.h44
-rw-r--r--include/llvm/Support/raw_ostream.h6
-rw-r--r--include/llvm/TableGen/Record.h6
-rw-r--r--include/llvm/Target/Target.td4
-rw-r--r--include/llvm/Target/TargetFrameLowering.h17
-rw-r--r--include/llvm/Target/TargetLowering.h75
-rw-r--r--include/llvm/Target/TargetMachine.h8
-rw-r--r--include/llvm/Target/TargetOpcodes.h8
-rw-r--r--include/llvm/Target/TargetSelectionDAGInfo.h9
-rw-r--r--include/llvm/Target/TargetSubtargetInfo.h10
-rw-r--r--include/llvm/Transforms/IPO.h6
-rw-r--r--include/llvm/Transforms/IPO/PassManagerBuilder.h1
-rw-r--r--include/llvm/Transforms/Utils/Cloning.h16
-rw-r--r--include/llvm/Transforms/Utils/LoopVersioning.h100
-rw-r--r--lib/Analysis/AliasAnalysis.cpp5
-rw-r--r--lib/Analysis/AliasDebugger.cpp4
-rw-r--r--lib/Analysis/AliasSetTracker.cpp3
-rw-r--r--lib/Analysis/BasicAliasAnalysis.cpp6
-rw-r--r--lib/Analysis/ConstantFolding.cpp6
-rw-r--r--lib/Analysis/IPA/GlobalsModRef.cpp320
-rw-r--r--lib/Analysis/IPA/InlineCost.cpp6
-rw-r--r--lib/Analysis/IVUsers.cpp17
-rw-r--r--lib/Analysis/InstructionSimplify.cpp114
-rw-r--r--lib/Analysis/LoopAccessAnalysis.cpp447
-rw-r--r--lib/Analysis/NoAliasAnalysis.cpp1
-rw-r--r--lib/Analysis/TargetTransformInfo.cpp6
-rw-r--r--lib/Analysis/ValueTracking.cpp2
-rw-r--r--lib/Analysis/VectorUtils.cpp198
-rw-r--r--lib/AsmParser/LLLexer.cpp1
-rw-r--r--lib/AsmParser/LLParser.cpp98
-rw-r--r--lib/AsmParser/LLToken.h1
-rw-r--r--lib/Bitcode/Reader/BitcodeReader.cpp75
-rw-r--r--lib/Bitcode/Writer/BitcodeWriter.cpp8
-rw-r--r--lib/CodeGen/Analysis.cpp35
-rw-r--r--lib/CodeGen/AsmPrinter/ARMException.cpp38
-rw-r--r--lib/CodeGen/AsmPrinter/AsmPrinter.cpp5
-rw-r--r--lib/CodeGen/AsmPrinter/DwarfCFIException.cpp28
-rw-r--r--lib/CodeGen/AsmPrinter/DwarfDebug.h225
-rw-r--r--lib/CodeGen/AsmPrinter/DwarfUnit.h74
-rw-r--r--lib/CodeGen/AsmPrinter/EHStreamer.cpp2
-rw-r--r--lib/CodeGen/AsmPrinter/WinCodeViewLineTables.cpp30
-rw-r--r--lib/CodeGen/AsmPrinter/WinCodeViewLineTables.h8
-rw-r--r--lib/CodeGen/AsmPrinter/WinException.cpp32
-rw-r--r--lib/CodeGen/BasicTargetTransformInfo.cpp3
-rw-r--r--lib/CodeGen/CodeGenPrepare.cpp146
-rw-r--r--lib/CodeGen/DeadMachineInstructionElim.cpp2
-rw-r--r--lib/CodeGen/ExecutionDepsFix.cpp12
-rw-r--r--lib/CodeGen/GlobalMerge.cpp29
-rw-r--r--lib/CodeGen/ImplicitNullChecks.cpp93
-rw-r--r--lib/CodeGen/LLVMTargetMachine.cpp18
-rw-r--r--lib/CodeGen/LiveRegMatrix.cpp12
-rw-r--r--lib/CodeGen/MIRParser/MILexer.cpp30
-rw-r--r--lib/CodeGen/MIRParser/MILexer.h21
-rw-r--r--lib/CodeGen/MIRParser/MIParser.cpp269
-rw-r--r--lib/CodeGen/MIRParser/MIParser.h16
-rw-r--r--lib/CodeGen/MIRParser/MIRParser.cpp164
-rw-r--r--lib/CodeGen/MIRPrinter.cpp165
-rw-r--r--lib/CodeGen/MachineDominators.cpp4
-rw-r--r--lib/CodeGen/MachineFunction.cpp40
-rw-r--r--lib/CodeGen/MachineModuleInfo.cpp3
-rw-r--r--lib/CodeGen/MachineRegisterInfo.cpp48
-rw-r--r--lib/CodeGen/MachineTraceMetrics.cpp47
-rw-r--r--lib/CodeGen/Passes.cpp12
-rw-r--r--lib/CodeGen/PrologEpilogInserter.cpp57
-rw-r--r--lib/CodeGen/RegAllocFast.cpp14
-rw-r--r--lib/CodeGen/RegAllocGreedy.cpp42
-rw-r--r--lib/CodeGen/RegisterPressure.cpp12
-rw-r--r--lib/CodeGen/SelectionDAG/DAGCombiner.cpp145
-rw-r--r--lib/CodeGen/SelectionDAG/FastISel.cpp36
-rw-r--r--lib/CodeGen/SelectionDAG/FunctionLoweringInfo.cpp25
-rw-r--r--lib/CodeGen/SelectionDAG/InstrEmitter.cpp4
-rw-r--r--lib/CodeGen/SelectionDAG/LegalizeDAG.cpp263
-rw-r--r--lib/CodeGen/SelectionDAG/LegalizeFloatTypes.cpp30
-rw-r--r--lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp123
-rw-r--r--lib/CodeGen/SelectionDAG/LegalizeTypes.cpp8
-rw-r--r--lib/CodeGen/SelectionDAG/LegalizeTypes.h6
-rw-r--r--lib/CodeGen/SelectionDAG/LegalizeTypesGeneric.cpp43
-rw-r--r--lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp37
-rw-r--r--lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp198
-rw-r--r--lib/CodeGen/SelectionDAG/ScheduleDAGFast.cpp5
-rw-r--r--lib/CodeGen/SelectionDAG/ScheduleDAGSDNodes.cpp10
-rw-r--r--lib/CodeGen/SelectionDAG/SelectionDAG.cpp193
-rw-r--r--lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp555
-rw-r--r--lib/CodeGen/SelectionDAG/SelectionDAGBuilder.h6
-rw-r--r--lib/CodeGen/SelectionDAG/SelectionDAGDumper.cpp2
-rw-r--r--lib/CodeGen/SelectionDAG/SelectionDAGISel.cpp50
-rw-r--r--lib/CodeGen/SelectionDAG/StatepointLowering.cpp6
-rw-r--r--lib/CodeGen/SelectionDAG/TargetLowering.cpp178
-rw-r--r--lib/CodeGen/SelectionDAG/TargetSelectionDAGInfo.cpp4
-rw-r--r--lib/CodeGen/SjLjEHPrepare.cpp14
-rw-r--r--lib/CodeGen/StackMapLivenessAnalysis.cpp45
-rw-r--r--lib/CodeGen/StackMaps.cpp165
-rw-r--r--lib/CodeGen/StackProtector.cpp2
-rw-r--r--lib/CodeGen/TargetFrameLoweringImpl.cpp30
-rw-r--r--lib/CodeGen/TargetLoweringBase.cpp49
-rw-r--r--lib/CodeGen/TwoAddressInstructionPass.cpp8
-rw-r--r--lib/CodeGen/VirtRegMap.cpp73
-rw-r--r--lib/CodeGen/WinEHPrepare.cpp75
-rw-r--r--lib/DebugInfo/DWARF/DWARFContext.cpp8
-rw-r--r--lib/ExecutionEngine/IntelJITEvents/CMakeLists.txt4
-rw-r--r--lib/ExecutionEngine/IntelJITEvents/IntelJITEventListener.cpp108
-rw-r--r--lib/ExecutionEngine/IntelJITEvents/LLVMBuild.txt2
-rw-r--r--lib/ExecutionEngine/OProfileJIT/LLVMBuild.txt1
-rw-r--r--lib/ExecutionEngine/OProfileJIT/OProfileJITEventListener.cpp43
-rw-r--r--lib/ExecutionEngine/RuntimeDyld/RuntimeDyld.cpp46
-rw-r--r--lib/ExecutionEngine/RuntimeDyld/RuntimeDyldChecker.cpp4
-rw-r--r--lib/ExecutionEngine/RuntimeDyld/RuntimeDyldELF.cpp61
-rw-r--r--lib/ExecutionEngine/RuntimeDyld/RuntimeDyldMachO.cpp14
-rw-r--r--lib/ExecutionEngine/RuntimeDyld/RuntimeDyldMachO.h1
-rw-r--r--lib/ExecutionEngine/RuntimeDyld/Targets/RuntimeDyldMachOAArch64.h2
-rw-r--r--lib/ExecutionEngine/RuntimeDyld/Targets/RuntimeDyldMachOARM.h2
-rw-r--r--lib/ExecutionEngine/RuntimeDyld/Targets/RuntimeDyldMachOI386.h2
-rw-r--r--lib/ExecutionEngine/RuntimeDyld/Targets/RuntimeDyldMachOX86_64.h2
-rw-r--r--lib/IR/Attributes.cpp8
-rw-r--r--lib/IR/AutoUpgrade.cpp17
-rw-r--r--lib/IR/BasicBlock.cpp51
-rw-r--r--lib/IR/Core.cpp8
-rw-r--r--lib/IR/DIBuilder.cpp58
-rw-r--r--lib/IR/Dominators.cpp20
-rw-r--r--lib/IR/Value.cpp2
-rw-r--r--lib/IR/Verifier.cpp58
-rw-r--r--lib/LTO/LTOModule.cpp3
-rw-r--r--lib/LibDriver/LibDriver.cpp19
-rw-r--r--lib/MC/CMakeLists.txt1
-rw-r--r--lib/MC/MCAsmStreamer.cpp3
-rw-r--r--lib/MC/MCAssembler.cpp2
-rw-r--r--lib/MC/MCDisassembler/MCExternalSymbolizer.cpp6
-rw-r--r--lib/MC/MCInstrDesc.cpp2
-rw-r--r--lib/MC/MCSchedule.cpp34
-rw-r--r--lib/MC/MCSubtargetInfo.cpp49
-rw-r--r--lib/MC/MCSymbol.cpp3
-rw-r--r--lib/Object/Archive.cpp49
-rw-r--r--lib/Object/ArchiveWriter.cpp211
-rw-r--r--lib/Object/COFFObjectFile.cpp36
-rw-r--r--lib/Object/ELFYAML.cpp10
-rw-r--r--lib/Object/MachOObjectFile.cpp43
-rw-r--r--lib/Object/Object.cpp15
-rw-r--r--lib/Object/ObjectFile.cpp9
-rw-r--r--lib/Support/APFloat.cpp23
-rw-r--r--lib/Support/CommandLine.cpp30
-rw-r--r--lib/Support/Triple.cpp116
-rw-r--r--lib/TableGen/Record.cpp2
-rw-r--r--lib/TableGen/SetTheory.cpp2
-rw-r--r--lib/TableGen/TGParser.cpp2
-rw-r--r--lib/Target/AArch64/AArch64A57FPLoadBalancing.cpp15
-rw-r--r--lib/Target/AArch64/AArch64CallingConvention.td5
-rw-r--r--lib/Target/AArch64/AArch64FastISel.cpp43
-rw-r--r--lib/Target/AArch64/AArch64FrameLowering.cpp39
-rw-r--r--lib/Target/AArch64/AArch64FrameLowering.h4
-rw-r--r--lib/Target/AArch64/AArch64ISelDAGToDAG.cpp50
-rw-r--r--lib/Target/AArch64/AArch64ISelLowering.cpp189
-rw-r--r--lib/Target/AArch64/AArch64ISelLowering.h21
-rw-r--r--lib/Target/AArch64/AArch64InstrInfo.td11
-rw-r--r--lib/Target/AArch64/AArch64RegisterInfo.cpp20
-rw-r--r--lib/Target/AArch64/AArch64SelectionDAGInfo.cpp9
-rw-r--r--lib/Target/AArch64/AArch64SelectionDAGInfo.h2
-rw-r--r--lib/Target/AArch64/AArch64Subtarget.cpp12
-rw-r--r--lib/Target/AArch64/AArch64TargetTransformInfo.cpp16
-rw-r--r--lib/Target/AArch64/AArch64TargetTransformInfo.h24
-rw-r--r--lib/Target/AArch64/Disassembler/AArch64Disassembler.cpp2
-rw-r--r--lib/Target/AArch64/MCTargetDesc/AArch64ELFStreamer.cpp2
-rw-r--r--lib/Target/AArch64/MCTargetDesc/AArch64MCTargetDesc.cpp15
-rw-r--r--lib/Target/AMDGPU/AMDGPU.td10
-rw-r--r--lib/Target/AMDGPU/AMDGPUAlwaysInlinePass.cpp11
-rw-r--r--lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp34
-rw-r--r--lib/Target/AMDGPU/AMDGPUISelLowering.cpp129
-rw-r--r--lib/Target/AMDGPU/AMDGPUISelLowering.h13
-rw-r--r--lib/Target/AMDGPU/AMDGPUSubtarget.cpp1
-rw-r--r--lib/Target/AMDGPU/AMDGPUSubtarget.h5
-rw-r--r--lib/Target/AMDGPU/AMDGPUTargetMachine.cpp9
-rw-r--r--lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h17
-rw-r--r--lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCTargetDesc.cpp13
-rw-r--r--lib/Target/AMDGPU/R600ISelLowering.cpp31
-rw-r--r--lib/Target/AMDGPU/R600ISelLowering.h4
-rw-r--r--lib/Target/AMDGPU/SIFoldOperands.cpp31
-rw-r--r--lib/Target/AMDGPU/SIISelLowering.cpp57
-rw-r--r--lib/Target/AMDGPU/SIISelLowering.h15
-rw-r--r--lib/Target/AMDGPU/SIInstrInfo.cpp64
-rw-r--r--lib/Target/AMDGPU/SIInstrInfo.h4
-rw-r--r--lib/Target/AMDGPU/SIInstrInfo.td9
-rw-r--r--lib/Target/AMDGPU/SIInstructions.td14
-rw-r--r--lib/Target/AMDGPU/SILoadStoreOptimizer.cpp44
-rw-r--r--lib/Target/AMDGPU/SIMachineFunctionInfo.cpp1
-rw-r--r--lib/Target/AMDGPU/SIPrepareScratchRegs.cpp1
-rw-r--r--lib/Target/AMDGPU/SIRegisterInfo.cpp2
-rw-r--r--lib/Target/AMDGPU/SIShrinkInstructions.cpp39
-rw-r--r--lib/Target/ARM/ARM.td4
-rw-r--r--lib/Target/ARM/ARMBaseInstrInfo.cpp3
-rw-r--r--lib/Target/ARM/ARMBaseRegisterInfo.cpp20
-rw-r--r--lib/Target/ARM/ARMCallingConv.td3
-rw-r--r--lib/Target/ARM/ARMFastISel.cpp45
-rw-r--r--lib/Target/ARM/ARMFrameLowering.cpp43
-rw-r--r--lib/Target/ARM/ARMFrameLowering.h4
-rw-r--r--lib/Target/ARM/ARMISelDAGToDAG.cpp47
-rw-r--r--lib/Target/ARM/ARMISelLowering.cpp328
-rw-r--r--lib/Target/ARM/ARMISelLowering.h20
-rw-r--r--lib/Target/ARM/ARMInstrInfo.td4
-rw-r--r--lib/Target/ARM/ARMLoadStoreOptimizer.cpp1453
-rw-r--r--lib/Target/ARM/ARMSelectionDAGInfo.cpp22
-rw-r--r--lib/Target/ARM/ARMSelectionDAGInfo.h2
-rw-r--r--lib/Target/ARM/ARMSubtarget.cpp38
-rw-r--r--lib/Target/ARM/ARMSubtarget.h4
-rw-r--r--lib/Target/ARM/ARMTargetMachine.cpp7
-rw-r--r--lib/Target/ARM/ARMTargetTransformInfo.cpp24
-rw-r--r--lib/Target/ARM/ARMTargetTransformInfo.h15
-rw-r--r--lib/Target/ARM/AsmParser/ARMAsmParser.cpp166
-rw-r--r--lib/Target/ARM/MCTargetDesc/ARMELFStreamer.cpp2
-rw-r--r--lib/Target/ARM/MCTargetDesc/ARMMCTargetDesc.cpp23
-rw-r--r--lib/Target/ARM/Thumb1FrameLowering.cpp2
-rw-r--r--lib/Target/BPF/BPFFrameLowering.cpp16
-rw-r--r--lib/Target/BPF/BPFFrameLowering.h4
-rw-r--r--lib/Target/BPF/BPFISelLowering.cpp11
-rw-r--r--lib/Target/BPF/BPFSubtarget.cpp2
-rw-r--r--lib/Target/BPF/MCTargetDesc/BPFMCTargetDesc.cpp8
-rw-r--r--lib/Target/CppBackend/CPPBackend.cpp4
-rw-r--r--lib/Target/CppBackend/CPPTargetMachine.h3
-rw-r--r--lib/Target/Hexagon/BitTracker.cpp1127
-rw-r--r--lib/Target/Hexagon/BitTracker.h449
-rw-r--r--lib/Target/Hexagon/CMakeLists.txt6
-rw-r--r--lib/Target/Hexagon/HexagonBitTracker.cpp1174
-rw-r--r--lib/Target/Hexagon/HexagonBitTracker.h64
-rw-r--r--lib/Target/Hexagon/HexagonCommonGEP.cpp1325
-rw-r--r--lib/Target/Hexagon/HexagonExpandCondsets.cpp9
-rw-r--r--lib/Target/Hexagon/HexagonFrameLowering.cpp15
-rw-r--r--lib/Target/Hexagon/HexagonFrameLowering.h2
-rw-r--r--lib/Target/Hexagon/HexagonGenExtract.cpp259
-rw-r--r--lib/Target/Hexagon/HexagonGenInsert.cpp1598
-rw-r--r--lib/Target/Hexagon/HexagonGenPredicate.cpp525
-rw-r--r--lib/Target/Hexagon/HexagonISelLowering.cpp83
-rw-r--r--lib/Target/Hexagon/HexagonISelLowering.h29
-rw-r--r--lib/Target/Hexagon/HexagonRegisterInfo.cpp5
-rw-r--r--lib/Target/Hexagon/HexagonSelectionDAGInfo.cpp6
-rw-r--r--lib/Target/Hexagon/HexagonSelectionDAGInfo.h2
-rw-r--r--lib/Target/Hexagon/HexagonSubtarget.cpp2
-rw-r--r--lib/Target/Hexagon/HexagonTargetMachine.cpp56
-rw-r--r--lib/Target/Hexagon/LLVMBuild.txt1
-rw-r--r--lib/Target/Hexagon/MCTargetDesc/HexagonMCTargetDesc.cpp9
-rw-r--r--lib/Target/MSP430/CMakeLists.txt1
-rw-r--r--lib/Target/MSP430/MCTargetDesc/MSP430MCTargetDesc.cpp9
-rw-r--r--lib/Target/MSP430/MSP430ISelDAGToDAG.cpp9
-rw-r--r--lib/Target/MSP430/MSP430ISelLowering.cpp74
-rw-r--r--lib/Target/MSP430/MSP430ISelLowering.h9
-rw-r--r--lib/Target/MSP430/MSP430RegisterInfo.cpp9
-rw-r--r--lib/Target/MSP430/MSP430SelectionDAGInfo.cpp23
-rw-r--r--lib/Target/MSP430/MSP430SelectionDAGInfo.h31
-rw-r--r--lib/Target/MSP430/MSP430Subtarget.cpp3
-rw-r--r--lib/Target/MSP430/MSP430Subtarget.h6
-rw-r--r--lib/Target/Mips/AsmParser/MipsAsmParser.cpp167
-rw-r--r--lib/Target/Mips/CMakeLists.txt1
-rw-r--r--lib/Target/Mips/MCTargetDesc/MipsMCTargetDesc.cpp8
-rw-r--r--lib/Target/Mips/Mips16FrameLowering.cpp11
-rw-r--r--lib/Target/Mips/Mips16FrameLowering.h4
-rw-r--r--lib/Target/Mips/Mips16ISelDAGToDAG.cpp10
-rw-r--r--lib/Target/Mips/Mips16ISelLowering.cpp3
-rw-r--r--lib/Target/Mips/MipsFastISel.cpp38
-rw-r--r--lib/Target/Mips/MipsISelDAGToDAG.cpp5
-rw-r--r--lib/Target/Mips/MipsISelLowering.cpp91
-rw-r--r--lib/Target/Mips/MipsISelLowering.h24
-rw-r--r--lib/Target/Mips/MipsSEFrameLowering.cpp19
-rw-r--r--lib/Target/Mips/MipsSEFrameLowering.h4
-rw-r--r--lib/Target/Mips/MipsSEISelDAGToDAG.cpp2
-rw-r--r--lib/Target/Mips/MipsSEISelLowering.cpp5
-rw-r--r--lib/Target/Mips/MipsSelectionDAGInfo.cpp23
-rw-r--r--lib/Target/Mips/MipsSelectionDAGInfo.h31
-rw-r--r--lib/Target/Mips/MipsSubtarget.cpp2
-rw-r--r--lib/Target/Mips/MipsSubtarget.h6
-rw-r--r--lib/Target/Mips/MipsTargetMachine.cpp4
-rw-r--r--lib/Target/NVPTX/MCTargetDesc/NVPTXMCTargetDesc.cpp12
-rw-r--r--lib/Target/NVPTX/NVPTXAsmPrinter.cpp14
-rw-r--r--lib/Target/NVPTX/NVPTXISelLowering.cpp202
-rw-r--r--lib/Target/NVPTX/NVPTXISelLowering.h17
-rw-r--r--lib/Target/NVPTX/NVPTXLowerAggrCopies.cpp32
-rw-r--r--lib/Target/NVPTX/NVPTXReplaceImageHandles.cpp4
-rw-r--r--lib/Target/NVPTX/NVPTXSubtarget.cpp2
-rw-r--r--lib/Target/NVPTX/NVPTXTargetMachine.cpp5
-rw-r--r--lib/Target/NVPTX/NVPTXTargetTransformInfo.cpp14
-rw-r--r--lib/Target/NVPTX/NVPTXTargetTransformInfo.h19
-rw-r--r--lib/Target/PowerPC/CMakeLists.txt1
-rw-r--r--lib/Target/PowerPC/MCTargetDesc/PPCMCTargetDesc.cpp23
-rw-r--r--lib/Target/PowerPC/PPCAsmPrinter.cpp50
-rw-r--r--lib/Target/PowerPC/PPCCTRLoops.cpp5
-rw-r--r--lib/Target/PowerPC/PPCCallingConv.td3
-rw-r--r--lib/Target/PowerPC/PPCFastISel.cpp37
-rw-r--r--lib/Target/PowerPC/PPCFrameLowering.cpp20
-rw-r--r--lib/Target/PowerPC/PPCFrameLowering.h4
-rw-r--r--lib/Target/PowerPC/PPCISelDAGToDAG.cpp43
-rw-r--r--lib/Target/PowerPC/PPCISelLowering.cpp260
-rw-r--r--lib/Target/PowerPC/PPCISelLowering.h29
-rw-r--r--lib/Target/PowerPC/PPCInstrInfo.cpp33
-rw-r--r--lib/Target/PowerPC/PPCInstrInfo.h4
-rw-r--r--lib/Target/PowerPC/PPCInstrVSX.td15
-rw-r--r--lib/Target/PowerPC/PPCRegisterInfo.cpp28
-rw-r--r--lib/Target/PowerPC/PPCScheduleP7.td4
-rw-r--r--lib/Target/PowerPC/PPCScheduleP8.td4
-rw-r--r--lib/Target/PowerPC/PPCSelectionDAGInfo.cpp22
-rw-r--r--lib/Target/PowerPC/PPCSelectionDAGInfo.h31
-rw-r--r--lib/Target/PowerPC/PPCSubtarget.cpp2
-rw-r--r--lib/Target/PowerPC/PPCSubtarget.h6
-rw-r--r--lib/Target/PowerPC/PPCTargetMachine.cpp21
-rw-r--r--lib/Target/PowerPC/PPCTargetMachine.h2
-rw-r--r--lib/Target/PowerPC/PPCTargetTransformInfo.cpp2
-rw-r--r--lib/Target/PowerPC/PPCTargetTransformInfo.h15
-rw-r--r--lib/Target/PowerPC/PPCVSXFMAMutate.cpp10
-rw-r--r--lib/Target/PowerPC/PPCVSXSwapRemoval.cpp209
-rw-r--r--lib/Target/Sparc/CMakeLists.txt1
-rw-r--r--lib/Target/Sparc/MCTargetDesc/SparcMCTargetDesc.cpp12
-rw-r--r--lib/Target/Sparc/SparcFrameLowering.cpp26
-rw-r--r--lib/Target/Sparc/SparcFrameLowering.h4
-rw-r--r--lib/Target/Sparc/SparcISelDAGToDAG.cpp31
-rw-r--r--lib/Target/Sparc/SparcISelLowering.cpp93
-rw-r--r--lib/Target/Sparc/SparcISelLowering.h12
-rw-r--r--lib/Target/Sparc/SparcInstrAliases.td135
-rw-r--r--lib/Target/Sparc/SparcInstrInfo.cpp9
-rw-r--r--lib/Target/Sparc/SparcInstrInfo.td19
-rw-r--r--lib/Target/Sparc/SparcRegisterInfo.td4
-rw-r--r--lib/Target/Sparc/SparcSelectionDAGInfo.cpp24
-rw-r--r--lib/Target/Sparc/SparcSelectionDAGInfo.h31
-rw-r--r--lib/Target/Sparc/SparcSubtarget.cpp2
-rw-r--r--lib/Target/Sparc/SparcSubtarget.h6
-rw-r--r--lib/Target/SystemZ/MCTargetDesc/SystemZMCTargetDesc.cpp9
-rw-r--r--lib/Target/SystemZ/SystemZFrameLowering.cpp19
-rw-r--r--lib/Target/SystemZ/SystemZFrameLowering.h4
-rw-r--r--lib/Target/SystemZ/SystemZISelLowering.cpp52
-rw-r--r--lib/Target/SystemZ/SystemZISelLowering.h17
-rw-r--r--lib/Target/SystemZ/SystemZRegisterInfo.cpp6
-rw-r--r--lib/Target/SystemZ/SystemZSelectionDAGInfo.cpp6
-rw-r--r--lib/Target/SystemZ/SystemZSelectionDAGInfo.h3
-rw-r--r--lib/Target/SystemZ/SystemZSubtarget.cpp2
-rw-r--r--lib/Target/SystemZ/SystemZTargetTransformInfo.h15
-rw-r--r--lib/Target/TargetMachine.cpp5
-rw-r--r--lib/Target/TargetSubtargetInfo.cpp9
-rw-r--r--lib/Target/WebAssembly/CMakeLists.txt1
-rw-r--r--lib/Target/WebAssembly/MCTargetDesc/WebAssemblyMCTargetDesc.cpp3
-rw-r--r--lib/Target/WebAssembly/MCTargetDesc/WebAssemblyMCTargetDesc.h3
-rw-r--r--lib/Target/WebAssembly/Makefile3
-rw-r--r--lib/Target/WebAssembly/README.txt11
-rw-r--r--lib/Target/WebAssembly/WebAssemblyISelLowering.cpp2
-rw-r--r--lib/Target/WebAssembly/WebAssemblyInstrAtomics.td7
-rw-r--r--lib/Target/WebAssembly/WebAssemblyInstrCall.td21
-rw-r--r--lib/Target/WebAssembly/WebAssemblyInstrConv.td44
-rw-r--r--lib/Target/WebAssembly/WebAssemblyInstrFloat.td44
-rw-r--r--lib/Target/WebAssembly/WebAssemblyInstrFormats.td33
-rw-r--r--lib/Target/WebAssembly/WebAssemblyInstrInfo.td19
-rw-r--r--lib/Target/WebAssembly/WebAssemblyInstrInteger.td45
-rw-r--r--lib/Target/WebAssembly/WebAssemblyInstrMemory.td46
-rw-r--r--lib/Target/WebAssembly/WebAssemblyInstrSIMD.td7
-rw-r--r--lib/Target/WebAssembly/WebAssemblyRegisterInfo.cpp56
-rw-r--r--lib/Target/WebAssembly/WebAssemblyRegisterInfo.h19
-rw-r--r--lib/Target/WebAssembly/WebAssemblyRegisterInfo.td34
-rw-r--r--lib/Target/WebAssembly/WebAssemblySelectionDAGInfo.cpp3
-rw-r--r--lib/Target/WebAssembly/WebAssemblySelectionDAGInfo.h1
-rw-r--r--lib/Target/WebAssembly/WebAssemblySubtarget.cpp4
-rw-r--r--lib/Target/WebAssembly/WebAssemblyTargetTransformInfo.h24
-rw-r--r--lib/Target/X86/InstPrinter/X86InstComments.cpp23
-rw-r--r--lib/Target/X86/MCTargetDesc/X86AsmBackend.cpp1
-rw-r--r--lib/Target/X86/MCTargetDesc/X86ELFRelocationInfo.cpp6
-rw-r--r--lib/Target/X86/MCTargetDesc/X86MCTargetDesc.cpp32
-rw-r--r--lib/Target/X86/MCTargetDesc/X86MachORelocationInfo.cpp5
-rw-r--r--lib/Target/X86/Utils/X86ShuffleDecode.cpp82
-rw-r--r--lib/Target/X86/Utils/X86ShuffleDecode.h8
-rw-r--r--lib/Target/X86/X86FastISel.cpp69
-rw-r--r--lib/Target/X86/X86FloatingPoint.cpp3
-rw-r--r--lib/Target/X86/X86FrameLowering.cpp29
-rw-r--r--lib/Target/X86/X86FrameLowering.h4
-rw-r--r--lib/Target/X86/X86ISelDAGToDAG.cpp13
-rw-r--r--lib/Target/X86/X86ISelLowering.cpp940
-rw-r--r--lib/Target/X86/X86ISelLowering.h47
-rw-r--r--lib/Target/X86/X86InstrAVX512.td541
-rw-r--r--lib/Target/X86/X86InstrControl.td6
-rw-r--r--lib/Target/X86/X86InstrFragmentsSIMD.td83
-rw-r--r--lib/Target/X86/X86InstrInfo.cpp106
-rw-r--r--lib/Target/X86/X86InstrInfo.td14
-rw-r--r--lib/Target/X86/X86InstrSSE.td65
-rw-r--r--lib/Target/X86/X86IntrinsicsInfo.h157
-rw-r--r--lib/Target/X86/X86MachineFunctionInfo.h83
-rw-r--r--lib/Target/X86/X86RegisterInfo.cpp21
-rw-r--r--lib/Target/X86/X86SelectionDAGInfo.cpp12
-rw-r--r--lib/Target/X86/X86SelectionDAGInfo.h3
-rw-r--r--lib/Target/X86/X86Subtarget.cpp21
-rw-r--r--lib/Target/X86/X86Subtarget.h22
-rw-r--r--lib/Target/X86/X86TargetTransformInfo.cpp37
-rw-r--r--lib/Target/X86/X86TargetTransformInfo.h15
-rw-r--r--lib/Target/X86/X86WinEHState.cpp31
-rw-r--r--lib/Target/XCore/MCTargetDesc/XCoreMCTargetDesc.cpp9
-rw-r--r--lib/Target/XCore/XCoreFrameLowering.cpp13
-rw-r--r--lib/Target/XCore/XCoreFrameLowering.h4
-rw-r--r--lib/Target/XCore/XCoreISelDAGToDAG.cpp7
-rw-r--r--lib/Target/XCore/XCoreISelLowering.cpp97
-rw-r--r--lib/Target/XCore/XCoreISelLowering.h11
-rw-r--r--lib/Target/XCore/XCoreRegisterInfo.cpp8
-rw-r--r--lib/Target/XCore/XCoreSelectionDAGInfo.cpp22
-rw-r--r--lib/Target/XCore/XCoreSelectionDAGInfo.h2
-rw-r--r--lib/Target/XCore/XCoreSubtarget.cpp2
-rw-r--r--lib/Target/XCore/XCoreTargetMachine.cpp5
-rw-r--r--lib/Target/XCore/XCoreTargetTransformInfo.h17
-rw-r--r--lib/Transforms/IPO/ArgumentPromotion.cpp2
-rw-r--r--lib/Transforms/IPO/CMakeLists.txt1
-rw-r--r--lib/Transforms/IPO/DeadArgumentElimination.cpp26
-rw-r--r--lib/Transforms/IPO/ElimAvailExtern.cpp84
-rw-r--r--lib/Transforms/IPO/ExtractGV.cpp10
-rw-r--r--lib/Transforms/IPO/IPO.cpp1
-rw-r--r--lib/Transforms/IPO/PassManagerBuilder.cpp16
-rw-r--r--lib/Transforms/InstCombine/InstCombineCompares.cpp4
-rw-r--r--lib/Transforms/InstCombine/InstCombineInternal.h8
-rw-r--r--lib/Transforms/InstCombine/InstCombineLoadStoreAlloca.cpp17
-rw-r--r--lib/Transforms/InstCombine/InstCombineVectorOps.cpp67
-rw-r--r--lib/Transforms/InstCombine/InstructionCombining.cpp29
-rw-r--r--lib/Transforms/Scalar/GVN.cpp13
-rw-r--r--lib/Transforms/Scalar/IndVarSimplify.cpp56
-rw-r--r--lib/Transforms/Scalar/LICM.cpp13
-rw-r--r--lib/Transforms/Scalar/LoopDistribute.cpp193
-rw-r--r--lib/Transforms/Scalar/LoopIdiomRecognize.cpp2
-rw-r--r--lib/Transforms/Scalar/LoopInterchange.cpp12
-rw-r--r--lib/Transforms/Scalar/LoopUnrollPass.cpp6
-rw-r--r--lib/Transforms/Scalar/MergedLoadStoreMotion.cpp8
-rw-r--r--lib/Transforms/Scalar/PlaceSafepoints.cpp4
-rw-r--r--lib/Transforms/Scalar/SCCP.cpp3
-rw-r--r--lib/Transforms/Scalar/SROA.cpp21
-rw-r--r--lib/Transforms/Utils/BasicBlockUtils.cpp2
-rw-r--r--lib/Transforms/Utils/CMakeLists.txt1
-rw-r--r--lib/Transforms/Utils/CloneFunction.cpp66
-rw-r--r--lib/Transforms/Utils/Local.cpp11
-rw-r--r--lib/Transforms/Utils/LoopSimplify.cpp1
-rw-r--r--lib/Transforms/Utils/LoopVersioning.cpp106
-rw-r--r--lib/Transforms/Vectorize/LoopVectorize.cpp474
-rw-r--r--lib/Transforms/Vectorize/SLPVectorizer.cpp102
-rw-r--r--test/Analysis/BasicAA/modref.ll45
-rw-r--r--test/Analysis/CostModel/X86/testshiftashr.ll56
-rw-r--r--test/Analysis/CostModel/X86/testshiftlshr.ll24
-rw-r--r--test/Analysis/LoopAccessAnalysis/number-of-memchecks.ll236
-rw-r--r--test/Analysis/LoopAccessAnalysis/pointer-with-unknown-bounds.ll42
-rw-r--r--test/Analysis/LoopAccessAnalysis/resort-to-memchecks-only.ll2
-rw-r--r--test/Analysis/LoopAccessAnalysis/unsafe-and-rt-checks.ll6
-rw-r--r--test/Assembler/getelementptr_vec_idx1.ll4
-rw-r--r--test/Assembler/getelementptr_vec_idx2.ll22
-rw-r--r--test/Assembler/getelementptr_vec_idx3.ll4
-rw-r--r--test/Bitcode/attributes.ll11
-rw-r--r--test/Bitcode/fcmp-fast.ll23
-rw-r--r--test/CodeGen/AArch64/aarch64-dynamic-stack-layout.ll60
-rw-r--r--test/CodeGen/AArch64/arm64-nvcast.ll29
-rw-r--r--test/CodeGen/AArch64/arm64-shrink-wrapping.ll39
-rw-r--r--test/CodeGen/AArch64/nest-register.ll23
-rw-r--r--test/CodeGen/AArch64/xbfiz.ll33
-rw-r--r--test/CodeGen/AMDGPU/array-ptr-calc-i64.ll5
-rw-r--r--test/CodeGen/AMDGPU/ds-negative-offset-addressing-mode-loop.ll5
-rw-r--r--test/CodeGen/AMDGPU/ds_read2.ll4
-rw-r--r--test/CodeGen/AMDGPU/ds_read2_offset_order.ll15
-rw-r--r--test/CodeGen/AMDGPU/ds_read2_superreg.ll246
-rw-r--r--test/CodeGen/AMDGPU/ds_read2st64.ll2
-rw-r--r--test/CodeGen/AMDGPU/ds_write2.ll17
-rw-r--r--test/CodeGen/AMDGPU/ds_write2st64.ll4
-rw-r--r--test/CodeGen/AMDGPU/fmuladd.ll30
-rw-r--r--test/CodeGen/AMDGPU/invariant-load-no-alias-store.ll35
-rw-r--r--test/CodeGen/AMDGPU/llvm.amdgpu.lrp.ll2
-rw-r--r--test/CodeGen/AMDGPU/llvm.round.ll4
-rw-r--r--test/CodeGen/AMDGPU/mad-combine.ll25
-rw-r--r--test/CodeGen/AMDGPU/mad-sub.ll6
-rw-r--r--test/CodeGen/AMDGPU/madak.ll12
-rw-r--r--test/CodeGen/AMDGPU/madmk.ll10
-rw-r--r--test/CodeGen/AMDGPU/mul_uint24.ll20
-rw-r--r--test/CodeGen/AMDGPU/select-vectors.ll116
-rw-r--r--test/CodeGen/AMDGPU/select64.ll4
-rw-r--r--test/CodeGen/AMDGPU/shl.ll38
-rw-r--r--test/CodeGen/AMDGPU/sint_to_fp.f64.ll8
-rw-r--r--test/CodeGen/AMDGPU/srl.ll35
-rw-r--r--test/CodeGen/AMDGPU/uint_to_fp.f64.ll10
-rw-r--r--test/CodeGen/AMDGPU/v_mac.ll155
-rw-r--r--test/CodeGen/AMDGPU/vselect.ll34
-rw-r--r--test/CodeGen/AMDGPU/xor.ll4
-rw-r--r--test/CodeGen/ARM/2013-05-02-AAPCS-ByVal-Structs-C4-C5-VFP.ll3
-rw-r--r--test/CodeGen/ARM/2013-05-13-AAPCS-byval-padding2.ll4
-rw-r--r--test/CodeGen/ARM/Windows/hard-float.ll10
-rw-r--r--test/CodeGen/ARM/Windows/long-calls.ll2
-rw-r--r--test/CodeGen/ARM/Windows/no-arm-mode.ll9
-rw-r--r--test/CodeGen/ARM/Windows/pic.ll17
-rw-r--r--test/CodeGen/ARM/Windows/structors.ll12
-rw-r--r--test/CodeGen/ARM/Windows/trivial-gnu-object.ll10
-rw-r--r--test/CodeGen/ARM/arm-returnaddr.ll6
-rw-r--r--test/CodeGen/ARM/byval-align.ll3
-rw-r--r--test/CodeGen/ARM/cttz.ll90
-rw-r--r--test/CodeGen/ARM/cttz_vector.ll383
-rw-r--r--test/CodeGen/ARM/ctz.ll11
-rw-r--r--test/CodeGen/ARM/fast-isel-call.ll6
-rw-r--r--test/CodeGen/ARM/fast-isel-intrinsic.ll6
-rw-r--r--test/CodeGen/ARM/fast-isel-static.ll4
-rw-r--r--test/CodeGen/ARM/ldrd.ll69
-rw-r--r--test/CodeGen/ARM/memset-inline.ll3
-rw-r--r--test/CodeGen/ARM/nest-register.ll21
-rw-r--r--test/CodeGen/ARM/subtarget-features-long-calls.ll49
-rw-r--r--test/CodeGen/ARM/wrong-t2stmia-size-opt.ll16
-rw-r--r--test/CodeGen/Generic/run-pass.ll7
-rw-r--r--test/CodeGen/Hexagon/Atomics.ll71
-rw-r--r--test/CodeGen/Hexagon/common-gep-basic.ll37
-rw-r--r--test/CodeGen/Hexagon/common-gep-icm.ll76
-rw-r--r--test/CodeGen/Hexagon/extract-basic.ll76
-rw-r--r--test/CodeGen/Hexagon/fusedandshift.ll3
-rw-r--r--test/CodeGen/Hexagon/insert-basic.ll66
-rw-r--r--test/CodeGen/Hexagon/predicate-logical.ll30
-rw-r--r--test/CodeGen/Hexagon/predicate-rcmp.ll19
-rw-r--r--test/CodeGen/MIR/X86/basic-block-liveins.mir25
-rw-r--r--test/CodeGen/MIR/X86/dead-register-flag.mir26
-rw-r--r--test/CodeGen/MIR/X86/expected-different-implicit-operand.mir38
-rw-r--r--test/CodeGen/MIR/X86/expected-different-implicit-register-flag.mir38
-rw-r--r--test/CodeGen/MIR/X86/expected-named-register-livein.mir21
-rw-r--r--test/CodeGen/MIR/X86/expected-number-after-bb.mir6
-rw-r--r--test/CodeGen/MIR/X86/expected-register-after-flags.mir22
-rw-r--r--test/CodeGen/MIR/X86/expected-subregister-after-colon.mir29
-rw-r--r--test/CodeGen/MIR/X86/fixed-stack-objects.mir35
-rw-r--r--test/CodeGen/MIR/X86/global-value-operands.mir4
-rw-r--r--test/CodeGen/MIR/X86/implicit-register-flag.mir41
-rw-r--r--test/CodeGen/MIR/X86/killed-register-flag.mir42
-rw-r--r--test/CodeGen/MIR/X86/large-index-number-error.mir6
-rw-r--r--test/CodeGen/MIR/X86/machine-basic-block-operands.mir12
-rw-r--r--test/CodeGen/MIR/X86/machine-instructions.mir4
-rw-r--r--test/CodeGen/MIR/X86/missing-implicit-operand.mir40
-rw-r--r--test/CodeGen/MIR/X86/named-registers.mir2
-rw-r--r--test/CodeGen/MIR/X86/register-mask-operands.mir10
-rw-r--r--test/CodeGen/MIR/X86/spill-slot-fixed-stack-object-aliased.mir32
-rw-r--r--test/CodeGen/MIR/X86/spill-slot-fixed-stack-object-immutable.mir32
-rw-r--r--test/CodeGen/MIR/X86/spill-slot-fixed-stack-objects.mir34
-rw-r--r--test/CodeGen/MIR/X86/stack-objects.mir39
-rw-r--r--test/CodeGen/MIR/X86/subregister-operands.mir33
-rw-r--r--test/CodeGen/MIR/X86/undef-register-flag.mir42
-rw-r--r--test/CodeGen/MIR/X86/undefined-register-class.mir26
-rw-r--r--test/CodeGen/MIR/X86/undefined-virtual-register.mir28
-rw-r--r--test/CodeGen/MIR/X86/unknown-machine-basic-block.mir6
-rw-r--r--test/CodeGen/MIR/X86/unknown-named-machine-basic-block.mir6
-rw-r--r--test/CodeGen/MIR/X86/unknown-subregister-index.mir31
-rw-r--r--test/CodeGen/MIR/X86/variable-sized-stack-object-size-error.mir36
-rw-r--r--test/CodeGen/MIR/X86/variable-sized-stack-objects.mir42
-rw-r--r--test/CodeGen/MIR/X86/virtual-registers.mir105
-rw-r--r--test/CodeGen/MIR/frame-info.mir91
-rw-r--r--test/CodeGen/MIR/llvmIR.mir2
-rw-r--r--test/CodeGen/MIR/llvmIRMissing.mir2
-rw-r--r--test/CodeGen/MIR/machine-basic-block-unknown-name.mir2
-rw-r--r--test/CodeGen/MIR/machine-function-missing-body-error.mir15
-rw-r--r--test/CodeGen/MIR/machine-function-missing-function.mir4
-rw-r--r--test/CodeGen/MIR/machine-function-missing-name.mir4
-rw-r--r--test/CodeGen/MIR/machine-function.mir8
-rw-r--r--test/CodeGen/MIR/register-info.mir4
-rw-r--r--test/CodeGen/NVPTX/loop-vectorize.ll39
-rw-r--r--test/CodeGen/NVPTX/lower-aggr-copies.ll47
-rw-r--r--test/CodeGen/PowerPC/builtins-ppc-elf2-abi.ll30
-rw-r--r--test/CodeGen/PowerPC/ppc-crbits-onoff.ll4
-rw-r--r--test/CodeGen/PowerPC/ppc32-nest.ll26
-rw-r--r--test/CodeGen/PowerPC/ppc64-anyregcc.ll16
-rw-r--r--test/CodeGen/PowerPC/ppc64-fastcc-fast-isel.ll6
-rw-r--r--test/CodeGen/PowerPC/ppc64-fastcc.ll10
-rw-r--r--test/CodeGen/PowerPC/ppc64-nest.ll42
-rw-r--r--test/CodeGen/PowerPC/ppc64-patchpoint.ll20
-rw-r--r--test/CodeGen/PowerPC/ppc64-stackmap.ll14
-rw-r--r--test/CodeGen/PowerPC/recipest.ll15
-rw-r--r--test/CodeGen/PowerPC/sjlj.ll8
-rw-r--r--test/CodeGen/PowerPC/swaps-le-3.ll4
-rw-r--r--test/CodeGen/PowerPC/swaps-le-5.ll70
-rw-r--r--test/CodeGen/PowerPC/tls-store2.ll4
-rw-r--r--test/CodeGen/PowerPC/vsx-elementary-arith.ll31
-rw-r--r--test/CodeGen/PowerPC/vsx-fma-m.ll30
-rw-r--r--test/CodeGen/PowerPC/vsx-fma-sp.ll13
-rw-r--r--test/CodeGen/SPARC/basictest.ll2
-rw-r--r--test/CodeGen/SPARC/multiple-div.ll21
-rw-r--r--test/CodeGen/Thumb2/aapcs.ll6
-rw-r--r--test/CodeGen/WebAssembly/lit.local.cfg2
-rw-r--r--test/CodeGen/WinEH/cppeh-alloca-sink.ll8
-rw-r--r--test/CodeGen/WinEH/cppeh-catch-all-win32.ll86
-rw-r--r--test/CodeGen/WinEH/cppeh-catch-and-throw.ll4
-rw-r--r--test/CodeGen/WinEH/cppeh-catch-scalar.ll4
-rw-r--r--test/CodeGen/WinEH/cppeh-catch-unwind.ll8
-rw-r--r--test/CodeGen/WinEH/cppeh-frame-vars.ll12
-rw-r--r--test/CodeGen/WinEH/cppeh-inalloca.ll12
-rw-r--r--test/CodeGen/WinEH/cppeh-min-unwind.ll4
-rw-r--r--test/CodeGen/WinEH/cppeh-mixed-catch-and-cleanup.ll4
-rw-r--r--test/CodeGen/WinEH/cppeh-multi-catch.ll8
-rw-r--r--test/CodeGen/WinEH/cppeh-nested-1.ll6
-rw-r--r--test/CodeGen/WinEH/cppeh-nested-2.ll10
-rw-r--r--test/CodeGen/WinEH/cppeh-nested-3.ll10
-rw-r--r--test/CodeGen/WinEH/cppeh-nested-rethrow.ll4
-rw-r--r--test/CodeGen/WinEH/cppeh-nonalloca-frame-values.ll16
-rw-r--r--test/CodeGen/WinEH/cppeh-prepared-catch-reordered.ll9
-rw-r--r--test/CodeGen/WinEH/cppeh-prepared-catch.ll130
-rw-r--r--test/CodeGen/WinEH/cppeh-prepared-cleanups.ll14
-rw-r--r--test/CodeGen/WinEH/cppeh-shared-empty-catch.ll2
-rw-r--r--test/CodeGen/WinEH/cppeh-similar-catch-blocks.ll2
-rw-r--r--test/CodeGen/WinEH/cppeh-state-calc-1.ll10
-rw-r--r--test/CodeGen/WinEH/seh-exception-code.ll66
-rw-r--r--test/CodeGen/WinEH/seh-exception-code2.ll91
-rw-r--r--test/CodeGen/WinEH/seh-inlined-finally.ll18
-rw-r--r--test/CodeGen/WinEH/seh-outlined-finally-win32.ll172
-rw-r--r--test/CodeGen/WinEH/seh-outlined-finally.ll10
-rw-r--r--test/CodeGen/WinEH/seh-prepared-basic.ll6
-rw-r--r--test/CodeGen/WinEH/seh-simple.ll34
-rw-r--r--test/CodeGen/X86/avx-vperm2x128.ll20
-rw-r--r--test/CodeGen/X86/avx512-intrinsics.ll50
-rw-r--r--test/CodeGen/X86/avx512bw-intrinsics.ll41
-rw-r--r--test/CodeGen/X86/avx512bwvl-intrinsics.ll80
-rw-r--r--test/CodeGen/X86/cppeh-nounwind.ll35
-rw-r--r--test/CodeGen/X86/eh-nolandingpads.ll12
-rw-r--r--test/CodeGen/X86/fdiv-combine.ll21
-rw-r--r--test/CodeGen/X86/frameescape.ll12
-rw-r--r--test/CodeGen/X86/frameregister.ll30
-rw-r--r--test/CodeGen/X86/implicit-null-check-negative.ll42
-rw-r--r--test/CodeGen/X86/implicit-null-check.ll44
-rw-r--r--test/CodeGen/X86/inline-asm-bad-constraint-n.ll10
-rw-r--r--test/CodeGen/X86/legalize-shl-vec.ll44
-rw-r--r--test/CodeGen/X86/machine-combiner.ll197
-rw-r--r--test/CodeGen/X86/pr13577.ll17
-rw-r--r--test/CodeGen/X86/read-fp-no-frame-pointer.ll12
-rw-r--r--test/CodeGen/X86/seh-catch-all-win32.ll26
-rw-r--r--test/CodeGen/X86/seh-except-finally.ll6
-rw-r--r--test/CodeGen/X86/seh-stack-realign-win32.ll99
-rw-r--r--test/CodeGen/X86/seh-stack-realign.ll101
-rw-r--r--test/CodeGen/X86/sqrt-fastmath.ll8
-rw-r--r--test/CodeGen/X86/sse2-vector-shifts.ll2
-rw-r--r--test/CodeGen/X86/sse3.ll2
-rw-r--r--test/CodeGen/X86/stack-folding-fp-avx1.ll18
-rw-r--r--test/CodeGen/X86/stack-folding-fp-sse42.ll38
-rw-r--r--test/CodeGen/X86/vec_fp_to_int.ll9
-rw-r--r--test/CodeGen/X86/vec_int_to_fp.ll145
-rw-r--r--test/CodeGen/X86/vector-gep.ll22
-rw-r--r--test/CodeGen/X86/vector-sext.ll28
-rw-r--r--test/CodeGen/X86/vector-shift-ashr-128.ll1284
-rw-r--r--test/CodeGen/X86/vector-shift-ashr-256.ll142
-rw-r--r--test/CodeGen/X86/vector-shift-lshr-128.ll791
-rw-r--r--test/CodeGen/X86/vector-shift-lshr-256.ll137
-rw-r--r--test/CodeGen/X86/vector-shift-shl-128.ll550
-rw-r--r--test/CodeGen/X86/vector-shift-shl-256.ll4
-rw-r--r--test/CodeGen/X86/vector-shuffle-sse4a.ll221
-rw-r--r--test/CodeGen/X86/vector-trunc.ll18
-rw-r--r--test/CodeGen/X86/vector-zext.ll80
-rw-r--r--test/CodeGen/X86/vector-zmov.ll34
-rw-r--r--test/CodeGen/X86/visibility.ll6
-rw-r--r--test/CodeGen/X86/vshift-3.ll5
-rw-r--r--test/CodeGen/X86/webkit-jscc.ll18
-rw-r--r--test/CodeGen/X86/widen_conv-2.ll7
-rw-r--r--test/CodeGen/X86/widen_load-2.ll14
-rw-r--r--test/CodeGen/X86/win32-eh.ll44
-rw-r--r--test/CodeGen/X86/win64_frame.ll5
-rw-r--r--test/CodeGen/X86/x86-shrink-wrapping.ll39
-rw-r--r--test/DebugInfo/COFF/asm.ll34
-rw-r--r--test/DebugInfo/COFF/multifile.ll42
-rw-r--r--test/DebugInfo/COFF/multifunction.ll138
-rw-r--r--test/DebugInfo/COFF/simple.ll26
-rw-r--r--test/ExecutionEngine/RuntimeDyld/Mips/ELF_O32R6_relocations.s49
-rw-r--r--test/ExecutionEngine/RuntimeDyld/Mips/ELF_O32_PIC_relocations.s5
-rw-r--r--test/ExecutionEngine/RuntimeDyld/X86/MachO_i386_DynNoPIC_relocations.s5
-rw-r--r--test/ExecutionEngine/RuntimeDyld/X86/MachO_x86-64_PIC_relocations.s13
-rw-r--r--test/LibDriver/infer-output-path.test15
-rw-r--r--test/MC/AArch64/basic-a64-instructions.s3
-rw-r--r--test/MC/ARM/basic-thumb2-instructions.s80
-rw-r--r--test/MC/ARM/thumb2-narrow-dp.ll93
-rw-r--r--test/MC/ARM/thumb_rewrites.s47
-rw-r--r--test/MC/COFF/safeseh.s6
-rw-r--r--test/MC/ELF/relax-arith.s8
-rw-r--r--test/MC/ELF/relax-arith2.s12
-rw-r--r--test/MC/ELF/relax-arith4.s25
-rw-r--r--test/MC/Mips/macro-la-bad.s17
-rw-r--r--test/MC/Mips/macro-la.s263
-rw-r--r--test/MC/Mips/macro-li-bad.s11
-rw-r--r--test/MC/Mips/macro-li.s67
-rw-r--r--test/MC/Mips/micromips-expansions.s6
-rw-r--r--test/MC/Mips/mips-expansions-bad.s12
-rw-r--r--test/MC/Mips/mips-expansions.s89
-rw-r--r--test/MC/Mips/mips64-expansions.s2
-rw-r--r--test/MC/Sparc/sparc-alu-instructions.s4
-rw-r--r--test/MC/Sparc/sparc-mem-instructions.s10
-rw-r--r--test/MC/Sparc/sparc-synthetic-instructions.s130
-rw-r--r--test/MC/X86/AlignedBundling/nesting.s26
-rw-r--r--test/MC/X86/avx512-encodings.s132
-rw-r--r--test/MC/X86/x86-64-avx512bw.s109
-rw-r--r--test/MC/X86/x86-64-avx512bw_vl.s216
-rw-r--r--test/MC/X86/x86-64-avx512dq.s517
-rw-r--r--test/MC/X86/x86-64-avx512dq_vl.s897
-rw-r--r--test/MC/X86/x86-64-avx512f_vl.s1119
-rw-r--r--test/MC/X86/x86_errors.s6
-rw-r--r--test/Object/Inputs/elf-mip64-reloc.obin0 -> 3208 bytes
-rw-r--r--test/Object/Inputs/invalid-bad-section-address.coffbin0 -> 304 bytes
-rw-r--r--test/Object/Inputs/no-section-table.sobin0 -> 2544 bytes
-rw-r--r--test/Object/Inputs/symtab-only.abin0 -> 72 bytes
-rw-r--r--test/Object/Inputs/thin-path.abin0 -> 284 bytes
-rw-r--r--test/Object/Inputs/trivial-object-test2.macho-x86-64bin0 -> 360 bytes
-rw-r--r--test/Object/Mips/elf-mips64-rel.yaml1
-rw-r--r--test/Object/Mips/reloc-visit.test6
-rw-r--r--test/Object/X86/nm-coff.s9
-rw-r--r--test/Object/X86/nm-macho.s9
-rw-r--r--test/Object/X86/nm-print-size.s6
-rw-r--r--test/Object/archive-extract.test (renamed from test/Object/extract.ll)17
-rw-r--r--test/Object/archive-format.test20
-rw-r--r--test/Object/archive-symtab.test35
-rw-r--r--test/Object/archive-toc.test10
-rw-r--r--test/Object/archive-update.test13
-rw-r--r--test/Object/coff-archive.test37
-rw-r--r--test/Object/coff-invalid.test13
-rw-r--r--test/Object/no-section-table.test36
-rw-r--r--test/Object/obj2yaml.test3
-rw-r--r--test/Object/yaml2obj-elf-alignment.yaml53
-rw-r--r--test/Object/yaml2obj-elf-rel-noref.yaml2
-rw-r--r--test/Object/yaml2obj-elf-rel.yaml4
-rw-r--r--test/Object/yaml2obj-elf-section-basic.yaml5
-rw-r--r--test/Object/yaml2obj-elf-symbol-basic.yaml1
-rw-r--r--test/Other/extract.ll7
-rw-r--r--test/Transforms/EliminateAvailableExternally/visibility.ll11
-rw-r--r--test/Transforms/GVN/pre-new-inst.ll29
-rw-r--r--test/Transforms/IndVarSimplify/lrev-existing-umin.ll36
-rw-r--r--test/Transforms/Inline/frameescape.ll12
-rw-r--r--test/Transforms/InstCombine/align-external.ll26
-rw-r--r--test/Transforms/InstCombine/intrinsics.ll22
-rw-r--r--test/Transforms/InstCombine/load-combine-metadata.ll29
-rw-r--r--test/Transforms/InstCombine/load_combine_aa.ll15
-rw-r--r--test/Transforms/InstSimplify/2011-09-05-InsertExtractValue.ll9
-rw-r--r--test/Transforms/InstSimplify/floating-point-compare.ll15
-rw-r--r--test/Transforms/InstSimplify/undef.ll14
-rw-r--r--test/Transforms/LICM/PR24013.ll19
-rw-r--r--test/Transforms/LoopDistribute/basic-with-memchecks.ll8
-rw-r--r--test/Transforms/LoopIdiom/ctpop-multiple-users-crash.ll34
-rw-r--r--test/Transforms/LoopRotate/oz-disable.ll30
-rw-r--r--test/Transforms/LoopStrengthReduce/ephemeral.ll41
-rw-r--r--test/Transforms/LoopUnroll/unroll-pragmas.ll27
-rw-r--r--test/Transforms/LoopVectorize/X86/vectorization-remarks.ll4
-rw-r--r--test/Transforms/PlaceSafepoints/statepoint-frameescape.ll8
-rw-r--r--test/Transforms/SLPVectorizer/AMDGPU/simplebb.ll5
-rw-r--r--test/Transforms/SLPVectorizer/X86/cse.ll31
-rw-r--r--test/Transforms/SLPVectorizer/X86/gep.ll1
-rw-r--r--test/Transforms/SLPVectorizer/X86/loopinvariant.ll18
-rw-r--r--test/Transforms/SLPVectorizer/X86/pr19657.ll102
-rw-r--r--test/Transforms/SROA/basictest.ll11
-rw-r--r--test/Verifier/comdat-decl1.ll5
-rw-r--r--test/Verifier/comdat-decl2.ll5
-rw-r--r--test/Verifier/frameescape.ll40
-rw-r--r--test/tools/llvm-objdump/macho-sections.test2
-rwxr-xr-xtest/tools/llvm-readobj/Inputs/got-plt.exe.elf-mipselbin0 -> 6209 bytes
-rw-r--r--test/tools/llvm-readobj/codeview-linetables.test10
-rw-r--r--test/tools/llvm-readobj/mips-plt.test34
-rw-r--r--tools/dsymutil/DebugMap.cpp4
-rw-r--r--tools/dsymutil/MachODebugMapParser.cpp16
-rw-r--r--tools/llc/llc.cpp39
-rw-r--r--tools/llvm-ar/llvm-ar.cpp101
-rw-r--r--tools/llvm-cxxdump/llvm-cxxdump.cpp5
-rw-r--r--tools/llvm-jitlistener/CMakeLists.txt6
-rw-r--r--tools/llvm-nm/llvm-nm.cpp104
-rw-r--r--tools/llvm-objdump/COFFDump.cpp4
-rw-r--r--tools/llvm-objdump/MachODump.cpp50
-rw-r--r--tools/llvm-objdump/llvm-objdump.cpp132
-rw-r--r--tools/llvm-objdump/llvm-objdump.h3
-rw-r--r--tools/llvm-readobj/ARMWinEHPrinter.cpp20
-rw-r--r--tools/llvm-readobj/COFFDumper.cpp29
-rw-r--r--tools/llvm-readobj/ELFDumper.cpp250
-rw-r--r--tools/llvm-readobj/ObjDumper.h1
-rw-r--r--tools/llvm-readobj/StreamWriter.h4
-rw-r--r--tools/llvm-readobj/Win64EHDumper.cpp4
-rw-r--r--tools/llvm-readobj/llvm-readobj.cpp10
-rw-r--r--tools/llvm-rtdyld/llvm-rtdyld.cpp77
-rw-r--r--tools/llvm-shlib/CMakeLists.txt2
-rw-r--r--tools/llvm-stress/llvm-stress.cpp95
-rw-r--r--tools/llvm-symbolizer/LLVMSymbolize.cpp6
-rw-r--r--tools/obj2yaml/elf2yaml.cpp20
-rw-r--r--tools/opt/opt.cpp1
-rw-r--r--tools/yaml2obj/yaml2elf.cpp28
-rw-r--r--unittests/ADT/TripleTest.cpp78
-rw-r--r--unittests/ExecutionEngine/MCJIT/MCJITTest.cpp82
-rw-r--r--unittests/IR/IRBuilderTest.cpp22
-rw-r--r--utils/TableGen/CodeGenTarget.cpp2
-rw-r--r--utils/TableGen/FixedLenDecoderEmitter.cpp2
-rw-r--r--utils/TableGen/RegisterInfoEmitter.cpp12
-rw-r--r--utils/TableGen/SubtargetEmitter.cpp15
-rw-r--r--utils/TableGen/X86DisassemblerTables.cpp26
-rwxr-xr-xutils/release/test-release.sh130
-rw-r--r--utils/unittest/UnitTestMain/TestMain.cpp5
860 files changed, 32133 insertions, 11025 deletions
diff --git a/.gitignore b/.gitignore
index f3424d231e2d..e3d191dcbcce 100644
--- a/.gitignore
+++ b/.gitignore
@@ -43,7 +43,9 @@ autoconf/autom4te.cache
# Directories to ignore (do not add trailing '/'s, they skip symlinks).
#==============================================================================#
# External projects that are tracked independently.
-projects/*/
+projects/*
+!projects/*.*
+!projects/Makefile
# Clang, which is tracked independently.
tools/clang
# LLDB, which is tracked independently.
diff --git a/autoconf/config.guess b/autoconf/config.guess
index 73d671bc2c6b..b209a1a06c6f 100755
--- a/autoconf/config.guess
+++ b/autoconf/config.guess
@@ -810,6 +810,9 @@ EOF
*:MINGW*:*)
echo ${UNAME_MACHINE}-pc-mingw32
exit ;;
+ *:MSYS*:*)
+ echo ${UNAME_MACHINE}-pc-msys
+ exit ;;
i*:windows32*:*)
# uname -m includes "-pc" on this system.
echo ${UNAME_MACHINE}-mingw32
diff --git a/bindings/python/llvm/object.py b/bindings/python/llvm/object.py
index 4e912ed5da9d..b427113e9cea 100644
--- a/bindings/python/llvm/object.py
+++ b/bindings/python/llvm/object.py
@@ -372,14 +372,6 @@ class Relocation(LLVMObject):
self.expired = False
@CachedProperty
- def address(self):
- """The address of this relocation, in long bytes."""
- if self.expired:
- raise Exception('Relocation instance has expired.')
-
- return lib.LLVMGetRelocationAddress(self)
-
- @CachedProperty
def offset(self):
"""The offset of this relocation, in long bytes."""
if self.expired:
@@ -498,9 +490,6 @@ def register_library(library):
library.LLVMGetSymbolSize.argtypes = [Symbol]
library.LLVMGetSymbolSize.restype = c_uint64
- library.LLVMGetRelocationAddress.argtypes = [c_object_p]
- library.LLVMGetRelocationAddress.restype = c_uint64
-
library.LLVMGetRelocationOffset.argtypes = [c_object_p]
library.LLVMGetRelocationOffset.restype = c_uint64
diff --git a/cmake/modules/AddLLVM.cmake b/cmake/modules/AddLLVM.cmake
index 4f60d9e6e7d6..45f6746948d2 100755
--- a/cmake/modules/AddLLVM.cmake
+++ b/cmake/modules/AddLLVM.cmake
@@ -93,20 +93,9 @@ function(add_llvm_symbol_exports target_name export_file)
else()
set(native_export_file "${target_name}.def")
- set(CAT "cat")
- set(export_file_nativeslashes ${export_file})
- if(WIN32 AND NOT CYGWIN AND NOT MSYS)
- set(CAT "type")
- # Convert ${export_file} to native format (backslashes) for "type"
- # Does not use file(TO_NATIVE_PATH) as it doesn't create a native
- # path but a build-system specific format (see CMake bug
- # http://public.kitware.com/Bug/print_bug_page.php?bug_id=5939 )
- string(REPLACE / \\ export_file_nativeslashes ${export_file})
- endif()
-
add_custom_command(OUTPUT ${native_export_file}
- COMMAND ${CMAKE_COMMAND} -E echo "EXPORTS" > ${native_export_file}
- COMMAND ${CAT} ${export_file_nativeslashes} >> ${native_export_file}
+ COMMAND ${PYTHON_EXECUTABLE} -c "import sys;print(''.join(['EXPORTS\\n']+sys.stdin.readlines(),))"
+ < ${export_file} > ${native_export_file}
DEPENDS ${export_file}
VERBATIM
COMMENT "Creating export file for ${target_name}")
@@ -700,10 +689,18 @@ macro(add_llvm_external_project name)
list(APPEND LLVM_IMPLICIT_PROJECT_IGNORE "${CMAKE_CURRENT_SOURCE_DIR}/${add_llvm_external_dir}")
string(REPLACE "-" "_" nameUNDERSCORE ${name})
string(TOUPPER ${nameUNDERSCORE} nameUPPER)
- set(LLVM_EXTERNAL_${nameUPPER}_SOURCE_DIR "${CMAKE_CURRENT_SOURCE_DIR}/${add_llvm_external_dir}"
- CACHE PATH "Path to ${name} source directory")
- if (NOT ${LLVM_EXTERNAL_${nameUPPER}_SOURCE_DIR} STREQUAL ""
- AND EXISTS ${LLVM_EXTERNAL_${nameUPPER}_SOURCE_DIR}/CMakeLists.txt)
+ #TODO: Remove this check in a few days once it has circulated through
+ # buildbots and people's checkouts (cbieneman - July 14, 2015)
+ if("${LLVM_EXTERNAL_${nameUPPER}_SOURCE_DIR}" STREQUAL "${CMAKE_CURRENT_SOURCE_DIR}/${add_llvm_external_dir}")
+ unset(LLVM_EXTERNAL_${nameUPPER}_SOURCE_DIR CACHE)
+ endif()
+ if(NOT LLVM_EXTERNAL_${nameUPPER}_SOURCE_DIR)
+ set(LLVM_EXTERNAL_${nameUPPER}_SOURCE_DIR "${CMAKE_CURRENT_SOURCE_DIR}/${add_llvm_external_dir}")
+ else()
+ set(LLVM_EXTERNAL_${nameUPPER}_SOURCE_DIR
+ CACHE PATH "Path to ${name} source directory")
+ endif()
+ if (EXISTS ${LLVM_EXTERNAL_${nameUPPER}_SOURCE_DIR}/CMakeLists.txt)
option(LLVM_EXTERNAL_${nameUPPER}_BUILD
"Whether to build ${name} as part of LLVM" ON)
if (LLVM_EXTERNAL_${nameUPPER}_BUILD)
diff --git a/docs/ExceptionHandling.rst b/docs/ExceptionHandling.rst
index 72ed78a3c99a..55ffdb45efe9 100644
--- a/docs/ExceptionHandling.rst
+++ b/docs/ExceptionHandling.rst
@@ -339,11 +339,11 @@ original context before code generation.
Catch handlers are called with a pointer to the handler itself as the first
argument and a pointer to the parent function's stack frame as the second
-argument. The catch handler uses the `llvm.recoverframe
-<LangRef.html#llvm-frameallocate-and-llvm-framerecover-intrinsics>`_ to get a
+argument. The catch handler uses the `llvm.localrecover
+<LangRef.html#llvm-localescape-and-llvm-localrecover-intrinsics>`_ intrinsic to get a
pointer to a frame allocation block that is created in the parent frame using
-the `llvm.allocateframe
-<LangRef.html#llvm-frameallocate-and-llvm-framerecover-intrinsics>`_ intrinsic.
+the `llvm.localescape
+<LangRef.html#llvm-localescape-and-llvm-localrecover-intrinsics>`_ intrinsic.
The ``WinEHPrepare`` pass will have created a structure definition for the
contents of this block. The first two members of the structure will always be
(1) a 32-bit integer that the runtime uses to track the exception state of the
@@ -520,12 +520,12 @@ action.
A code of ``i32 1`` indicates a catch action, which expects three additional
arguments. Different EH schemes give different meanings to the three arguments,
but the first argument indicates whether the catch should fire, the second is
-the frameescape index of the exception object, and the third is the code to run
+the localescape index of the exception object, and the third is the code to run
to catch the exception.
For Windows C++ exception handling, the first argument for a catch handler is a
pointer to the RTTI type descriptor for the object to catch. The second
-argument is an index into the argument list of the ``llvm.frameescape`` call in
+argument is an index into the argument list of the ``llvm.localescape`` call in
the main function. The exception object will be copied into the provided stack
object. If the exception object is not required, this argument should be -1.
The third argument is a pointer to a function implementing the catch. This
diff --git a/docs/LangRef.rst b/docs/LangRef.rst
index 167280f75f7f..e7d6f67c9399 100644
--- a/docs/LangRef.rst
+++ b/docs/LangRef.rst
@@ -1326,6 +1326,14 @@ example:
On an argument, this attribute indicates that the function does not write
through this pointer argument, even though it may write to the memory that
the pointer points to.
+``argmemonly``
+ This attribute indicates that the only memory accesses inside the function
+ are loads and stores from objects pointed to by its pointer-typed arguments,
+ with arbitrary offsets. In other words, all memory operations in the function
+ can refer to memory only using pointers based on its function arguments.
+ Note that ``argmemonly`` can be used together with the ``readonly`` attribute
+ in order to specify that the function reads only from its arguments.
``returns_twice``
This attribute indicates that this function can return twice. The C
``setjmp`` is an example of such a function. The compiler disables
@@ -1446,8 +1454,8 @@ The strings can contain any character by escaping non-printable
characters. The escape sequence used is simply "\\xx" where "xx" is the
two digit hex code for the number.
-The inline asm code is simply printed to the machine code .s file when
-assembly code is generated.
+Note that the assembly string *must* be parseable by LLVM's integrated assembler
+(unless it is disabled), even when emitting a ``.s`` file.
.. _langref_datalayout:
@@ -1837,8 +1845,8 @@ Fast-Math Flags
LLVM IR floating-point binary ops (:ref:`fadd <i_fadd>`,
:ref:`fsub <i_fsub>`, :ref:`fmul <i_fmul>`, :ref:`fdiv <i_fdiv>`,
-:ref:`frem <i_frem>`) have the following flags that can be set to enable
-otherwise unsafe floating point operations
+:ref:`frem <i_frem>`, :ref:`fcmp <i_fcmp>`) have the following flags that can
+be set to enable otherwise unsafe floating point operations
``nnan``
No NaNs - Allow optimizations to assume the arguments and result are not
@@ -2800,13 +2808,36 @@ Inline Assembler Expressions
----------------------------
LLVM supports inline assembler expressions (as opposed to :ref:`Module-Level
-Inline Assembly <moduleasm>`) through the use of a special value. This
-value represents the inline assembler as a string (containing the
-instructions to emit), a list of operand constraints (stored as a
-string), a flag that indicates whether or not the inline asm expression
-has side effects, and a flag indicating whether the function containing
-the asm needs to align its stack conservatively. An example inline
-assembler expression is:
+Inline Assembly <moduleasm>`) through the use of a special value. This value
+represents the inline assembler as a template string (containing the
+instructions to emit), a list of operand constraints (stored as a string), a
+flag that indicates whether or not the inline asm expression has side effects,
+and a flag indicating whether the function containing the asm needs to align its
+stack conservatively.
+
+The template string supports argument substitution of the operands using "``$``"
+followed by a number, to indicate substitution of the given register/memory
+location, as specified by the constraint string. "``${NUM:MODIFIER}``" may also
+be used, where ``MODIFIER`` is a target-specific annotation for how to print the
+operand (See :ref:`inline-asm-modifiers`).
+
+A literal "``$``" may be included by using "``$$``" in the template. To include
+other special characters into the output, the usual "``\XX``" escapes may be
+used, just as in other strings. Note that after template substitution, the
+resulting assembly string is parsed by LLVM's integrated assembler unless it is
+disabled -- even when emitting a ``.s`` file -- and thus must contain assembly
+syntax known to LLVM.
+
+LLVM's support for inline asm is modeled closely on the requirements of Clang's
+GCC-compatible inline-asm support. Thus, the feature-set and the constraint and
+modifier codes listed here are similar or identical to those in GCC's inline asm
+support. However, to be clear, the syntax of the template and constraint strings
+described here is *not* the same as the syntax accepted by GCC and Clang, and,
+while most constraint letters are passed through as-is by Clang, some get
+translated to other codes when converting from the C source to the LLVM
+assembly.
+
+An example inline assembler expression is:
.. code-block:: llvm
@@ -2852,6 +2883,596 @@ If multiple keywords appear the '``sideeffect``' keyword must come
first, the '``alignstack``' keyword second and the '``inteldialect``'
keyword last.
+Inline Asm Constraint String
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+The constraint list is a comma-separated string, each element containing one or
+more constraint codes.
+
+For each element in the constraint list an appropriate register or memory
+operand will be chosen, and it will be made available to assembly template
+string expansion as ``$0`` for the first constraint in the list, ``$1`` for the
+second, etc.
+
+There are three different types of constraints, which are distinguished by a
+prefix symbol in front of the constraint code: Output, Input, and Clobber. The
+constraints must always be given in that order: outputs first, then inputs, then
+clobbers. They cannot be intermingled.
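+
+For example, a call with one register output, two register inputs, and a
+memory clobber (the ``add`` mnemonic here is purely illustrative) might look
+like:
+
+.. code-block:: llvm
+
+    %sum = call i32 asm "add $0, $1, $2", "=r,r,r,~{memory}"(i32 %a, i32 %b)
+
+Here ``$0`` expands to the register chosen for the ``=r`` output, while ``$1``
+and ``$2`` expand to the registers chosen for the two ``r`` inputs.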
+
+There are also three different categories of constraint codes:
+
+- Register constraint. This is either a register class, or a fixed physical
+ register. This kind of constraint will allocate a register, and if necessary,
+ bitcast the argument or result to the appropriate type.
+- Memory constraint. This kind of constraint is for use with an instruction
+ taking a memory operand. Different constraints allow for different addressing
+ modes used by the target.
+- Immediate value constraint. This kind of constraint is for an integer or other
+ immediate value which can be rendered directly into an instruction. The
+ various target-specific constraints allow the selection of a value in the
+ proper range for the instruction you wish to use it with.
+
+Output constraints
+""""""""""""""""""
+
+Output constraints are specified by an "``=``" prefix (e.g. "``=r``"). This
+indicates that the assembly will write to this operand, and the operand will
+then be made available as a return value of the ``asm`` expression. Output
+constraints do not consume an argument from the call instruction. (Except, see
+below about indirect outputs).
+
+Normally, it is expected that no output locations are written to by the assembly
+expression until *all* of the inputs have been read. As such, LLVM may assign
+the same register to an output and an input. If this is not safe (e.g. if the
+assembly contains two instructions, where the first writes to one output, and
+the second reads an input and writes to a second output), then the "``&``"
+modifier must be used (e.g. "``=&r``") to specify that the output is an
+"early-clobber" output. Marking an ouput as "early-clobber" ensures that LLVM
+will not use the same register for any inputs (other than an input tied to this
+output).
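+
+As a hypothetical sketch (x86 mnemonics assumed purely for illustration), a
+two-instruction template whose first instruction writes the output before the
+second reads the input would need the early-clobber marker:
+
+.. code-block:: llvm
+
+    ; The first instruction writes $0 before the second reads $1, so $0
+    ; must not share a register with the input $1.
+    %r = call i32 asm "mov $$1, $0\0A\09add $1, $0", "=&r,r"(i32 %in)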
+
+Input constraints
+"""""""""""""""""
+
+Input constraints do not have a prefix -- just the constraint codes. Each input
+constraint will consume one argument from the call instruction. It is not
+permitted for the asm to write to any input register or memory location (unless
+that input is tied to an output). Note also that multiple inputs may all be
+assigned to the same register, if LLVM can determine that they necessarily all
+contain the same value.
+
+Instead of providing a Constraint Code, input constraints may also "tie"
+themselves to an output constraint, by providing an integer as the constraint
+string. Tied inputs still consume an argument from the call instruction, and
+take up a position in the asm template numbering as is usual -- they will simply
+be constrained to always use the same register as the output they've been tied
+to. For example, a constraint string of "``=r,0``" says to assign a register for
+output, and use that register as an input as well (it being the 0'th
+constraint).
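+
+For illustration, a read-modify-write of a tied operand might look like this
+(the mnemonic is assumed to be x86):
+
+.. code-block:: llvm
+
+    ; "0" ties the input %x to the "=r" output: both use the same register,
+    ; so "$0" may be both read and written by the template.
+    %inc = call i32 asm "inc $0", "=r,0"(i32 %x)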
+
+It is permitted to tie an input to an "early-clobber" output. In that case, no
+*other* input may share the same register as the input tied to the early-clobber
+(even when the other input has the same value).
+
+You may only tie an input to an output which has a register constraint, not a
+memory constraint. Only a single input may be tied to an output.
+
+There is also an "interesting" feature which deserves a bit of explanation: if a
+register class constraint allocates a register which is too small for the value
+type operand provided as input, the input value will be split into multiple
+registers, and all of them passed to the inline asm.
+
+However, this feature is often not as useful as you might think.
+
+Firstly, the registers are *not* guaranteed to be consecutive. So, on those
+architectures that have instructions which operate on multiple consecutive
+registers, this is not an appropriate way to support them. (e.g. the 32-bit
+SparcV8 has a 64-bit load instruction which takes a single 32-bit register
+operand. The hardware then loads into both the named register and the next
+register. This feature of inline asm would not be useful to support that.)
+
+A few of the targets provide a template string modifier allowing explicit access
+to the second register of a two-register operand (e.g. MIPS ``L``, ``M``, and
+``D``). On such an architecture, you can actually access the second allocated
+register (yet, still, not any subsequent ones). But, in that case, you're still
+probably better off simply splitting the value into two separate operands, for
+clarity. (e.g. see the description of the ``A`` constraint on X86, which,
+despite existing only for use with this feature, is not really a good idea to
+use)
+
+Indirect inputs and outputs
+"""""""""""""""""""""""""""
+
+Indirect output or input constraints can be specified by the "``*``" modifier
+(which goes after the "``=``" in case of an output). This indicates that the asm
+will write to or read from the contents of an *address* provided as an input
+argument. (Note that in this way, indirect outputs act more like an *input* than
+an output: just like an input, they consume an argument of the call expression,
+rather than producing a return value. An indirect output constraint is an
+"output" only in that the asm is expected to write to the contents of the input
+memory location, instead of just read from it).
+
+This is most typically used for a memory constraint, e.g. "``=*m``", to pass the
+address of a variable as a value.
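+
+For instance, a sketch of an indirect memory output that stores through a
+pointer argument (the store mnemonic is x86-specific and illustrative only):
+
+.. code-block:: llvm
+
+    ; %p is consumed as an *input* argument; the asm writes through it, so
+    ; this operand produces no return value from the call.
+    call void asm "movl $$42, $0", "=*m"(i32* %p)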
+
+It is also possible to use an indirect *register* constraint, but only on output
+(e.g. "``=*r``"). This will cause LLVM to allocate a register for an output
+value normally, and then, separately emit a store to the address provided as
+input, after the provided inline asm. (It's not clear what value this
+functionality provides, compared to writing the store explicitly after the asm
+statement, and it can only produce worse code, since it bypasses many
+optimization passes. I would recommend not using it.)
+
+
+Clobber constraints
+"""""""""""""""""""
+
+A clobber constraint is indicated by a "``~``" prefix. A clobber does not
+consume an input operand, nor generate an output. Clobbers cannot use any of the
+general constraint code letters -- they may use only explicit register
+constraints, e.g. "``~{eax}``". The one exception is that a clobber string of
+"``~{memory}``" indicates that the assembly writes to arbitrary undeclared
+memory locations -- not only the memory pointed to by a declared indirect
+output.
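+
+For example (the register name is x86-specific and purely illustrative):
+
+.. code-block:: llvm
+
+    ; The template overwrites %eax without producing an LLVM-visible result,
+    ; so %eax is declared as a clobber. Adding "~{memory}" would additionally
+    ; declare that arbitrary memory may be written.
+    call void asm sideeffect "xor %eax, %eax", "~{eax}"()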
+
+
+Constraint Codes
+""""""""""""""""
+
+After a potential prefix comes the constraint code, or codes.
+
+A Constraint Code is either a single letter (e.g. "``r``"), a "``^``" character
+followed by two letters (e.g. "``^wc``"), or "``{``" register-name "``}``"
+(e.g. "``{eax}``").
+
+The one and two letter constraint codes are typically chosen to be the same as
+GCC's constraint codes.
+
+A single constraint may include one or more constraint codes in it, leaving it
+up to LLVM to choose which one to use. This is included mainly for
+compatibility with the translation of GCC inline asm coming from Clang.
+
+There are two ways to specify alternatives, and either or both may be used in an
+inline asm constraint list:
+
+1) Append the codes to each other, making a constraint code set. E.g. "``im``"
+ or "``{eax}m``". This means "choose any of the options in the set". The
+ choice of constraint is made independently for each constraint in the
+ constraint list.
+
+2) Use "``|``" between constraint code sets, creating alternatives. Every
+ constraint in the constraint list must have the same number of alternative
+ sets. With this syntax, the same alternative in *all* of the items in the
+ constraint list will be chosen together.
+
+Putting those together, you might have a two operand constraint string like
+``"rm|r,ri|rm"``. This indicates that if operand 0 is ``r`` or ``m``, then
+operand 1 may be one of ``r`` or ``i``. If operand 0 is ``r``, then operand 1
+may be one of ``r`` or ``m``. But, operand 0 and 1 cannot both be of type m.
+
+However, the use of either of the alternatives features is *NOT* recommended, as
+LLVM is not able to make an intelligent choice about which one to use. (At the
+point it currently needs to choose, not enough information is available to do so
+in a smart way.) Thus, it simply tries to make a choice that's most likely to
+compile, not one that will give optimal performance. (e.g., given "``rm``", it'll
+always choose to use memory, not registers). And, if given multiple registers,
+or multiple register classes, it will simply choose the first one. (In fact, it
+doesn't currently even ensure explicitly specified physical registers are
+unique, so specifying multiple physical registers as alternatives, like
+``{r11}{r12},{r11}{r12}``, will assign r11 to both operands, not at all what was
+intended.)
+
+Supported Constraint Code List
+""""""""""""""""""""""""""""""
+
+The constraint codes are, in general, expected to behave the same way they do in
+GCC. LLVM's support is often implemented on an 'as-needed' basis, to support C
+inline asm code which was supported by GCC. A mismatch in behavior between LLVM
+and GCC likely indicates a bug in LLVM.
+
+Some constraint codes are typically supported by all targets:
+
+- ``r``: A register in the target's general purpose register class.
+- ``m``: A memory address operand. It is target-specific what addressing modes
+ are supported; typical examples are register, register + register offset,
+ or register + immediate offset (of some target-specific size).
+- ``i``: An integer constant (of target-specific width). Allows either a simple
+ immediate, or a relocatable value.
+- ``n``: An integer constant -- *not* including relocatable values.
+- ``s``: An integer constant, but allowing *only* relocatable values.
+- ``X``: Allows an operand of any kind, no constraint whatsoever. Typically
+ useful to pass a label for an asm branch or call.
+
+ .. FIXME: but that surely isn't actually okay to jump out of an asm
+ block without telling llvm about the control transfer???)
+
+- ``{register-name}``: Requires exactly the named physical register.
+
+Other constraints are target-specific:
+
+AArch64:
+
+- ``z``: An immediate integer 0. Outputs ``WZR`` or ``XZR``, as appropriate.
+- ``I``: An immediate integer valid for an ``ADD`` or ``SUB`` instruction,
+ i.e. 0 to 4095 with optional shift by 12.
+- ``J``: An immediate integer that, when negated, is valid for an ``ADD`` or
+ ``SUB`` instruction, i.e. -1 to -4095 with optional left shift by 12.
+- ``K``: An immediate integer that is valid for the 'bitmask immediate 32' of a
+ logical instruction like ``AND``, ``EOR``, or ``ORR`` with a 32-bit register.
+- ``L``: An immediate integer that is valid for the 'bitmask immediate 64' of a
+ logical instruction like ``AND``, ``EOR``, or ``ORR`` with a 64-bit register.
+- ``M``: An immediate integer for use with the ``MOV`` assembly alias on a
+ 32-bit register. This is a superset of ``K``: in addition to the bitmask
+ immediate, also allows immediate integers which can be loaded with a single
+ ``MOVZ`` or ``MOVL`` instruction.
+- ``N``: An immediate integer for use with the ``MOV`` assembly alias on a
+ 64-bit register. This is a superset of ``L``.
+- ``Q``: Memory address operand must be in a single register (no
+ offsets). (However, LLVM currently does this for the ``m`` constraint as
+ well.)
+- ``r``: A 32 or 64-bit integer register (W* or X*).
+- ``w``: A 32, 64, or 128-bit floating-point/SIMD register.
+- ``x``: A lower 128-bit floating-point/SIMD register (``V0`` to ``V15``).
+
+AMDGPU:
+
+- ``r``: A 32 or 64-bit integer register.
+- ``[0-9]v``: The 32-bit VGPR register, number 0-9.
+- ``[0-9]s``: The 32-bit SGPR register, number 0-9.
+
+
+All ARM modes:
+
+- ``Q``, ``Um``, ``Un``, ``Uq``, ``Us``, ``Ut``, ``Uv``, ``Uy``: Memory address
+ operand. Treated the same as operand ``m``, at the moment.
+
+ARM and ARM's Thumb2 mode:
+
+- ``j``: An immediate integer between 0 and 65535 (valid for ``MOVW``)
+- ``I``: An immediate integer valid for a data-processing instruction.
+- ``J``: An immediate integer between -4095 and 4095.
+- ``K``: An immediate integer whose bitwise inverse is valid for a
+ data-processing instruction. (Can be used with template modifier "``B``" to
+ print the inverted value).
+- ``L``: An immediate integer whose negation is valid for a data-processing
+ instruction. (Can be used with template modifier "``n``" to print the negated
+ value).
+- ``M``: A power of two or an integer between 0 and 32.
+- ``N``: Invalid immediate constraint.
+- ``O``: Invalid immediate constraint.
+- ``r``: A general-purpose 32-bit integer register (``r0-r15``).
+- ``l``: In Thumb2 mode, low 32-bit GPR registers (``r0-r7``). In ARM mode, same
+ as ``r``.
+- ``h``: In Thumb2 mode, a high 32-bit GPR register (``r8-r15``). In ARM mode,
+ invalid.
+- ``w``: A 32, 64, or 128-bit floating-point/SIMD register: ``s0-s31``,
+ ``d0-d31``, or ``q0-q15``.
+- ``x``: A 32, 64, or 128-bit floating-point/SIMD register: ``s0-s15``,
+ ``d0-d7``, or ``q0-q3``.
+- ``t``: A floating-point/SIMD register, only supports 32-bit values:
+ ``s0-s31``.
+
+ARM's Thumb1 mode:
+
+- ``I``: An immediate integer between 0 and 255.
+- ``J``: An immediate integer between -255 and -1.
+- ``K``: An immediate integer between 0 and 255, with optional left-shift by
+ some amount.
+- ``L``: An immediate integer between -7 and 7.
+- ``M``: An immediate integer which is a multiple of 4 between 0 and 1020.
+- ``N``: An immediate integer between 0 and 31.
+- ``O``: An immediate integer which is a multiple of 4 between -508 and 508.
+- ``r``: A low 32-bit GPR register (``r0-r7``).
+- ``l``: A low 32-bit GPR register (``r0-r7``).
+- ``h``: A high GPR register (``r8-r15``).
+- ``w``: A 32, 64, or 128-bit floating-point/SIMD register: ``s0-s31``,
+ ``d0-d31``, or ``q0-q15``.
+- ``x``: A 32, 64, or 128-bit floating-point/SIMD register: ``s0-s15``,
+ ``d0-d7``, or ``q0-q3``.
+- ``t``: A floating-point/SIMD register, only supports 32-bit values:
+ ``s0-s31``.
+
+
+Hexagon:
+
+- ``o``, ``v``: A memory address operand, treated the same as constraint ``m``,
+ at the moment.
+- ``r``: A 32 or 64-bit register.
+
+MSP430:
+
+- ``r``: An 8 or 16-bit register.
+
+MIPS:
+
+- ``I``: An immediate signed 16-bit integer.
+- ``J``: An immediate integer zero.
+- ``K``: An immediate unsigned 16-bit integer.
+- ``L``: An immediate 32-bit integer, where the lower 16 bits are 0.
+- ``N``: An immediate integer between -65535 and -1.
+- ``O``: An immediate signed 15-bit integer.
+- ``P``: An immediate integer between 1 and 65535.
+- ``m``: A memory address operand. In MIPS-SE mode, allows a base address
+ register plus 16-bit immediate offset. In MIPS mode, just a base register.
+- ``R``: A memory address operand. In MIPS-SE mode, allows a base address
+ register plus a 9-bit signed offset. In MIPS mode, the same as constraint
+ ``m``.
+- ``ZC``: A memory address operand, suitable for use in a ``pref``, ``ll``, or
+ ``sc`` instruction on the given subtarget (details vary).
+- ``r``, ``d``, ``y``: A 32 or 64-bit GPR register.
+- ``f``: A 32 or 64-bit FPU register (``F0-F31``), or a 128-bit MSA register
+ (``W0-W31``). In the case of MSA registers, it is recommended to use the ``w``
+ argument modifier for compatibility with GCC.
+- ``c``: A 32-bit or 64-bit GPR register suitable for indirect jump (always
+ ``25``).
+- ``l``: The ``lo`` register, 32 or 64-bit.
+- ``x``: Invalid.
+
+NVPTX:
+
+- ``b``: A 1-bit integer register.
+- ``c`` or ``h``: A 16-bit integer register.
+- ``r``: A 32-bit integer register.
+- ``l`` or ``N``: A 64-bit integer register.
+- ``f``: A 32-bit float register.
+- ``d``: A 64-bit float register.
+
+
+PowerPC:
+
+- ``I``: An immediate signed 16-bit integer.
+- ``J``: An immediate unsigned 16-bit integer, shifted left 16 bits.
+- ``K``: An immediate unsigned 16-bit integer.
+- ``L``: An immediate signed 16-bit integer, shifted left 16 bits.
+- ``M``: An immediate integer greater than 31.
+- ``N``: An immediate integer that is an exact power of 2.
+- ``O``: The immediate integer constant 0.
+- ``P``: An immediate integer constant whose negation is a signed 16-bit
+ constant.
+- ``es``, ``o``, ``Q``, ``Z``, ``Zy``: A memory address operand, currently
+ treated the same as ``m``.
+- ``r``: A 32 or 64-bit integer register.
+- ``b``: A 32 or 64-bit integer register, excluding ``R0`` (that is:
+ ``R1-R31``).
+- ``f``: A 32 or 64-bit float register (``F0-F31``), or when QPX is enabled, a
+ 128 or 256-bit QPX register (``Q0-Q31``; aliases the ``F`` registers).
+- ``v``: For ``4 x f32`` or ``4 x f64`` types, when QPX is enabled, a
+ 128 or 256-bit QPX register (``Q0-Q31``), otherwise a 128-bit
+ altivec vector register (``V0-V31``).
+
+ .. FIXME: is this a bug that v accepts QPX registers? I think this
+ is supposed to only use the altivec vector registers?
+
+- ``y``: Condition register (``CR0-CR7``).
+- ``wc``: An individual CR bit in a CR register.
+- ``wa``, ``wd``, ``wf``: Any 128-bit VSX vector register, from the full VSX
+ register set (overlapping both the floating-point and vector register files).
+- ``ws``: A 32 or 64-bit floating point register, from the full VSX register
+ set.
+
+Sparc:
+
+- ``I``: An immediate 13-bit signed integer.
+- ``r``: A 32-bit integer register.
+
+SystemZ:
+
+- ``I``: An immediate unsigned 8-bit integer.
+- ``J``: An immediate unsigned 12-bit integer.
+- ``K``: An immediate signed 16-bit integer.
+- ``L``: An immediate signed 20-bit integer.
+- ``M``: An immediate integer 0x7fffffff.
+- ``Q``, ``R``, ``S``, ``T``: A memory address operand, treated the same as
+ ``m``, at the moment.
+- ``r`` or ``d``: A 32, 64, or 128-bit integer register.
+- ``a``: A 32, 64, or 128-bit integer address register (excludes R0, which in an
+ address context evaluates as zero).
+- ``h``: A 32-bit value in the high part of a 64-bit data register
+ (LLVM-specific).
+- ``f``: A 32, 64, or 128-bit floating point register.
+
+X86:
+
+- ``I``: An immediate integer between 0 and 31.
+- ``J``: An immediate integer between 0 and 64.
+- ``K``: An immediate signed 8-bit integer.
+- ``L``: An immediate integer, 0xff or 0xffff or (in 64-bit mode only)
+ 0xffffffff.
+- ``M``: An immediate integer between 0 and 3.
+- ``N``: An immediate unsigned 8-bit integer.
+- ``O``: An immediate integer between 0 and 127.
+- ``e``: An immediate 32-bit signed integer.
+- ``Z``: An immediate 32-bit unsigned integer.
+- ``o``, ``v``: Treated the same as ``m``, at the moment.
+- ``q``: An 8, 16, 32, or 64-bit register which can be accessed as an 8-bit
+ ``l`` integer register. On X86-32, this is the ``a``, ``b``, ``c``, and ``d``
+ registers, and on X86-64, it is all of the integer registers.
+- ``Q``: An 8, 16, 32, or 64-bit register which can be accessed as an 8-bit
+ ``h`` integer register. This is the ``a``, ``b``, ``c``, and ``d`` registers.
+- ``r`` or ``l``: An 8, 16, 32, or 64-bit integer register.
+- ``R``: An 8, 16, 32, or 64-bit "legacy" integer register -- one which has
+ existed since i386, and can be accessed without the REX prefix.
+- ``f``: A 32, 64, or 80-bit '387 FPU stack pseudo-register.
+- ``y``: A 64-bit MMX register, if MMX is enabled.
+- ``x``: If SSE is enabled: a 32 or 64-bit scalar operand, or 128-bit vector
+ operand in a SSE register. If AVX is also enabled, can also be a 256-bit
+ vector operand in an AVX register. If AVX-512 is also enabled, can also be a
+ 512-bit vector operand in an AVX512 register. Otherwise, an error.
+- ``Y``: The same as ``x``, if *SSE2* is enabled, otherwise an error.
+- ``A``: Special case: allocates EAX first, then EDX, for a single operand (in
+ 32-bit mode, a 64-bit integer operand will get split into two registers). It
+ is not recommended to use this constraint, as in 64-bit mode, the 64-bit
+ operand will get allocated only to RAX -- if two 32-bit operands are needed,
+ you're better off splitting it yourself, before passing it to the asm
+ statement.
+
+XCore:
+
+- ``r``: A 32-bit integer register.
+
+
+.. _inline-asm-modifiers:
+
+Asm template argument modifiers
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+In the asm template string, modifiers can be used on the operand reference, like
+"``${0:n}``".
+
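+As an illustrative sketch, the target-independent "``c``" modifier described
+below can be used to emit a bare constant into a data directive:
+
+.. code-block:: llvm
+
+    ; "${0:c}" prints the immediate without target punctuation (e.g. no "$"
+    ; prefix on x86), so this emits ".long 42" rather than ".long $42".
+    call void asm sideeffect ".long ${0:c}", "i"(i32 42)
+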
+The modifiers are, in general, expected to behave the same way they do in
+GCC. LLVM's support is often implemented on an 'as-needed' basis, to support C
+inline asm code which was supported by GCC. A mismatch in behavior between LLVM
+and GCC likely indicates a bug in LLVM.
+
+Target-independent:
+
+- ``c``: Print an immediate integer constant unadorned, without
+ the target-specific immediate punctuation (e.g. no ``$`` prefix).
+- ``n``: Negate and print immediate integer constant unadorned, without the
+ target-specific immediate punctuation (e.g. no ``$`` prefix).
+- ``l``: Print as an unadorned label, without the target-specific label
+ punctuation (e.g. no ``$`` prefix).
+
+AArch64:
+
+- ``w``: Print a GPR register with a ``w*`` name instead of ``x*`` name. E.g.,
+ instead of ``x30``, print ``w30``.
+- ``x``: Print a GPR register with an ``x*`` name. (this is the default, anyhow).
+- ``b``, ``h``, ``s``, ``d``, ``q``: Print a floating-point/SIMD register with a
+ ``b*``, ``h*``, ``s*``, ``d*``, or ``q*`` name, rather than the default of
+ ``v*``.
+
+AMDGPU:
+
+- ``r``: No effect.
+
+ARM:
+
+- ``a``: Print an operand as an address (with ``[`` and ``]`` surrounding a
+ register).
+- ``P``: No effect.
+- ``q``: No effect.
+- ``y``: Print a VFP single-precision register as an indexed double (e.g. print
+ as ``d4[1]`` instead of ``s9``).
+- ``B``: Bitwise invert and print an immediate integer constant without ``#``
+ prefix.
+- ``L``: Print the low 16-bits of an immediate integer constant.
+- ``M``: Print as a register set suitable for ldm/stm. Also prints *all*
+ register operands subsequent to the specified one (!), so use carefully.
+- ``Q``: Print the low-order register of a register-pair, or the low-order
+ register of a two-register operand.
+- ``R``: Print the high-order register of a register-pair, or the high-order
+ register of a two-register operand.
+- ``H``: Print the second register of a register-pair. (On a big-endian system,
+ ``H`` is equivalent to ``Q``, and on little-endian system, ``H`` is equivalent
+ to ``R``.)
+
+ .. FIXME: H doesn't currently support printing the second register
+ of a two-register operand.
+
+- ``e``: Print the low doubleword register of a NEON quad register.
+- ``f``: Print the high doubleword register of a NEON quad register.
+- ``m``: Print the base register of a memory operand without the ``[`` and ``]``
+ adornment.
+
+Hexagon:
+
+- ``L``: Print the second register of a two-register operand. Requires that it
+ has been allocated consecutively to the first.
+
+ .. FIXME: why is it restricted to consecutive ones? And there's
+ nothing that ensures that happens, is there?
+
+- ``I``: Print the letter 'i' if the operand is an integer constant, otherwise
+ nothing. Used to print 'addi' vs 'add' instructions.
+
+MSP430:
+
+No additional modifiers.
+
+MIPS:
+
+- ``X``: Print an immediate integer as hexadecimal.
+- ``x``: Print the low 16 bits of an immediate integer as hexadecimal.
+- ``d``: Print an immediate integer as decimal.
+- ``m``: Subtract one and print an immediate integer as decimal.
+- ``z``: Print $0 if an immediate zero, otherwise print normally.
+- ``L``: Print the low-order register of a two-register operand, or prints the
+ address of the low-order word of a double-word memory operand.
+
+ .. FIXME: L seems to be missing memory operand support.
+
+- ``M``: Print the high-order register of a two-register operand, or prints the
+ address of the high-order word of a double-word memory operand.
+
+ .. FIXME: M seems to be missing memory operand support.
+
+- ``D``: Print the second register of a two-register operand, or prints the
+ second word of a double-word memory operand. (On a big-endian system, ``D`` is
+ equivalent to ``L``, and on little-endian system, ``D`` is equivalent to
+ ``M``.)
+- ``w``: No effect. Provided for compatibility with GCC which requires this
+ modifier in order to print MSA registers (``W0-W31``) with the ``f``
+ constraint.
+
+NVPTX:
+
+- ``r``: No effect.
+
+PowerPC:
+
+- ``L``: Print the second register of a two-register operand. Requires that it
+ has been allocated consecutively to the first.
+
+ .. FIXME: why is it restricted to consecutive ones? And there's
+ nothing that ensures that happens, is there?
+
+- ``I``: Print the letter 'i' if the operand is an integer constant, otherwise
+ nothing. Used to print 'addi' vs 'add' instructions.
+- ``y``: For a memory operand, prints the operand formatted for a two-register
+ X-form instruction. (Currently always prints ``r0,OPERAND``).
+- ``U``: Prints 'u' if the memory operand is an update form, and nothing
+ otherwise. (NOTE: LLVM does not support update form, so this will currently
+ always print nothing)
+- ``X``: Prints 'x' if the memory operand is an indexed form. (NOTE: LLVM does
+ not support indexed form, so this will currently always print nothing)
+
+Sparc:
+
+- ``r``: No effect.
+
+SystemZ:
+
+SystemZ implements only ``n``, and does *not* support any of the other
+target-independent modifiers.
+
+X86:
+
+- ``c``: Print an unadorned integer or symbol name. (The latter is
+ target-specific behavior for this typically target-independent modifier).
+- ``A``: Print a register name with a '``*``' before it.
+- ``b``: Print an 8-bit register name (e.g. ``al``); do nothing on a memory
+ operand.
+- ``h``: Print the upper 8-bit register name (e.g. ``ah``); do nothing on a
+ memory operand.
+- ``w``: Print the 16-bit register name (e.g. ``ax``); do nothing on a memory
+ operand.
+- ``k``: Print the 32-bit register name (e.g. ``eax``); do nothing on a memory
+ operand.
+- ``q``: Print the 64-bit register name (e.g. ``rax``), if 64-bit registers are
+ available, otherwise the 32-bit register name; do nothing on a memory operand.
+- ``n``: Negate and print an unadorned integer, or, for operands other than an
+ immediate integer (e.g. a relocatable symbol expression), print a '-' before
+ the operand. (The behavior for relocatable symbol expressions is a
+ target-specific behavior for this typically target-independent modifier)
+- ``H``: Print a memory reference with additional offset +8.
+- ``P``: Print a memory reference or operand for use as the argument of a call
+ instruction. (E.g. omit ``(rip)``, even though it's PC-relative.)
+
+XCore:
+
+No additional modifiers.
+
+
Inline Asm Metadata
^^^^^^^^^^^^^^^^^^^
@@ -6108,7 +6729,8 @@ Overview:
The '``getelementptr``' instruction is used to get the address of a
subelement of an :ref:`aggregate <t_aggregate>` data structure. It performs
-address calculation only and does not access memory.
+address calculation only and does not access memory. The instruction can also
+be used to calculate a vector of such addresses.
Arguments:
""""""""""
@@ -6234,12 +6856,61 @@ Example:
; yields i32*:iptr
%iptr = getelementptr [10 x i32], [10 x i32]* @arr, i16 0, i16 0
-In cases where the pointer argument is a vector of pointers, each index
-must be a vector with the same number of elements. For example:
+Vector of pointers:
+"""""""""""""""""""
+
+The ``getelementptr`` instruction returns a vector of pointers, instead of a
+single address, when one or more of its arguments is a vector. In such cases,
+all vector arguments should have the same number of elements, and every scalar
+argument will be effectively broadcast into a vector during address
+calculation.
+
+.. code-block:: llvm
+
+ ; All arguments are vectors:
+ ; A[i] = ptrs[i] + offsets[i]*sizeof(i8)
+ %A = getelementptr i8, <4 x i8*> %ptrs, <4 x i64> %offsets
+
+ ; Add the same scalar offset to each pointer of a vector:
+ ; A[i] = ptrs[i] + offset*sizeof(i8)
+ %A = getelementptr i8, <4 x i8*> %ptrs, i64 %offset
+
+ ; Add distinct offsets to the same pointer:
+ ; A[i] = ptr + offsets[i]*sizeof(i8)
+ %A = getelementptr i8, i8* %ptr, <4 x i64> %offsets
+
+ ; In all cases described above the type of the result is <4 x i8*>
+
+The two following instructions are equivalent:
.. code-block:: llvm
- %A = getelementptr i8, <4 x i8*> %ptrs, <4 x i64> %offsets,
+ getelementptr %struct.ST, <4 x %struct.ST*> %s, <4 x i64> %ind1,
+ <4 x i32> <i32 2, i32 2, i32 2, i32 2>,
+ <4 x i32> <i32 1, i32 1, i32 1, i32 1>,
+ <4 x i32> %ind4,
+ <4 x i64> <i64 13, i64 13, i64 13, i64 13>
+
+ getelementptr %struct.ST, <4 x %struct.ST*> %s, <4 x i64> %ind1,
+ i32 2, i32 1, <4 x i32> %ind4, i64 13
+
+Let's look at some C code where the vector version of ``getelementptr``
+makes sense:
+
+.. code-block:: c
+
+ // Let's assume that we vectorize the following loop:
+ double *A, *B; int *C;
+ for (int i = 0; i < size; ++i) {
+ A[i] = B[C[i]];
+ }
+
+.. code-block:: llvm
+
+ ; get pointers for 8 elements from array B
+ %ptrs = getelementptr double, double* %B, <8 x i32> %C
+ ; load 8 elements from array B into A
+ %A = call <8 x double> @llvm.masked.gather.v8f64(<8 x double*> %ptrs,
+ i32 8, <8 x i1> %mask, <8 x double> %passthru)
Conversion Operations
---------------------
@@ -6913,7 +7584,7 @@ Syntax:
::
- <result> = fcmp <cond> <ty> <op1>, <op2> ; yields i1 or <N x i1>:result
+ <result> = fcmp [fast-math flags]* <cond> <ty> <op1>, <op2> ; yields i1 or <N x i1>:result
Overview:
"""""""""
@@ -6996,6 +7667,15 @@ always yields an :ref:`i1 <t_integer>` result, as follows:
#. ``uno``: yields ``true`` if either operand is a QNAN.
#. ``true``: always yields ``true``, regardless of operands.
+The ``fcmp`` instruction can also optionally take any number of
+:ref:`fast-math flags <fastmath>`, which are optimization hints to enable
+otherwise unsafe floating point optimizations.
+
+Any set of fast-math flags is legal on an ``fcmp`` instruction, but the
+only flags that have any effect on its semantics are those that allow
+assumptions to be made about the values of input arguments; namely
+``nnan``, ``ninf``, and ``nsz``. See :ref:`fastmath` for more information.
+
Example:
""""""""
@@ -7780,7 +8460,7 @@ Note that calling this intrinsic does not prevent function inlining or
other aggressive transformations, so the value returned may not be that
of the obvious source-language caller.
-'``llvm.frameescape``' and '``llvm.framerecover``' Intrinsics
+'``llvm.localescape``' and '``llvm.localrecover``' Intrinsics
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
Syntax:
@@ -7788,49 +8468,47 @@ Syntax:
::
- declare void @llvm.frameescape(...)
- declare i8* @llvm.framerecover(i8* %func, i8* %fp, i32 %idx)
+ declare void @llvm.localescape(...)
+ declare i8* @llvm.localrecover(i8* %func, i8* %fp, i32 %idx)
Overview:
"""""""""
-The '``llvm.frameescape``' intrinsic escapes offsets of a collection of static
-allocas, and the '``llvm.framerecover``' intrinsic applies those offsets to a
+The '``llvm.localescape``' intrinsic escapes offsets of a collection of static
+allocas, and the '``llvm.localrecover``' intrinsic applies those offsets to a
live frame pointer to recover the address of the allocation. The offset is
-computed during frame layout of the caller of ``llvm.frameescape``.
+computed during frame layout of the caller of ``llvm.localescape``.
Arguments:
""""""""""
-All arguments to '``llvm.frameescape``' must be pointers to static allocas or
-casts of static allocas. Each function can only call '``llvm.frameescape``'
+All arguments to '``llvm.localescape``' must be pointers to static allocas or
+casts of static allocas. Each function can only call '``llvm.localescape``'
once, and it can only do so from the entry block.
-The ``func`` argument to '``llvm.framerecover``' must be a constant
+The ``func`` argument to '``llvm.localrecover``' must be a constant
bitcasted pointer to a function defined in the current module. The code
generator cannot determine the frame allocation offset of functions defined in
other modules.
-The ``fp`` argument to '``llvm.framerecover``' must be a frame
-pointer of a call frame that is currently live. The return value of
-'``llvm.frameaddress``' is one way to produce such a value, but most platforms
-also expose the frame pointer through stack unwinding mechanisms.
+The ``fp`` argument to '``llvm.localrecover``' must be a frame pointer of a
+call frame that is currently live. The return value of '``llvm.localaddress``'
+is one way to produce such a value, but various runtimes also expose a suitable
+pointer in platform-specific ways.
-The ``idx`` argument to '``llvm.framerecover``' indicates which alloca passed to
-'``llvm.frameescape``' to recover. It is zero-indexed.
+The ``idx`` argument to '``llvm.localrecover``' indicates which alloca passed to
+'``llvm.localescape``' to recover. It is zero-indexed.
Semantics:
""""""""""
-These intrinsics allow a group of functions to access one stack memory
-allocation in an ancestor stack frame. The memory returned from
-'``llvm.frameallocate``' may be allocated prior to stack realignment, so the
-memory is only aligned to the ABI-required stack alignment. Each function may
-only call '``llvm.frameallocate``' one or zero times from the function entry
-block. The frame allocation intrinsic inhibits inlining, as any frame
-allocations in the inlined function frame are likely to be at a different
-offset from the one used by '``llvm.framerecover``' called with the
-uninlined function.
+These intrinsics allow a group of functions to share access to a set of local
+stack allocations of one parent function. The parent function may call the
+'``llvm.localescape``' intrinsic once from the function entry block, and the
+child functions can use '``llvm.localrecover``' to access the escaped allocas.
+The '``llvm.localescape``' intrinsic blocks inlining, as inlining changes where
+the escaped allocas are allocated, which would break attempts to use
+'``llvm.localrecover``'.
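+
+As a rough sketch (the surrounding code and constants are illustrative only),
+a parent function might escape one of its allocas and a child might recover
+it:
+
+.. code-block:: llvm
+
+    define void @parent() {
+    entry:
+      %x = alloca i32
+      ; Escape the frame offset of %x (index 0).
+      call void (...) @llvm.localescape(i32* %x)
+      %fp = call i8* @llvm.localaddress()
+      call void @child(i8* %fp)
+      ret void
+    }
+
+    define void @child(i8* %fp) {
+      ; Recover the address of the escaped alloca in @parent's frame.
+      %raw = call i8* @llvm.localrecover(i8* bitcast (void ()* @parent to i8*),
+                                         i8* %fp, i32 0)
+      %x = bitcast i8* %raw to i32*
+      store i32 42, i32* %x
+      ret void
+    }
+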
.. _int_read_register:
.. _int_write_register:
@@ -9532,6 +10210,75 @@ Examples:
Specialised Arithmetic Intrinsics
---------------------------------
+'``llvm.canonicalize.*``' Intrinsic
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+Syntax:
+"""""""
+
+::
+
+ declare float @llvm.canonicalize.f32(float %a)
+ declare double @llvm.canonicalize.f64(double %b)
+
+Overview:
+"""""""""
+
+The '``llvm.canonicalize.*``' intrinsic returns the platform specific canonical
+encoding of a floating point number. This canonicalization is useful for
+implementing certain numeric primitives such as frexp. The canonical encoding is
+defined by IEEE-754-2008 to be:
+
+::
+
+ 2.1.8 canonical encoding: The preferred encoding of a floating-point
+ representation in a format. Applied to declets, significands of finite
+ numbers, infinities, and NaNs, especially in decimal formats.
+
+This operation can also be considered equivalent to the IEEE-754-2008
+conversion of a floating-point value to the same format. NaNs are handled
+according to section 6.2.
+
+Examples of non-canonical encodings:
+
+- x87 pseudo denormals, pseudo NaNs, pseudo Infinity, Unnormals. These are
+ converted to a canonical representation per hardware-specific protocol.
+- Many normal decimal floating point numbers have non-canonical alternative
+ encodings.
+- Some machines, like GPUs or ARMv7 NEON, do not support subnormal values.
+ These are treated as non-canonical encodings of zero and will be flushed to
+ a zero of the same sign by this operation.
+
+Note that per IEEE-754-2008 6.2, systems that support signaling NaNs with
+default exception handling must signal an invalid exception, and produce a
+quiet NaN result.
+
+This function should always be implementable as multiplication by 1.0, provided
+that the compiler does not constant fold the operation. Likewise, division by
+1.0 and ``llvm.minnum(x, x)`` are possible implementations. Addition with
+-0.0 is also sufficient provided that the rounding mode is not -Infinity.
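+
+A minimal sketch of a call to the intrinsic, with the conceptually equivalent
+multiply-by-one lowering noted in a comment (assuming it is not constant
+folded away):
+
+.. code-block:: llvm
+
+    define float @canon(float %x) {
+      ; Returns the canonical encoding of %x.
+      %c = call float @llvm.canonicalize.f32(float %x)
+      ; Conceptually equivalent (when not folded): %c = fmul float %x, 1.0
+      ret float %c
+    }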
+
+``@llvm.canonicalize`` must preserve the equality relation. That is:
+
+- ``(@llvm.canonicalize(x) == x)`` is equivalent to ``(x == x)``
+- ``(@llvm.canonicalize(x) == @llvm.canonicalize(y))`` is equivalent to
+ ``(x == y)``
+
+Additionally, the sign of zero must be conserved:
+``@llvm.canonicalize(-0.0) = -0.0`` and ``@llvm.canonicalize(+0.0) = +0.0``
+
+The payload bits of a NaN must be conserved, with two exceptions.
+First, environments which use only a single canonical representation of NaN
+must perform said canonicalization. Second, SNaNs must be quieted per the
+usual methods.
+
+The canonicalization operation may be optimized away if:
+
+- The input is known to be canonical. For example, it was produced by a
+ floating-point operation that is required by the standard to be canonical.
+- The result is consumed only by (or fused with) other floating-point
+ operations. That is, the bits of the floating point value are not examined.
+
'``llvm.fmuladd.*``' Intrinsic
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
diff --git a/docs/ProgrammersManual.rst b/docs/ProgrammersManual.rst
index ceb39e18efdf..08cc61a187b5 100644
--- a/docs/ProgrammersManual.rst
+++ b/docs/ProgrammersManual.rst
@@ -1868,7 +1868,7 @@ Iterating over predecessors & successors of blocks
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
Iterating over the predecessors and successors of a block is quite easy with the
-routines defined in ``"llvm/Support/CFG.h"``. Just use code like this to
+routines defined in ``"llvm/IR/CFG.h"``. Just use code like this to
iterate over all predecessors of BB:
.. code-block:: c++
diff --git a/docs/StackMaps.rst b/docs/StackMaps.rst
index 43c60c9e7850..dbdf78f992ca 100644
--- a/docs/StackMaps.rst
+++ b/docs/StackMaps.rst
@@ -221,12 +221,10 @@ lowered according to the calling convention specified at the
intrinsic's callsite. Variants of the intrinsic with non-void return
type also return a value according to calling convention.
-On PowerPC, note that ``<target>`` must be the actual intended target of
-the indirect call. Specifically, even when compiling for the ELF V1 ABI,
-``<target>`` is not the function-descriptor address normally used as the C/C++
-function-pointer representation. As a result, the call target must be local
-because no adjustment or restoration of the TOC pointer (in register r2) will
-be performed.
+On PowerPC, note that ``<target>`` must be the ABI function pointer for the
+intended target of the indirect call. Specifically, when compiling for the
+ELF V1 ABI, ``<target>`` is the function-descriptor address normally used as
+the C/C++ function-pointer representation.
Requesting zero patch point arguments is valid. In this case, all
variable operands are handled just like
diff --git a/include/llvm-c/Core.h b/include/llvm-c/Core.h
index 713894f57639..15290072abe8 100644
--- a/include/llvm-c/Core.h
+++ b/include/llvm-c/Core.h
@@ -1888,6 +1888,20 @@ LLVMValueRef LLVMAddAlias(LLVMModuleRef M, LLVMTypeRef Ty, LLVMValueRef Aliasee,
void LLVMDeleteFunction(LLVMValueRef Fn);
/**
+ * Obtain the personality function attached to the function.
+ *
+ * @see llvm::Function::getPersonalityFn()
+ */
+LLVMValueRef LLVMGetPersonalityFn(LLVMValueRef Fn);
+
+/**
+ * Set the personality function attached to the function.
+ *
+ * @see llvm::Function::setPersonalityFn()
+ */
+void LLVMSetPersonalityFn(LLVMValueRef Fn, LLVMValueRef PersonalityFn);
+
+/**
* Obtain the ID number from a function instance.
*
* @see llvm::Function::getIntrinsicID()
diff --git a/include/llvm-c/Object.h b/include/llvm-c/Object.h
index 447fcea7bc24..9cab5c426c45 100644
--- a/include/llvm-c/Object.h
+++ b/include/llvm-c/Object.h
@@ -81,7 +81,6 @@ uint64_t LLVMGetSymbolAddress(LLVMSymbolIteratorRef SI);
uint64_t LLVMGetSymbolSize(LLVMSymbolIteratorRef SI);
// RelocationRef accessors
-uint64_t LLVMGetRelocationAddress(LLVMRelocationIteratorRef RI);
uint64_t LLVMGetRelocationOffset(LLVMRelocationIteratorRef RI);
LLVMSymbolIteratorRef LLVMGetRelocationSymbol(LLVMRelocationIteratorRef RI);
uint64_t LLVMGetRelocationType(LLVMRelocationIteratorRef RI);
diff --git a/include/llvm-c/lto.h b/include/llvm-c/lto.h
index 42c05a2fc59e..cb3a69160454 100644
--- a/include/llvm-c/lto.h
+++ b/include/llvm-c/lto.h
@@ -40,7 +40,7 @@ typedef bool lto_bool_t;
* @{
*/
-#define LTO_API_VERSION 16
+#define LTO_API_VERSION 17
/**
* \since prior to LTO_API_VERSION=3
@@ -63,7 +63,8 @@ typedef enum {
LTO_SYMBOL_SCOPE_PROTECTED = 0x00002000,
LTO_SYMBOL_SCOPE_DEFAULT = 0x00001800,
LTO_SYMBOL_SCOPE_DEFAULT_CAN_BE_HIDDEN = 0x00002800,
- LTO_SYMBOL_COMDAT = 0x00004000
+ LTO_SYMBOL_COMDAT = 0x00004000,
+ LTO_SYMBOL_ALIAS = 0x00008000
} lto_symbol_attributes;
/**
diff --git a/include/llvm/ADT/APFloat.h b/include/llvm/ADT/APFloat.h
index 958e3fdaea14..76615affb253 100644
--- a/include/llvm/ADT/APFloat.h
+++ b/include/llvm/ADT/APFloat.h
@@ -276,6 +276,10 @@ public:
/// \param isIEEE - If 128 bit number, select between PPC and IEEE
static APFloat getAllOnesValue(unsigned BitWidth, bool isIEEE = false);
+ /// Returns the size of the floating point number (in bits) in the given
+ /// semantics.
+ static unsigned getSizeInBits(const fltSemantics &Sem);
+
/// @}
/// Used to insert APFloat objects, or objects that contain APFloat objects,
diff --git a/include/llvm/ADT/Triple.h b/include/llvm/ADT/Triple.h
index 06f5870119c8..947812d94ecb 100644
--- a/include/llvm/ADT/Triple.h
+++ b/include/llvm/ADT/Triple.h
@@ -569,6 +569,22 @@ public:
/// architecture if no such variant can be found.
llvm::Triple get64BitArchVariant() const;
+ /// Form a triple with a big endian variant of the current architecture.
+ ///
+ /// This can be used to move across "families" of architectures where useful.
+ ///
+ /// \returns A new triple with a big endian architecture or an unknown
+ /// architecture if no such variant can be found.
+ llvm::Triple getBigEndianArchVariant() const;
+
+ /// Form a triple with a little endian variant of the current architecture.
+ ///
+ /// This can be used to move across "families" of architectures where useful.
+ ///
+ /// \returns A new triple with a little endian architecture or an unknown
+ /// architecture if no such variant can be found.
+ llvm::Triple getLittleEndianArchVariant() const;
+
/// Get the (LLVM) name of the minimum ARM CPU for the arch we are targeting.
///
/// \param Arch the architecture name (e.g., "armv7s"). If it is an empty
diff --git a/include/llvm/ADT/edit_distance.h b/include/llvm/ADT/edit_distance.h
index c2b2041242aa..06a01b18a9fb 100644
--- a/include/llvm/ADT/edit_distance.h
+++ b/include/llvm/ADT/edit_distance.h
@@ -50,50 +50,51 @@ unsigned ComputeEditDistance(ArrayRef<T> FromArray, ArrayRef<T> ToArray,
// http://en.wikipedia.org/wiki/Levenshtein_distance
//
// Although the algorithm is typically described using an m x n
- // array, only two rows are used at a time, so this implementation
- // just keeps two separate vectors for those two rows.
+ // array, only one row plus one element are used at a time, so this
+ // implementation just keeps one vector for the row. To update one entry,
+ // only the entries to the left, top, and top-left are needed. The left
+ // entry is in Row[x-1], the top entry is what's in Row[x] from the last
+ // iteration, and the top-left entry is stored in Previous.
typename ArrayRef<T>::size_type m = FromArray.size();
typename ArrayRef<T>::size_type n = ToArray.size();
const unsigned SmallBufferSize = 64;
unsigned SmallBuffer[SmallBufferSize];
std::unique_ptr<unsigned[]> Allocated;
- unsigned *Previous = SmallBuffer;
- if (2*(n + 1) > SmallBufferSize) {
- Previous = new unsigned [2*(n+1)];
- Allocated.reset(Previous);
+ unsigned *Row = SmallBuffer;
+ if (n + 1 > SmallBufferSize) {
+ Row = new unsigned[n + 1];
+ Allocated.reset(Row);
}
- unsigned *Current = Previous + (n + 1);
- for (unsigned i = 0; i <= n; ++i)
- Previous[i] = i;
+ for (unsigned i = 1; i <= n; ++i)
+ Row[i] = i;
for (typename ArrayRef<T>::size_type y = 1; y <= m; ++y) {
- Current[0] = y;
- unsigned BestThisRow = Current[0];
+ Row[0] = y;
+ unsigned BestThisRow = Row[0];
+ unsigned Previous = y - 1;
for (typename ArrayRef<T>::size_type x = 1; x <= n; ++x) {
+ int OldRow = Row[x];
if (AllowReplacements) {
- Current[x] = std::min(
- Previous[x-1] + (FromArray[y-1] == ToArray[x-1] ? 0u : 1u),
- std::min(Current[x-1], Previous[x])+1);
+ Row[x] = std::min(
+ Previous + (FromArray[y-1] == ToArray[x-1] ? 0u : 1u),
+ std::min(Row[x-1], Row[x])+1);
}
else {
- if (FromArray[y-1] == ToArray[x-1]) Current[x] = Previous[x-1];
- else Current[x] = std::min(Current[x-1], Previous[x]) + 1;
+ if (FromArray[y-1] == ToArray[x-1]) Row[x] = Previous;
+ else Row[x] = std::min(Row[x-1], Row[x]) + 1;
}
- BestThisRow = std::min(BestThisRow, Current[x]);
+ Previous = OldRow;
+ BestThisRow = std::min(BestThisRow, Row[x]);
}
if (MaxEditDistance && BestThisRow > MaxEditDistance)
return MaxEditDistance + 1;
-
- unsigned *tmp = Current;
- Current = Previous;
- Previous = tmp;
}
- unsigned Result = Previous[n];
+ unsigned Result = Row[n];
return Result;
}
diff --git a/include/llvm/Analysis/AliasAnalysis.h b/include/llvm/Analysis/AliasAnalysis.h
index f4c1167314a1..36f8199a0322 100644
--- a/include/llvm/Analysis/AliasAnalysis.h
+++ b/include/llvm/Analysis/AliasAnalysis.h
@@ -211,6 +211,8 @@ public:
/// (if it has any) are non-volatile loads from objects pointed to by its
/// pointer-typed arguments, with arbitrary offsets.
///
+ /// This property corresponds to the LLVM IR 'argmemonly' attribute combined
+ /// with the 'readonly' attribute.
/// This property corresponds to the IntrReadArgMem LLVM intrinsic flag.
OnlyReadsArgumentPointees = ArgumentPointees | Ref,
@@ -218,6 +220,7 @@ public:
/// function (if it has any) are non-volatile loads and stores from objects
/// pointed to by its pointer-typed arguments, with arbitrary offsets.
///
+ /// This property corresponds to the LLVM IR 'argmemonly' attribute.
/// This property corresponds to the IntrReadWriteArgMem LLVM intrinsic flag.
OnlyAccessesArgumentPointees = ArgumentPointees | ModRef,
@@ -518,14 +521,6 @@ public:
///
virtual void deleteValue(Value *V);
- /// copyValue - This method should be used whenever a preexisting value in the
- /// program is copied or cloned, introducing a new value. Note that analysis
- /// implementations should tolerate clients that use this method to introduce
- /// the same value multiple times: if the analysis already knows about a
- /// value, it should ignore the request.
- ///
- virtual void copyValue(Value *From, Value *To);
-
/// addEscapingUse - This method should be used whenever an escaping use is
/// added to a pointer value. Analysis implementations may either return
/// conservative responses for that value in the future, or may recompute
@@ -541,7 +536,6 @@ public:
/// above, and it provided as a helper to simplify client code.
///
void replaceWithNewValue(Value *Old, Value *New) {
- copyValue(Old, New);
deleteValue(Old);
}
};
diff --git a/include/llvm/Analysis/ConstantFolding.h b/include/llvm/Analysis/ConstantFolding.h
index 541a2109af6c..e8185b3b6307 100644
--- a/include/llvm/Analysis/ConstantFolding.h
+++ b/include/llvm/Analysis/ConstantFolding.h
@@ -72,6 +72,17 @@ namespace llvm {
Constant *ConstantFoldInsertValueInstruction(Constant *Agg, Constant *Val,
ArrayRef<unsigned> Idxs);
+/// \brief Attempt to constant fold an extractvalue instruction with the
+/// specified operands and indices. The constant result is returned if
+/// successful; if not, null is returned.
+Constant *ConstantFoldExtractValueInstruction(Constant *Agg,
+ ArrayRef<unsigned> Idxs);
+
+/// \brief Attempt to constant fold an extractelement instruction with the
+/// specified operands and indices. The constant result is returned if
+/// successful; if not, null is returned.
+Constant *ConstantFoldExtractElementInstruction(Constant *Val, Constant *Idx);
+
/// ConstantFoldLoadFromConstPtr - Return the value that a load from C would
/// produce if it is constant and determinable. If this is not determinable,
/// return null.
diff --git a/include/llvm/Analysis/DominanceFrontier.h b/include/llvm/Analysis/DominanceFrontier.h
index 996700efdb60..fb730054a8e5 100644
--- a/include/llvm/Analysis/DominanceFrontier.h
+++ b/include/llvm/Analysis/DominanceFrontier.h
@@ -202,8 +202,8 @@ public:
void dump() const;
};
-EXTERN_TEMPLATE_INSTANTIATION(class DominanceFrontierBase<BasicBlock>);
-EXTERN_TEMPLATE_INSTANTIATION(class ForwardDominanceFrontierBase<BasicBlock>);
+extern template class DominanceFrontierBase<BasicBlock>;
+extern template class ForwardDominanceFrontierBase<BasicBlock>;
} // End llvm namespace
diff --git a/include/llvm/Analysis/IVUsers.h b/include/llvm/Analysis/IVUsers.h
index ae9c1f5bd9ac..00dbcbdd7806 100644
--- a/include/llvm/Analysis/IVUsers.h
+++ b/include/llvm/Analysis/IVUsers.h
@@ -21,6 +21,7 @@
namespace llvm {
+class AssumptionCache;
class DominatorTree;
class Instruction;
class Value;
@@ -119,15 +120,19 @@ private:
class IVUsers : public LoopPass {
friend class IVStrideUse;
Loop *L;
+ AssumptionCache *AC;
LoopInfo *LI;
DominatorTree *DT;
ScalarEvolution *SE;
- SmallPtrSet<Instruction*,16> Processed;
+ SmallPtrSet<Instruction*, 16> Processed;
/// IVUses - A list of all tracked IV uses of induction variable expressions
/// we are interested in.
ilist<IVStrideUse> IVUses;
+ // Ephemeral values used by @llvm.assume in this function.
+ SmallPtrSet<const Value *, 32> EphValues;
+
void getAnalysisUsage(AnalysisUsage &AU) const override;
bool runOnLoop(Loop *L, LPPassManager &LPM) override;
diff --git a/include/llvm/Analysis/InstructionSimplify.h b/include/llvm/Analysis/InstructionSimplify.h
index 706bd8000d3a..d44c5ff4078d 100644
--- a/include/llvm/Analysis/InstructionSimplify.h
+++ b/include/llvm/Analysis/InstructionSimplify.h
@@ -212,7 +212,7 @@ namespace llvm {
/// SimplifyFCmpInst - Given operands for an FCmpInst, see if we can
/// fold the result. If not, this returns null.
Value *SimplifyFCmpInst(unsigned Predicate, Value *LHS, Value *RHS,
- const DataLayout &DL,
+ FastMathFlags FMF, const DataLayout &DL,
const TargetLibraryInfo *TLI = nullptr,
const DominatorTree *DT = nullptr,
AssumptionCache *AC = nullptr,
@@ -244,6 +244,24 @@ namespace llvm {
AssumptionCache *AC = nullptr,
const Instruction *CxtI = nullptr);
+ /// \brief Given operands for an ExtractValueInst, see if we can fold the
+ /// result. If not, this returns null.
+ Value *SimplifyExtractValueInst(Value *Agg, ArrayRef<unsigned> Idxs,
+ const DataLayout &DL,
+ const TargetLibraryInfo *TLI = nullptr,
+ const DominatorTree *DT = nullptr,
+ AssumptionCache *AC = nullptr,
+ const Instruction *CxtI = nullptr);
+
+ /// \brief Given operands for an ExtractElementInst, see if we can fold the
+ /// result. If not, this returns null.
+ Value *SimplifyExtractElementInst(Value *Vec, Value *Idx,
+ const DataLayout &DL,
+ const TargetLibraryInfo *TLI = nullptr,
+ const DominatorTree *DT = nullptr,
+ AssumptionCache *AC = nullptr,
+ const Instruction *CxtI = nullptr);
+
/// SimplifyTruncInst - Given operands for an TruncInst, see if we can fold
/// the result. If not, this returns null.
Value *SimplifyTruncInst(Value *Op, Type *Ty, const DataLayout &DL,
diff --git a/include/llvm/Analysis/JumpInstrTableInfo.h b/include/llvm/Analysis/JumpInstrTableInfo.h
deleted file mode 100644
index b6dad478cdf2..000000000000
--- a/include/llvm/Analysis/JumpInstrTableInfo.h
+++ /dev/null
@@ -1,71 +0,0 @@
-//===-- JumpInstrTableInfo.h: Info for Jump-Instruction Tables --*- C++ -*-===//
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
-//
-//===----------------------------------------------------------------------===//
-///
-/// \file
-/// \brief Information about jump-instruction tables that have been created by
-/// JumpInstrTables pass.
-///
-//===----------------------------------------------------------------------===//
-
-#ifndef LLVM_ANALYSIS_JUMPINSTRTABLEINFO_H
-#define LLVM_ANALYSIS_JUMPINSTRTABLEINFO_H
-
-#include "llvm/ADT/DenseMap.h"
-#include "llvm/Pass.h"
-#include <vector>
-
-namespace llvm {
-class Function;
-class FunctionType;
-
-/// This class stores information about jump-instruction tables created by the
-/// JumpInstrTables pass (in lib/CodeGen/JumpInstrTables.cpp). Each table is a
-/// map from a function type to a vector of pairs. The first element of each
-/// pair is the function that has the jumptable annotation. The second element
-/// is a function that was declared by JumpInstrTables and used to replace all
-/// address-taking sites for the original function.
-///
-/// The information in this pass is used in AsmPrinter
-/// (lib/CodeGen/AsmPrinter/AsmPrinter.cpp) to generate the required assembly
-/// for the jump-instruction tables.
-class JumpInstrTableInfo : public ImmutablePass {
-public:
- static char ID;
-
- /// The default byte alignment for jump tables is 16, which is large but
- /// usually safe.
- JumpInstrTableInfo(uint64_t ByteAlign = 16);
- ~JumpInstrTableInfo() override;
- const char *getPassName() const override {
- return "Jump-Instruction Table Info";
- }
-
- typedef std::pair<Function *, Function *> JumpPair;
- typedef DenseMap<FunctionType *, std::vector<JumpPair> > JumpTables;
-
- /// Inserts an entry in a table, adding the table if it doesn't exist.
- void insertEntry(FunctionType *TableFunTy, Function *Target, Function *Jump);
-
- /// Gets the tables.
- const JumpTables &getTables() const { return Tables; }
-
- /// Gets the alignment in bytes of a jumptable entry.
- uint64_t entryByteAlignment() const { return ByteAlignment; }
-private:
- JumpTables Tables;
-
- /// A power-of-two alignment of a jumptable entry.
- uint64_t ByteAlignment;
-};
-
-/// Creates a JumpInstrTableInfo pass with the given bound on entry size. This
-/// bound specifies the maximum number of bytes needed to represent an
-/// unconditional jump or a trap instruction in the back end currently in use.
-ModulePass *createJumpInstrTableInfoPass(unsigned Bound);
-}
-
-#endif /* LLVM_ANALYSIS_JUMPINSTRTABLEINFO_H */
diff --git a/include/llvm/Analysis/LibCallSemantics.h b/include/llvm/Analysis/LibCallSemantics.h
index 170e2a49a8ea..b4bef310e590 100644
--- a/include/llvm/Analysis/LibCallSemantics.h
+++ b/include/llvm/Analysis/LibCallSemantics.h
@@ -206,6 +206,18 @@ class InvokeInst;
llvm_unreachable("invalid enum");
}
+ /// \brief Return true if this personality may be safely removed if there
+ /// are no invoke instructions remaining in the current function.
+ inline bool isNoOpWithoutInvoke(EHPersonality Pers) {
+ switch (Pers) {
+ case EHPersonality::Unknown:
+ return false;
+ // All known personalities currently have this behavior
+ default: return true;
+ }
+ llvm_unreachable("invalid enum");
+ }
+
bool canSimplifyInvokeNoUnwind(const Function *F);
} // end namespace llvm
diff --git a/include/llvm/Analysis/LoopAccessAnalysis.h b/include/llvm/Analysis/LoopAccessAnalysis.h
index 7b635a8b4960..476e4b6686bb 100644
--- a/include/llvm/Analysis/LoopAccessAnalysis.h
+++ b/include/llvm/Analysis/LoopAccessAnalysis.h
@@ -292,6 +292,133 @@ private:
bool couldPreventStoreLoadForward(unsigned Distance, unsigned TypeByteSize);
};
+/// \brief Holds information about the memory runtime legality checks to verify
+/// that a group of pointers do not overlap.
+class RuntimePointerChecking {
+public:
+ struct PointerInfo {
+ /// Holds the pointer value that we need to check.
+ TrackingVH<Value> PointerValue;
+ /// Holds the pointer value at the beginning of the loop.
+ const SCEV *Start;
+ /// Holds the pointer value at the end of the loop.
+ const SCEV *End;
+ /// Holds the information if this pointer is used for writing to memory.
+ bool IsWritePtr;
+ /// Holds the id of the set of pointers that could be dependent because of a
+ /// shared underlying object.
+ unsigned DependencySetId;
+ /// Holds the id of the disjoint alias set to which this pointer belongs.
+ unsigned AliasSetId;
+ /// SCEV for the access.
+ const SCEV *Expr;
+
+ PointerInfo(Value *PointerValue, const SCEV *Start, const SCEV *End,
+ bool IsWritePtr, unsigned DependencySetId, unsigned AliasSetId,
+ const SCEV *Expr)
+ : PointerValue(PointerValue), Start(Start), End(End),
+ IsWritePtr(IsWritePtr), DependencySetId(DependencySetId),
+ AliasSetId(AliasSetId), Expr(Expr) {}
+ };
+
+ RuntimePointerChecking(ScalarEvolution *SE) : Need(false), SE(SE) {}
+
+ /// Reset the state of the pointer runtime information.
+ void reset() {
+ Need = false;
+ Pointers.clear();
+ }
+
+ /// Insert a pointer and calculate the start and end SCEVs.
+ void insert(Loop *Lp, Value *Ptr, bool WritePtr, unsigned DepSetId,
+ unsigned ASId, const ValueToValueMap &Strides);
+
+ /// \brief No run-time memory checking is necessary.
+ bool empty() const { return Pointers.empty(); }
+
+ /// A grouping of pointers. A single memcheck is required between
+ /// two groups.
+ struct CheckingPtrGroup {
+ /// \brief Create a new pointer checking group containing a single
+ /// pointer, with index \p Index in RtCheck.
+ CheckingPtrGroup(unsigned Index, RuntimePointerChecking &RtCheck)
+ : RtCheck(RtCheck), High(RtCheck.Pointers[Index].End),
+ Low(RtCheck.Pointers[Index].Start) {
+ Members.push_back(Index);
+ }
+
+ /// \brief Tries to add the pointer recorded in RtCheck at index
+ /// \p Index to this pointer checking group. We can only add a pointer
+ /// to a checking group if we will still be able to get
+ /// the upper and lower bounds of the check. Returns true in case
+ /// of success, false otherwise.
+ bool addPointer(unsigned Index);
+
+ /// Constitutes the context of this pointer checking group. For each
+ /// pointer that is a member of this group we will retain the index
+ /// at which it appears in RtCheck.
+ RuntimePointerChecking &RtCheck;
+ /// The SCEV expression which represents the upper bound of all the
+ /// pointers in this group.
+ const SCEV *High;
+ /// The SCEV expression which represents the lower bound of all the
+ /// pointers in this group.
+ const SCEV *Low;
+ /// Indices of all the pointers that constitute this grouping.
+ SmallVector<unsigned, 2> Members;
+ };
+
+ /// \brief Groups pointers such that a single memcheck is required
+ /// between two different groups. This will clear the CheckingGroups vector
+ /// and re-compute it. We will only group dependencies if \p UseDependencies
+ /// is true, otherwise we will create a separate group for each pointer.
+ void groupChecks(MemoryDepChecker::DepCandidates &DepCands,
+ bool UseDependencies);
+
+ /// \brief Decide if we need to add a check between two groups of pointers,
+ /// according to needsChecking.
+ bool needsChecking(const CheckingPtrGroup &M, const CheckingPtrGroup &N,
+ const SmallVectorImpl<int> *PtrPartition) const;
+
+ /// \brief Return true if any pointer requires run-time checking according
+ /// to needsChecking.
+ bool needsAnyChecking(const SmallVectorImpl<int> *PtrPartition) const;
+
+ /// \brief Returns the number of run-time checks required according to
+ /// needsChecking.
+ unsigned getNumberOfChecks(const SmallVectorImpl<int> *PtrPartition) const;
+
+ /// \brief Print the list of run-time memory checks necessary.
+ ///
+ /// If \p PtrPartition is set, it contains the partition number for
+ /// pointers (-1 if the pointer belongs to multiple partitions). In this
+ /// case omit checks between pointers belonging to the same partition.
+ void print(raw_ostream &OS, unsigned Depth = 0,
+ const SmallVectorImpl<int> *PtrPartition = nullptr) const;
+
+ /// This flag indicates if we need to add the runtime check.
+ bool Need;
+
+ /// Information about the pointers that may require checking.
+ SmallVector<PointerInfo, 2> Pointers;
+
+ /// Holds a partitioning of pointers into "check groups".
+ SmallVector<CheckingPtrGroup, 2> CheckingGroups;
+
+private:
+ /// \brief Decide whether we need to issue a run-time check for the pointers
+ /// at indices \p I and \p J to prove their independence.
+ ///
+ /// If \p PtrPartition is set, it contains the partition number for
+ /// pointers (-1 if the pointer belongs to multiple partitions). In this
+ /// case omit checks between pointers belonging to the same partition.
+ bool needsChecking(unsigned I, unsigned J,
+ const SmallVectorImpl<int> *PtrPartition) const;
+
+ /// Holds a pointer to the ScalarEvolution analysis.
+ ScalarEvolution *SE;
+};
+
/// \brief Drive the analysis of memory accesses in the loop
///
/// This class is responsible for analyzing the memory accesses of a loop. It
@@ -308,72 +435,6 @@ private:
/// RuntimePointerCheck class.
class LoopAccessInfo {
public:
- /// This struct holds information about the memory runtime legality check that
- /// a group of pointers do not overlap.
- struct RuntimePointerCheck {
- RuntimePointerCheck() : Need(false) {}
-
- /// Reset the state of the pointer runtime information.
- void reset() {
- Need = false;
- Pointers.clear();
- Starts.clear();
- Ends.clear();
- IsWritePtr.clear();
- DependencySetId.clear();
- AliasSetId.clear();
- }
-
- /// Insert a pointer and calculate the start and end SCEVs.
- void insert(ScalarEvolution *SE, Loop *Lp, Value *Ptr, bool WritePtr,
- unsigned DepSetId, unsigned ASId,
- const ValueToValueMap &Strides);
-
- /// \brief No run-time memory checking is necessary.
- bool empty() const { return Pointers.empty(); }
-
- /// \brief Decide whether we need to issue a run-time check for pointer at
- /// index \p I and \p J to prove their independence.
- ///
- /// If \p PtrPartition is set, it contains the partition number for
- /// pointers (-1 if the pointer belongs to multiple partitions). In this
- /// case omit checks between pointers belonging to the same partition.
- bool needsChecking(unsigned I, unsigned J,
- const SmallVectorImpl<int> *PtrPartition) const;
-
- /// \brief Return true if any pointer requires run-time checking according
- /// to needsChecking.
- bool needsAnyChecking(const SmallVectorImpl<int> *PtrPartition) const;
-
- /// \brief Returns the number of run-time checks required according to
- /// needsChecking.
- unsigned getNumberOfChecks(const SmallVectorImpl<int> *PtrPartition) const;
-
- /// \brief Print the list run-time memory checks necessary.
- ///
- /// If \p PtrPartition is set, it contains the partition number for
- /// pointers (-1 if the pointer belongs to multiple partitions). In this
- /// case omit checks between pointers belonging to the same partition.
- void print(raw_ostream &OS, unsigned Depth = 0,
- const SmallVectorImpl<int> *PtrPartition = nullptr) const;
-
- /// This flag indicates if we need to add the runtime check.
- bool Need;
- /// Holds the pointers that we need to check.
- SmallVector<TrackingVH<Value>, 2> Pointers;
- /// Holds the pointer value at the beginning of the loop.
- SmallVector<const SCEV*, 2> Starts;
- /// Holds the pointer value at the end of the loop.
- SmallVector<const SCEV*, 2> Ends;
- /// Holds the information if this pointer is used for writing to memory.
- SmallVector<bool, 2> IsWritePtr;
- /// Holds the id of the set of pointers that could be dependent because of a
- /// shared underlying object.
- SmallVector<unsigned, 2> DependencySetId;
- /// Holds the id of the disjoint alias set to which this pointer belongs.
- SmallVector<unsigned, 2> AliasSetId;
- };
-
LoopAccessInfo(Loop *L, ScalarEvolution *SE, const DataLayout &DL,
const TargetLibraryInfo *TLI, AliasAnalysis *AA,
DominatorTree *DT, LoopInfo *LI,
@@ -383,15 +444,15 @@ public:
/// no memory dependence cycles.
bool canVectorizeMemory() const { return CanVecMem; }
- const RuntimePointerCheck *getRuntimePointerCheck() const {
- return &PtrRtCheck;
+ const RuntimePointerChecking *getRuntimePointerChecking() const {
+ return &PtrRtChecking;
}
/// \brief Number of memchecks required to prove independence of otherwise
/// may-alias pointers.
unsigned getNumRuntimePointerChecks(
const SmallVectorImpl<int> *PtrPartition = nullptr) const {
- return PtrRtCheck.getNumberOfChecks(PtrPartition);
+ return PtrRtChecking.getNumberOfChecks(PtrPartition);
}
/// Return true if the block BB needs to be predicated in order for the loop
@@ -461,7 +522,7 @@ private:
/// We need to check that all of the pointers in this list are disjoint
/// at runtime.
- RuntimePointerCheck PtrRtCheck;
+ RuntimePointerChecking PtrRtChecking;
/// \brief the Memory Dependence Checker which can determine the
/// loop-independent and loop-carried dependences between memory accesses.
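
The RuntimePointerChecking class introduced above folds the old parallel arrays into one PointerInfo record per pointer and adds CheckingPtrGroup, so a single overlap test can cover every pointer that shares the same Low/High SCEV bounds. A minimal sketch of how a client might count the group-to-group checks follows; LAI is assumed to be a LoopAccessInfo computed elsewhere, and no pointer partitioning is used.

    // Hedged sketch using only the members declared in the diff above.
    const RuntimePointerChecking *RtPtrChecking = LAI.getRuntimePointerChecking();
    unsigned NumChecks = 0;
    for (unsigned I = 0, E = RtPtrChecking->CheckingGroups.size(); I != E; ++I)
      for (unsigned J = I + 1; J != E; ++J)
        if (RtPtrChecking->needsChecking(RtPtrChecking->CheckingGroups[I],
                                         RtPtrChecking->CheckingGroups[J],
                                         /*PtrPartition=*/nullptr))
          ++NumChecks;
    // Each counted pair is lowered to one run-time comparison of the groups'
    // Low and High bounds, instead of one comparison per pointer pair.
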
diff --git a/include/llvm/Analysis/LoopInfo.h b/include/llvm/Analysis/LoopInfo.h
index bbcde8d9721a..3ec83f2c21fd 100644
--- a/include/llvm/Analysis/LoopInfo.h
+++ b/include/llvm/Analysis/LoopInfo.h
@@ -347,9 +347,7 @@ raw_ostream& operator<<(raw_ostream &OS, const LoopBase<BlockT, LoopT> &Loop) {
}
// Implementation in LoopInfoImpl.h
-#ifdef __GNUC__
-__extension__ extern template class LoopBase<BasicBlock, Loop>;
-#endif
+extern template class LoopBase<BasicBlock, Loop>;
class Loop : public LoopBase<BasicBlock, Loop> {
public:
@@ -633,9 +631,7 @@ public:
};
// Implementation in LoopInfoImpl.h
-#ifdef __GNUC__
-__extension__ extern template class LoopInfoBase<BasicBlock, Loop>;
-#endif
+extern template class LoopInfoBase<BasicBlock, Loop>;
class LoopInfo : public LoopInfoBase<BasicBlock, Loop> {
typedef LoopInfoBase<BasicBlock, Loop> BaseT;
diff --git a/include/llvm/Analysis/RegionInfo.h b/include/llvm/Analysis/RegionInfo.h
index 7ceb086ee0a1..8560f1f67160 100644
--- a/include/llvm/Analysis/RegionInfo.h
+++ b/include/llvm/Analysis/RegionInfo.h
@@ -902,9 +902,9 @@ inline raw_ostream &operator<<(raw_ostream &OS,
return OS << Node.template getNodeAs<BlockT>()->getName();
}
-EXTERN_TEMPLATE_INSTANTIATION(class RegionBase<RegionTraits<Function>>);
-EXTERN_TEMPLATE_INSTANTIATION(class RegionNodeBase<RegionTraits<Function>>);
-EXTERN_TEMPLATE_INSTANTIATION(class RegionInfoBase<RegionTraits<Function>>);
+extern template class RegionBase<RegionTraits<Function>>;
+extern template class RegionNodeBase<RegionTraits<Function>>;
+extern template class RegionInfoBase<RegionTraits<Function>>;
} // End llvm namespace
#endif
diff --git a/include/llvm/Analysis/TargetTransformInfo.h b/include/llvm/Analysis/TargetTransformInfo.h
index bb6e266b1f5b..01f00896410e 100644
--- a/include/llvm/Analysis/TargetTransformInfo.h
+++ b/include/llvm/Analysis/TargetTransformInfo.h
@@ -69,7 +69,7 @@ public:
///
/// The TTI implementation will reflect the information in the DataLayout
/// provided if non-null.
- explicit TargetTransformInfo(const DataLayout *DL);
+ explicit TargetTransformInfo(const DataLayout &DL);
// Provide move semantics.
TargetTransformInfo(TargetTransformInfo &&Arg);
@@ -541,7 +541,7 @@ private:
class TargetTransformInfo::Concept {
public:
virtual ~Concept() = 0;
-
+ virtual const DataLayout &getDataLayout() const = 0;
virtual unsigned getOperationCost(unsigned Opcode, Type *Ty, Type *OpTy) = 0;
virtual unsigned getGEPCost(const Value *Ptr,
ArrayRef<const Value *> Operands) = 0;
@@ -636,6 +636,10 @@ public:
Model(T Impl) : Impl(std::move(Impl)) {}
~Model() override {}
+ const DataLayout &getDataLayout() const override {
+ return Impl.getDataLayout();
+ }
+
unsigned getOperationCost(unsigned Opcode, Type *Ty, Type *OpTy) override {
return Impl.getOperationCost(Opcode, Ty, OpTy);
}
diff --git a/include/llvm/Analysis/TargetTransformInfoImpl.h b/include/llvm/Analysis/TargetTransformInfoImpl.h
index 403175acae02..035cb04870a1 100644
--- a/include/llvm/Analysis/TargetTransformInfoImpl.h
+++ b/include/llvm/Analysis/TargetTransformInfoImpl.h
@@ -30,26 +30,17 @@ class TargetTransformInfoImplBase {
protected:
typedef TargetTransformInfo TTI;
- const DataLayout *DL;
+ const DataLayout &DL;
- explicit TargetTransformInfoImplBase(const DataLayout *DL)
- : DL(DL) {}
+ explicit TargetTransformInfoImplBase(const DataLayout &DL) : DL(DL) {}
public:
// Provide value semantics. MSVC requires that we spell all of these out.
TargetTransformInfoImplBase(const TargetTransformInfoImplBase &Arg)
: DL(Arg.DL) {}
- TargetTransformInfoImplBase(TargetTransformInfoImplBase &&Arg)
- : DL(std::move(Arg.DL)) {}
- TargetTransformInfoImplBase &
- operator=(const TargetTransformInfoImplBase &RHS) {
- DL = RHS.DL;
- return *this;
- }
- TargetTransformInfoImplBase &operator=(TargetTransformInfoImplBase &&RHS) {
- DL = std::move(RHS.DL);
- return *this;
- }
+ TargetTransformInfoImplBase(TargetTransformInfoImplBase &&Arg) : DL(Arg.DL) {}
+
+ const DataLayout &getDataLayout() const { return DL; }
unsigned getOperationCost(unsigned Opcode, Type *Ty, Type *OpTy) {
switch (Opcode) {
@@ -70,28 +61,22 @@ public:
return TTI::TCC_Basic;
case Instruction::IntToPtr: {
- if (!DL)
- return TTI::TCC_Basic;
-
// An inttoptr cast is free so long as the input is a legal integer type
// which doesn't contain values outside the range of a pointer.
unsigned OpSize = OpTy->getScalarSizeInBits();
- if (DL->isLegalInteger(OpSize) &&
- OpSize <= DL->getPointerTypeSizeInBits(Ty))
+ if (DL.isLegalInteger(OpSize) &&
+ OpSize <= DL.getPointerTypeSizeInBits(Ty))
return TTI::TCC_Free;
// Otherwise it's not a no-op.
return TTI::TCC_Basic;
}
case Instruction::PtrToInt: {
- if (!DL)
- return TTI::TCC_Basic;
-
// A ptrtoint cast is free so long as the result is large enough to store
// the pointer, and a legal integer type.
unsigned DestSize = Ty->getScalarSizeInBits();
- if (DL->isLegalInteger(DestSize) &&
- DestSize >= DL->getPointerTypeSizeInBits(OpTy))
+ if (DL.isLegalInteger(DestSize) &&
+ DestSize >= DL.getPointerTypeSizeInBits(OpTy))
return TTI::TCC_Free;
// Otherwise it's not a no-op.
@@ -100,7 +85,7 @@ public:
case Instruction::Trunc:
// trunc to a native type is free (assuming the target has compare and
// shift-right of the same width).
- if (DL && DL->isLegalInteger(DL->getTypeSizeInBits(Ty)))
+ if (DL.isLegalInteger(DL.getTypeSizeInBits(Ty)))
return TTI::TCC_Free;
return TTI::TCC_Basic;
@@ -353,8 +338,7 @@ private:
typedef TargetTransformInfoImplBase BaseT;
protected:
- explicit TargetTransformInfoImplCRTPBase(const DataLayout *DL)
- : BaseT(DL) {}
+ explicit TargetTransformInfoImplCRTPBase(const DataLayout &DL) : BaseT(DL) {}
public:
// Provide value semantics. MSVC requires that we spell all of these out.
@@ -362,16 +346,6 @@ public:
: BaseT(static_cast<const BaseT &>(Arg)) {}
TargetTransformInfoImplCRTPBase(TargetTransformInfoImplCRTPBase &&Arg)
: BaseT(std::move(static_cast<BaseT &>(Arg))) {}
- TargetTransformInfoImplCRTPBase &
- operator=(const TargetTransformInfoImplCRTPBase &RHS) {
- BaseT::operator=(static_cast<const BaseT &>(RHS));
- return *this;
- }
- TargetTransformInfoImplCRTPBase &
- operator=(TargetTransformInfoImplCRTPBase &&RHS) {
- BaseT::operator=(std::move(static_cast<BaseT &>(RHS)));
- return *this;
- }
using BaseT::getCallCost;
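
Since DL is now a reference, the IntToPtr/PtrToInt/Trunc rules above drop their null checks but keep the same cost model: a cast is free only when the integer type is legal and wide enough for the pointer. The sketch below evaluates that predicate against a hypothetical 64-bit DataLayout string; getPointerSizeInBits(0) stands in for getPointerTypeSizeInBits(OpTy) on an address-space-0 pointer.

    // Hedged sketch of the PtrToInt free-cast rule; the layout string is made up.
    #include "llvm/IR/DataLayout.h"

    bool ptrToIntIsFree(unsigned DestBits) {
      llvm::DataLayout DL("e-p:64:64-n8:16:32:64");  // 64-bit pointers, legal i64
      return DL.isLegalInteger(DestBits) && DestBits >= DL.getPointerSizeInBits(0);
    }
    // ptrToIntIsFree(64) -> true  (TCC_Free in getOperationCost)
    // ptrToIntIsFree(32) -> false (TCC_Basic: the result cannot hold a pointer)
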
diff --git a/include/llvm/Analysis/VectorUtils.h b/include/llvm/Analysis/VectorUtils.h
index aa538ecc0137..d8e9ca42e623 100644
--- a/include/llvm/Analysis/VectorUtils.h
+++ b/include/llvm/Analysis/VectorUtils.h
@@ -20,6 +20,12 @@
namespace llvm {
+class GetElementPtrInst;
+class Loop;
+class ScalarEvolution;
+class Type;
+class Value;
+
/// \brief Identify if the intrinsic is trivially vectorizable.
/// This method returns true if the intrinsic's argument types are all
/// scalars for the scalar form of the intrinsic and all vectors for
@@ -51,6 +57,28 @@ Intrinsic::ID checkBinaryFloatSignature(const CallInst &I,
/// its intrinsic ID, in case it does not found it return not_intrinsic.
Intrinsic::ID getIntrinsicIDForCall(CallInst *CI, const TargetLibraryInfo *TLI);
+/// \brief Find the operand of the GEP that should be checked for consecutive
+/// stores. This ignores trailing indices that have no effect on the final
+/// pointer.
+unsigned getGEPInductionOperand(const GetElementPtrInst *Gep);
+
+/// \brief If the argument is a GEP, then returns the operand identified by
+/// getGEPInductionOperand. However, if there is some other non-loop-invariant
+/// operand, it returns that instead.
+Value *stripGetElementPtr(Value *Ptr, ScalarEvolution *SE, Loop *Lp);
+
+/// \brief If a value has only one user that is a CastInst, return it.
+Value *getUniqueCastUse(Value *Ptr, Loop *Lp, Type *Ty);
+
+/// \brief Get the stride of a pointer access in a loop. Looks for symbolic
+/// strides "a[i*stride]". Returns the symbolic stride, or null otherwise.
+Value *getStrideFromPointer(Value *Ptr, ScalarEvolution *SE, Loop *Lp);
+
+/// \brief Given a vector and an element number, see if the scalar value is
+/// already around as a register, for example if it were inserted then extracted
+/// from the vector.
+Value *findScalarElement(Value *V, unsigned EltNo);
+
} // llvm namespace
#endif
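
These declarations move several vectorizer helpers into a shared header. As one concrete use, findScalarElement lets a transform recover a scalar that is already available instead of emitting an extractelement; a minimal sketch, assuming Vec and Scalar have matching vector and element types:

    // Hedged sketch; the IRBuilder calls are pre-existing API, demo() is hypothetical.
    llvm::Value *demo(llvm::IRBuilder<> &B, llvm::Value *Vec, llvm::Value *Scalar) {
      llvm::Value *V = B.CreateInsertElement(Vec, Scalar, B.getInt32(3));
      // Element 3 of V is known to be Scalar, so no extractelement is needed.
      return llvm::findScalarElement(V, 3);
    }
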
diff --git a/include/llvm/Bitcode/LLVMBitCodes.h b/include/llvm/Bitcode/LLVMBitCodes.h
index 605c4172dd87..7130ee755237 100644
--- a/include/llvm/Bitcode/LLVMBitCodes.h
+++ b/include/llvm/Bitcode/LLVMBitCodes.h
@@ -407,6 +407,7 @@ namespace bitc {
ATTR_KIND_DEREFERENCEABLE_OR_NULL = 42,
ATTR_KIND_CONVERGENT = 43,
ATTR_KIND_SAFESTACK = 44,
+ ATTR_KIND_ARGMEMONLY = 45
};
enum ComdatSelectionKindCodes {
diff --git a/include/llvm/Bitcode/ReaderWriter.h b/include/llvm/Bitcode/ReaderWriter.h
index 6797aa133c42..452ec3bd0187 100644
--- a/include/llvm/Bitcode/ReaderWriter.h
+++ b/include/llvm/Bitcode/ReaderWriter.h
@@ -146,7 +146,7 @@ namespace llvm {
}
const std::error_category &BitcodeErrorCategory();
- enum class BitcodeError { InvalidBitcodeSignature, CorruptedBitcode };
+ enum class BitcodeError { InvalidBitcodeSignature = 1, CorruptedBitcode };
inline std::error_code make_error_code(BitcodeError E) {
return std::error_code(static_cast<int>(E), BitcodeErrorCategory());
}
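
The only change here is starting the enum at 1. A std::error_code with value 0 converts to false (it means "success"), so an error enumerator mapped to 0 would be indistinguishable from no error; a short illustration:

    // Hedged illustration of why InvalidBitcodeSignature must not be 0.
    std::error_code EC =
        llvm::make_error_code(llvm::BitcodeError::InvalidBitcodeSignature);
    assert(EC && "non-zero enumerator, so this is reported as a real error");
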
diff --git a/include/llvm/CodeGen/Analysis.h b/include/llvm/CodeGen/Analysis.h
index c4b94ede4f55..82d1e8ada17d 100644
--- a/include/llvm/CodeGen/Analysis.h
+++ b/include/llvm/CodeGen/Analysis.h
@@ -64,7 +64,7 @@ inline unsigned ComputeLinearIndex(Type *Ty,
/// If Offsets is non-null, it points to a vector to be filled in
/// with the in-memory offsets of each of the individual values.
///
-void ComputeValueVTs(const TargetLowering &TLI, Type *Ty,
+void ComputeValueVTs(const TargetLowering &TLI, const DataLayout &DL, Type *Ty,
SmallVectorImpl<EVT> &ValueVTs,
SmallVectorImpl<uint64_t> *Offsets = nullptr,
uint64_t StartingOffset = 0);
diff --git a/include/llvm/CodeGen/BasicTTIImpl.h b/include/llvm/CodeGen/BasicTTIImpl.h
index 3e464f4f1e5a..9ba25169fda6 100644
--- a/include/llvm/CodeGen/BasicTTIImpl.h
+++ b/include/llvm/CodeGen/BasicTTIImpl.h
@@ -91,8 +91,10 @@ private:
}
protected:
- explicit BasicTTIImplBase(const TargetMachine *TM)
- : BaseT(TM->getDataLayout()) {}
+ explicit BasicTTIImplBase(const TargetMachine *TM, const DataLayout &DL)
+ : BaseT(DL) {}
+
+ using TargetTransformInfoImplBase::DL;
public:
// Provide value semantics. MSVC requires that we spell all of these out.
@@ -100,14 +102,6 @@ public:
: BaseT(static_cast<const BaseT &>(Arg)) {}
BasicTTIImplBase(BasicTTIImplBase &&Arg)
: BaseT(std::move(static_cast<BaseT &>(Arg))) {}
- BasicTTIImplBase &operator=(const BasicTTIImplBase &RHS) {
- BaseT::operator=(static_cast<const BaseT &>(RHS));
- return *this;
- }
- BasicTTIImplBase &operator=(BasicTTIImplBase &&RHS) {
- BaseT::operator=(std::move(static_cast<BaseT &>(RHS)));
- return *this;
- }
/// \name Scalar TTI Implementations
/// @{
@@ -132,7 +126,7 @@ public:
AM.BaseOffs = BaseOffset;
AM.HasBaseReg = HasBaseReg;
AM.Scale = Scale;
- return getTLI()->isLegalAddressingMode(AM, Ty, AddrSpace);
+ return getTLI()->isLegalAddressingMode(DL, AM, Ty, AddrSpace);
}
int getScalingFactorCost(Type *Ty, GlobalValue *BaseGV, int64_t BaseOffset,
@@ -142,7 +136,7 @@ public:
AM.BaseOffs = BaseOffset;
AM.HasBaseReg = HasBaseReg;
AM.Scale = Scale;
- return getTLI()->getScalingFactorCost(AM, Ty, AddrSpace);
+ return getTLI()->getScalingFactorCost(DL, AM, Ty, AddrSpace);
}
bool isTruncateFree(Type *Ty1, Type *Ty2) {
@@ -154,7 +148,7 @@ public:
}
bool isTypeLegal(Type *Ty) {
- EVT VT = getTLI()->getValueType(Ty);
+ EVT VT = getTLI()->getValueType(DL, Ty);
return getTLI()->isTypeLegal(VT);
}
@@ -192,7 +186,7 @@ public:
bool haveFastSqrt(Type *Ty) {
const TargetLoweringBase *TLI = getTLI();
- EVT VT = TLI->getValueType(Ty);
+ EVT VT = TLI->getValueType(DL, Ty);
return TLI->isTypeLegal(VT) &&
TLI->isOperationLegalOrCustom(ISD::FSQRT, VT);
}
@@ -282,7 +276,7 @@ public:
/// \name Vector TTI Implementations
/// @{
- unsigned getNumberOfRegisters(bool Vector) { return 1; }
+ unsigned getNumberOfRegisters(bool Vector) { return Vector ? 0 : 1; }
unsigned getRegisterBitWidth(bool Vector) { return 32; }
@@ -299,7 +293,7 @@ public:
int ISD = TLI->InstructionOpcodeToISD(Opcode);
assert(ISD && "Invalid opcode");
- std::pair<unsigned, MVT> LT = TLI->getTypeLegalizationCost(Ty);
+ std::pair<unsigned, MVT> LT = TLI->getTypeLegalizationCost(DL, Ty);
bool IsFloat = Ty->getScalarType()->isFloatingPointTy();
// Assume that floating point arithmetic operations cost twice as much as
@@ -349,9 +343,8 @@ public:
const TargetLoweringBase *TLI = getTLI();
int ISD = TLI->InstructionOpcodeToISD(Opcode);
assert(ISD && "Invalid opcode");
-
- std::pair<unsigned, MVT> SrcLT = TLI->getTypeLegalizationCost(Src);
- std::pair<unsigned, MVT> DstLT = TLI->getTypeLegalizationCost(Dst);
+ std::pair<unsigned, MVT> SrcLT = TLI->getTypeLegalizationCost(DL, Src);
+ std::pair<unsigned, MVT> DstLT = TLI->getTypeLegalizationCost(DL, Dst);
// Check for NOOP conversions.
if (SrcLT.first == DstLT.first &&
@@ -455,8 +448,7 @@ public:
if (CondTy->isVectorTy())
ISD = ISD::VSELECT;
}
-
- std::pair<unsigned, MVT> LT = TLI->getTypeLegalizationCost(ValTy);
+ std::pair<unsigned, MVT> LT = TLI->getTypeLegalizationCost(DL, ValTy);
if (!(ValTy->isVectorTy() && !LT.second.isVector()) &&
!TLI->isOperationExpand(ISD, LT.second)) {
@@ -485,7 +477,7 @@ public:
unsigned getVectorInstrCost(unsigned Opcode, Type *Val, unsigned Index) {
std::pair<unsigned, MVT> LT =
- getTLI()->getTypeLegalizationCost(Val->getScalarType());
+ getTLI()->getTypeLegalizationCost(DL, Val->getScalarType());
return LT.first;
}
@@ -493,7 +485,7 @@ public:
unsigned getMemoryOpCost(unsigned Opcode, Type *Src, unsigned Alignment,
unsigned AddressSpace) {
assert(!Src->isVoidTy() && "Invalid type");
- std::pair<unsigned, MVT> LT = getTLI()->getTypeLegalizationCost(Src);
+ std::pair<unsigned, MVT> LT = getTLI()->getTypeLegalizationCost(DL, Src);
// Assuming that all loads of legal types cost 1.
unsigned Cost = LT.first;
@@ -504,7 +496,7 @@ public:
// itself. Unless the corresponding extending load or truncating store is
// legal, then this will scalarize.
TargetLowering::LegalizeAction LA = TargetLowering::Expand;
- EVT MemVT = getTLI()->getValueType(Src, true);
+ EVT MemVT = getTLI()->getValueType(DL, Src, true);
if (MemVT.isSimple() && MemVT != MVT::Other) {
if (Opcode == Instruction::Store)
LA = getTLI()->getTruncStoreAction(LT.second, MemVT.getSimpleVT());
@@ -700,7 +692,7 @@ public:
}
const TargetLoweringBase *TLI = getTLI();
- std::pair<unsigned, MVT> LT = TLI->getTypeLegalizationCost(RetTy);
+ std::pair<unsigned, MVT> LT = TLI->getTypeLegalizationCost(DL, RetTy);
if (TLI->isOperationLegalOrPromote(ISD, LT.second)) {
// The operation is legal. Assume it costs 1.
@@ -771,7 +763,7 @@ public:
}
unsigned getNumberOfParts(Type *Tp) {
- std::pair<unsigned, MVT> LT = getTLI()->getTypeLegalizationCost(Tp);
+ std::pair<unsigned, MVT> LT = getTLI()->getTypeLegalizationCost(DL, Tp);
return LT.first;
}
@@ -816,18 +808,6 @@ public:
BasicTTIImpl(BasicTTIImpl &&Arg)
: BaseT(std::move(static_cast<BaseT &>(Arg))), ST(std::move(Arg.ST)),
TLI(std::move(Arg.TLI)) {}
- BasicTTIImpl &operator=(const BasicTTIImpl &RHS) {
- BaseT::operator=(static_cast<const BaseT &>(RHS));
- ST = RHS.ST;
- TLI = RHS.TLI;
- return *this;
- }
- BasicTTIImpl &operator=(BasicTTIImpl &&RHS) {
- BaseT::operator=(std::move(static_cast<BaseT &>(RHS)));
- ST = std::move(RHS.ST);
- TLI = std::move(RHS.TLI);
- return *this;
- }
};
}
diff --git a/include/llvm/CodeGen/CommandFlags.h b/include/llvm/CodeGen/CommandFlags.h
index 554511d6f4ab..4b2e0b06584c 100644
--- a/include/llvm/CodeGen/CommandFlags.h
+++ b/include/llvm/CodeGen/CommandFlags.h
@@ -206,6 +206,10 @@ cl::opt<std::string> StartAfter("start-after",
cl::value_desc("pass-name"),
cl::init(""));
+cl::opt<std::string>
+ RunPass("run-pass", cl::desc("Run compiler only for one specific pass"),
+ cl::value_desc("pass-name"), cl::init(""));
+
cl::opt<bool> DataSections("data-sections",
cl::desc("Emit data into separate sections"),
cl::init(false));
diff --git a/include/llvm/CodeGen/ISDOpcodes.h b/include/llvm/CodeGen/ISDOpcodes.h
index c7237fd55b27..fa44301a2d4a 100644
--- a/include/llvm/CodeGen/ISDOpcodes.h
+++ b/include/llvm/CodeGen/ISDOpcodes.h
@@ -72,10 +72,13 @@ namespace ISD {
/// the parent's frame or return address, and so on.
FRAMEADDR, RETURNADDR,
- /// FRAME_ALLOC_RECOVER - Represents the llvm.framerecover
- /// intrinsic. Materializes the offset from the frame pointer of another
- /// function to the result of llvm.frameallocate.
- FRAME_ALLOC_RECOVER,
+ /// LOCAL_RECOVER - Represents the llvm.localrecover intrinsic.
+ /// Materializes the offset from the local object pointer of another
+ /// function to a particular local object passed to llvm.localescape. The
+ /// operand is the MCSymbol label used to represent this offset, since
+ /// typically the offset is not known until after code generation of the
+ /// parent.
+ LOCAL_RECOVER,
/// READ_REGISTER, WRITE_REGISTER - This node represents llvm.register on
/// the DAG, which implements the named register global variables extension.
@@ -725,7 +728,7 @@ namespace ISD {
/// which do not reference a specific memory location should be less than
/// this value. Those that do must not be less than this value, and can
/// be used with SelectionDAG::getMemIntrinsicNode.
- static const int FIRST_TARGET_MEMORY_OPCODE = BUILTIN_OP_END+200;
+ static const int FIRST_TARGET_MEMORY_OPCODE = BUILTIN_OP_END+300;
//===--------------------------------------------------------------------===//
/// MemIndexedMode enum - This enum defines the load / store indexed
diff --git a/include/llvm/CodeGen/LiveIntervalUnion.h b/include/llvm/CodeGen/LiveIntervalUnion.h
index 967f0cbac719..f0f1637dc92d 100644
--- a/include/llvm/CodeGen/LiveIntervalUnion.h
+++ b/include/llvm/CodeGen/LiveIntervalUnion.h
@@ -203,6 +203,11 @@ public:
assert(idx < Size && "idx out of bounds");
return LIUs[idx];
}
+
+ const LiveIntervalUnion& operator[](unsigned Idx) const {
+ assert(Idx < Size && "Idx out of bounds");
+ return LIUs[Idx];
+ }
};
};
diff --git a/include/llvm/CodeGen/LiveRegMatrix.h b/include/llvm/CodeGen/LiveRegMatrix.h
index 878b4d9836f2..e169058ca563 100644
--- a/include/llvm/CodeGen/LiveRegMatrix.h
+++ b/include/llvm/CodeGen/LiveRegMatrix.h
@@ -32,13 +32,11 @@ namespace llvm {
class LiveInterval;
class LiveIntervalAnalysis;
-class MachineRegisterInfo;
class TargetRegisterInfo;
class VirtRegMap;
class LiveRegMatrix : public MachineFunctionPass {
const TargetRegisterInfo *TRI;
- MachineRegisterInfo *MRI;
LiveIntervals *LIS;
VirtRegMap *VRM;
@@ -114,6 +112,9 @@ public:
/// the assignment and updates VirtRegMap accordingly.
void unassign(LiveInterval &VirtReg);
+ /// Returns true if the given \p PhysReg has any live intervals assigned.
+ bool isPhysRegUsed(unsigned PhysReg) const;
+
//===--------------------------------------------------------------------===//
// Low-level interface.
//===--------------------------------------------------------------------===//
diff --git a/include/llvm/CodeGen/MIRYamlMapping.h b/include/llvm/CodeGen/MIRYamlMapping.h
index a6ffeb382978..9798e5cef645 100644
--- a/include/llvm/CodeGen/MIRYamlMapping.h
+++ b/include/llvm/CodeGen/MIRYamlMapping.h
@@ -81,15 +81,30 @@ LLVM_YAML_IS_FLOW_SEQUENCE_VECTOR(llvm::yaml::FlowStringValue)
namespace llvm {
namespace yaml {
+struct VirtualRegisterDefinition {
+ unsigned ID;
+ StringValue Class;
+ // TODO: Serialize the virtual register hints.
+};
+
+template <> struct MappingTraits<VirtualRegisterDefinition> {
+ static void mapping(IO &YamlIO, VirtualRegisterDefinition &Reg) {
+ YamlIO.mapRequired("id", Reg.ID);
+ YamlIO.mapRequired("class", Reg.Class);
+ }
+
+ static const bool flow = true;
+};
+
struct MachineBasicBlock {
unsigned ID;
- std::string Name;
+ StringValue Name;
unsigned Alignment = 0;
bool IsLandingPad = false;
bool AddressTaken = false;
- // TODO: Serialize the successor weights and liveins.
+ // TODO: Serialize the successor weights.
std::vector<FlowStringValue> Successors;
-
+ std::vector<FlowStringValue> LiveIns;
std::vector<StringValue> Instructions;
};
@@ -97,23 +112,153 @@ template <> struct MappingTraits<MachineBasicBlock> {
static void mapping(IO &YamlIO, MachineBasicBlock &MBB) {
YamlIO.mapRequired("id", MBB.ID);
YamlIO.mapOptional("name", MBB.Name,
- std::string()); // Don't print out an empty name.
+ StringValue()); // Don't print out an empty name.
YamlIO.mapOptional("alignment", MBB.Alignment);
YamlIO.mapOptional("isLandingPad", MBB.IsLandingPad);
YamlIO.mapOptional("addressTaken", MBB.AddressTaken);
YamlIO.mapOptional("successors", MBB.Successors);
+ YamlIO.mapOptional("liveins", MBB.LiveIns);
YamlIO.mapOptional("instructions", MBB.Instructions);
}
};
+/// Serializable representation of a stack object from the MachineFrameInfo class.
+///
+/// The flags 'isImmutable' and 'isAliased' aren't serialized, as they are
+/// determined by the object's type and frame information flags.
+/// Dead stack objects aren't serialized.
+///
+/// TODO: Determine isPreallocated flag by mapping between objects and local
+/// objects (Serialize local objects).
+struct MachineStackObject {
+ enum ObjectType { DefaultType, SpillSlot, VariableSized };
+ // TODO: Serialize LLVM alloca reference.
+ unsigned ID;
+ ObjectType Type = DefaultType;
+ int64_t Offset = 0;
+ uint64_t Size = 0;
+ unsigned Alignment = 0;
+};
+
+template <> struct ScalarEnumerationTraits<MachineStackObject::ObjectType> {
+ static void enumeration(yaml::IO &IO, MachineStackObject::ObjectType &Type) {
+ IO.enumCase(Type, "default", MachineStackObject::DefaultType);
+ IO.enumCase(Type, "spill-slot", MachineStackObject::SpillSlot);
+ IO.enumCase(Type, "variable-sized", MachineStackObject::VariableSized);
+ }
+};
+
+template <> struct MappingTraits<MachineStackObject> {
+ static void mapping(yaml::IO &YamlIO, MachineStackObject &Object) {
+ YamlIO.mapRequired("id", Object.ID);
+ YamlIO.mapOptional(
+ "type", Object.Type,
+ MachineStackObject::DefaultType); // Don't print the default type.
+ YamlIO.mapOptional("offset", Object.Offset);
+ if (Object.Type != MachineStackObject::VariableSized)
+ YamlIO.mapRequired("size", Object.Size);
+ YamlIO.mapOptional("alignment", Object.Alignment);
+ }
+
+ static const bool flow = true;
+};
+
+/// Serializable representation of the fixed stack object from the
+/// MachineFrameInfo class.
+struct FixedMachineStackObject {
+ enum ObjectType { DefaultType, SpillSlot };
+ unsigned ID;
+ ObjectType Type = DefaultType;
+ int64_t Offset = 0;
+ uint64_t Size = 0;
+ unsigned Alignment = 0;
+ bool IsImmutable = false;
+ bool IsAliased = false;
+};
+
+template <>
+struct ScalarEnumerationTraits<FixedMachineStackObject::ObjectType> {
+ static void enumeration(yaml::IO &IO,
+ FixedMachineStackObject::ObjectType &Type) {
+ IO.enumCase(Type, "default", FixedMachineStackObject::DefaultType);
+ IO.enumCase(Type, "spill-slot", FixedMachineStackObject::SpillSlot);
+ }
+};
+
+template <> struct MappingTraits<FixedMachineStackObject> {
+ static void mapping(yaml::IO &YamlIO, FixedMachineStackObject &Object) {
+ YamlIO.mapRequired("id", Object.ID);
+ YamlIO.mapOptional(
+ "type", Object.Type,
+ FixedMachineStackObject::DefaultType); // Don't print the default type.
+ YamlIO.mapOptional("offset", Object.Offset);
+ YamlIO.mapOptional("size", Object.Size);
+ YamlIO.mapOptional("alignment", Object.Alignment);
+ if (Object.Type != FixedMachineStackObject::SpillSlot) {
+ YamlIO.mapOptional("isImmutable", Object.IsImmutable);
+ YamlIO.mapOptional("isAliased", Object.IsAliased);
+ }
+ }
+
+ static const bool flow = true;
+};
+
} // end namespace yaml
} // end namespace llvm
+LLVM_YAML_IS_SEQUENCE_VECTOR(llvm::yaml::VirtualRegisterDefinition)
LLVM_YAML_IS_SEQUENCE_VECTOR(llvm::yaml::MachineBasicBlock)
+LLVM_YAML_IS_SEQUENCE_VECTOR(llvm::yaml::MachineStackObject)
+LLVM_YAML_IS_SEQUENCE_VECTOR(llvm::yaml::FixedMachineStackObject)
namespace llvm {
namespace yaml {
+/// Serializable representation of MachineFrameInfo.
+///
+/// Doesn't serialize attributes like 'StackAlignment', 'IsStackRealignable' and
+/// 'RealignOption' as they are determined by the target and LLVM function
+/// attributes.
+/// It also doesn't serialize attributes like 'NumFixedObject' and
+/// 'HasVarSizedObjects' as they are determined by the frame objects themselves.
+struct MachineFrameInfo {
+ bool IsFrameAddressTaken = false;
+ bool IsReturnAddressTaken = false;
+ bool HasStackMap = false;
+ bool HasPatchPoint = false;
+ uint64_t StackSize = 0;
+ int OffsetAdjustment = 0;
+ unsigned MaxAlignment = 0;
+ bool AdjustsStack = false;
+ bool HasCalls = false;
+ // TODO: Serialize StackProtectorIdx and FunctionContextIdx
+ unsigned MaxCallFrameSize = 0;
+ // TODO: Serialize callee saved info.
+ // TODO: Serialize local frame objects.
+ bool HasOpaqueSPAdjustment = false;
+ bool HasVAStart = false;
+ bool HasMustTailInVarArgFunc = false;
+ // TODO: Serialize save and restore MBB references.
+};
+
+template <> struct MappingTraits<MachineFrameInfo> {
+ static void mapping(IO &YamlIO, MachineFrameInfo &MFI) {
+ YamlIO.mapOptional("isFrameAddressTaken", MFI.IsFrameAddressTaken);
+ YamlIO.mapOptional("isReturnAddressTaken", MFI.IsReturnAddressTaken);
+ YamlIO.mapOptional("hasStackMap", MFI.HasStackMap);
+ YamlIO.mapOptional("hasPatchPoint", MFI.HasPatchPoint);
+ YamlIO.mapOptional("stackSize", MFI.StackSize);
+ YamlIO.mapOptional("offsetAdjustment", MFI.OffsetAdjustment);
+ YamlIO.mapOptional("maxAlignment", MFI.MaxAlignment);
+ YamlIO.mapOptional("adjustsStack", MFI.AdjustsStack);
+ YamlIO.mapOptional("hasCalls", MFI.HasCalls);
+ YamlIO.mapOptional("maxCallFrameSize", MFI.MaxCallFrameSize);
+ YamlIO.mapOptional("hasOpaqueSPAdjustment", MFI.HasOpaqueSPAdjustment);
+ YamlIO.mapOptional("hasVAStart", MFI.HasVAStart);
+ YamlIO.mapOptional("hasMustTailInVarArgFunc", MFI.HasMustTailInVarArgFunc);
+ }
+};
+
struct MachineFunction {
StringRef Name;
unsigned Alignment = 0;
@@ -123,9 +268,13 @@ struct MachineFunction {
bool IsSSA = false;
bool TracksRegLiveness = false;
bool TracksSubRegLiveness = false;
- // TODO: Serialize virtual register definitions.
+ std::vector<VirtualRegisterDefinition> VirtualRegisters;
// TODO: Serialize the various register masks.
// TODO: Serialize live in registers.
+ // Frame information
+ MachineFrameInfo FrameInfo;
+ std::vector<FixedMachineStackObject> FixedStackObjects;
+ std::vector<MachineStackObject> StackObjects;
std::vector<MachineBasicBlock> BasicBlocks;
};
@@ -139,6 +288,10 @@ template <> struct MappingTraits<MachineFunction> {
YamlIO.mapOptional("isSSA", MF.IsSSA);
YamlIO.mapOptional("tracksRegLiveness", MF.TracksRegLiveness);
YamlIO.mapOptional("tracksSubRegLiveness", MF.TracksSubRegLiveness);
+ YamlIO.mapOptional("registers", MF.VirtualRegisters);
+ YamlIO.mapOptional("frameInfo", MF.FrameInfo);
+ YamlIO.mapOptional("fixedStack", MF.FixedStackObjects);
+ YamlIO.mapOptional("stack", MF.StackObjects);
YamlIO.mapOptional("body", MF.BasicBlocks);
}
};
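
The new structs plug into LLVM's YAML I/O through the MappingTraits specializations above, so MIR printing and parsing reduce to streaming them through yaml::Output / yaml::Input. A minimal sketch (yaml::Output is existing API; the field values are made up):

    // Hedged sketch: serialize one of the new records as a YAML document.
    #include "llvm/CodeGen/MIRYamlMapping.h"
    #include "llvm/Support/YAMLTraits.h"
    #include "llvm/Support/raw_ostream.h"

    void dumpFrameInfo() {
      llvm::yaml::MachineFrameInfo MFI;
      MFI.HasCalls = true;
      MFI.StackSize = 32;
      llvm::yaml::Output Out(llvm::outs());
      Out << MFI;  // emits the keys mapped in MappingTraits<MachineFrameInfo>
    }
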
diff --git a/include/llvm/CodeGen/MachineConstantPool.h b/include/llvm/CodeGen/MachineConstantPool.h
index c619afb83333..628400322f60 100644
--- a/include/llvm/CodeGen/MachineConstantPool.h
+++ b/include/llvm/CodeGen/MachineConstantPool.h
@@ -135,17 +135,18 @@ public:
/// address of the function constant pool values.
/// @brief The machine constant pool.
class MachineConstantPool {
- const TargetMachine &TM; ///< The target machine.
unsigned PoolAlignment; ///< The alignment for the pool.
std::vector<MachineConstantPoolEntry> Constants; ///< The pool of constants.
/// MachineConstantPoolValues that use an existing MachineConstantPoolEntry.
DenseSet<MachineConstantPoolValue*> MachineCPVsSharingEntries;
+ const DataLayout &DL;
+
+ const DataLayout &getDataLayout() const { return DL; }
- const DataLayout *getDataLayout() const;
public:
/// @brief The only constructor.
- explicit MachineConstantPool(const TargetMachine &TM)
- : TM(TM), PoolAlignment(1) {}
+ explicit MachineConstantPool(const DataLayout &DL)
+ : PoolAlignment(1), DL(DL) {}
~MachineConstantPool();
/// getConstantPoolAlignment - Return the alignment required by
diff --git a/include/llvm/CodeGen/MachineDominators.h b/include/llvm/CodeGen/MachineDominators.h
index 4428fa618fb0..735dd069cf7f 100644
--- a/include/llvm/CodeGen/MachineDominators.h
+++ b/include/llvm/CodeGen/MachineDominators.h
@@ -29,8 +29,8 @@ inline void DominatorTreeBase<MachineBasicBlock>::addRoot(MachineBasicBlock* MBB
this->Roots.push_back(MBB);
}
-EXTERN_TEMPLATE_INSTANTIATION(class DomTreeNodeBase<MachineBasicBlock>);
-EXTERN_TEMPLATE_INSTANTIATION(class DominatorTreeBase<MachineBasicBlock>);
+extern template class DomTreeNodeBase<MachineBasicBlock>;
+extern template class DominatorTreeBase<MachineBasicBlock>;
typedef DomTreeNodeBase<MachineBasicBlock> MachineDomTreeNode;
diff --git a/include/llvm/CodeGen/MachineFrameInfo.h b/include/llvm/CodeGen/MachineFrameInfo.h
index 0f5a4b1b09ec..cbc4e66ccc46 100644
--- a/include/llvm/CodeGen/MachineFrameInfo.h
+++ b/include/llvm/CodeGen/MachineFrameInfo.h
@@ -229,9 +229,9 @@ class MachineFrameInfo {
/// Whether the "realign-stack" option is on.
bool RealignOption;
- /// True if the function includes inline assembly that adjusts the stack
- /// pointer.
- bool HasInlineAsmWithSPAdjust;
+ /// True if the function dynamically adjusts the stack pointer through some
+ /// opaque mechanism like inline assembly or Win32 EH.
+ bool HasOpaqueSPAdjustment;
/// True if the function contains a call to the llvm.vastart intrinsic.
bool HasVAStart;
@@ -269,7 +269,7 @@ public:
LocalFrameSize = 0;
LocalFrameMaxAlign = 0;
UseLocalStackAllocationBlock = false;
- HasInlineAsmWithSPAdjust = false;
+ HasOpaqueSPAdjustment = false;
HasVAStart = false;
HasMustTailInVarArgFunc = false;
Save = nullptr;
@@ -468,9 +468,9 @@ public:
bool hasCalls() const { return HasCalls; }
void setHasCalls(bool V) { HasCalls = V; }
- /// Returns true if the function contains any stack-adjusting inline assembly.
- bool hasInlineAsmWithSPAdjust() const { return HasInlineAsmWithSPAdjust; }
- void setHasInlineAsmWithSPAdjust(bool B) { HasInlineAsmWithSPAdjust = B; }
+ /// Returns true if the function contains opaque dynamic stack adjustments.
+ bool hasOpaqueSPAdjustment() const { return HasOpaqueSPAdjustment; }
+ void setHasOpaqueSPAdjustment(bool B) { HasOpaqueSPAdjustment = B; }
/// Returns true if the function calls the llvm.va_start intrinsic.
bool hasVAStart() const { return HasVAStart; }
@@ -541,6 +541,14 @@ public:
return Objects[ObjectIdx+NumFixedObjects].Size == ~0ULL;
}
+ /// Returns true if the specified index corresponds to a variable sized
+ /// object.
+ bool isVariableSizedObjectIndex(int ObjectIdx) const {
+ assert(unsigned(ObjectIdx + NumFixedObjects) < Objects.size() &&
+ "Invalid Object Idx!");
+ return Objects[ObjectIdx + NumFixedObjects].Size == 0;
+ }
+
/// Create a new statically sized stack object, returning
/// a nonnegative identifier to represent it.
int CreateStackObject(uint64_t Size, unsigned Alignment, bool isSS,
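
The new predicate above identifies variable-sized stack objects by their recorded size of zero, complementing the existing dead-object check (~0ULL). A minimal illustration, assuming MFI is the function's MachineFrameInfo and that CreateVariableSizedObject keeps its existing signature:

    // Hedged sketch: variable-sized objects are created with Size == 0, which is
    // exactly what isVariableSizedObjectIndex() tests.
    int FI = MFI.CreateVariableSizedObject(/*Alignment=*/16, /*Alloca=*/nullptr);
    assert(MFI.isVariableSizedObjectIndex(FI));
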
diff --git a/include/llvm/CodeGen/MachineFunction.h b/include/llvm/CodeGen/MachineFunction.h
index 94610cabf566..c15ee1c006cd 100644
--- a/include/llvm/CodeGen/MachineFunction.h
+++ b/include/llvm/CodeGen/MachineFunction.h
@@ -155,6 +155,9 @@ public:
MachineModuleInfo &getMMI() const { return MMI; }
MCContext &getContext() const { return Ctx; }
+ /// Return the DataLayout attached to the Module associated with this MF.
+ const DataLayout &getDataLayout() const;
+
/// getFunction - Return the LLVM function that this machine code represents
///
const Function *getFunction() const { return Fn; }
diff --git a/include/llvm/CodeGen/MachineLoopInfo.h b/include/llvm/CodeGen/MachineLoopInfo.h
index 438ef2e37255..4868b7363f82 100644
--- a/include/llvm/CodeGen/MachineLoopInfo.h
+++ b/include/llvm/CodeGen/MachineLoopInfo.h
@@ -37,10 +37,8 @@
namespace llvm {
// Implementation in LoopInfoImpl.h
-#ifdef __GNUC__
class MachineLoop;
-__extension__ extern template class LoopBase<MachineBasicBlock, MachineLoop>;
-#endif
+extern template class LoopBase<MachineBasicBlock, MachineLoop>;
class MachineLoop : public LoopBase<MachineBasicBlock, MachineLoop> {
public:
@@ -65,10 +63,7 @@ private:
};
// Implementation in LoopInfoImpl.h
-#ifdef __GNUC__
-__extension__ extern template
-class LoopInfoBase<MachineBasicBlock, MachineLoop>;
-#endif
+extern template class LoopInfoBase<MachineBasicBlock, MachineLoop>;
class MachineLoopInfo : public MachineFunctionPass {
LoopInfoBase<MachineBasicBlock, MachineLoop> LI;
diff --git a/include/llvm/CodeGen/MachineModuleInfo.h b/include/llvm/CodeGen/MachineModuleInfo.h
index ccaa83a238a6..4cdfe2463c99 100644
--- a/include/llvm/CodeGen/MachineModuleInfo.h
+++ b/include/llvm/CodeGen/MachineModuleInfo.h
@@ -320,6 +320,7 @@ public:
/// information.
void addPersonality(MachineBasicBlock *LandingPad,
const Function *Personality);
+ void addPersonality(const Function *Personality);
void addWinEHState(MachineBasicBlock *LandingPad, int State);
diff --git a/include/llvm/CodeGen/MachineRegionInfo.h b/include/llvm/CodeGen/MachineRegionInfo.h
index cf49c297c288..df9823f741dc 100644
--- a/include/llvm/CodeGen/MachineRegionInfo.h
+++ b/include/llvm/CodeGen/MachineRegionInfo.h
@@ -172,10 +172,9 @@ template <> struct GraphTraits<MachineRegionInfoPass*>
}
};
-EXTERN_TEMPLATE_INSTANTIATION(class RegionBase<RegionTraits<MachineFunction>>);
-EXTERN_TEMPLATE_INSTANTIATION(class RegionNodeBase<RegionTraits<MachineFunction>>);
-EXTERN_TEMPLATE_INSTANTIATION(class RegionInfoBase<RegionTraits<MachineFunction>>);
-
+extern template class RegionBase<RegionTraits<MachineFunction>>;
+extern template class RegionNodeBase<RegionTraits<MachineFunction>>;
+extern template class RegionInfoBase<RegionTraits<MachineFunction>>;
}
#endif
diff --git a/include/llvm/CodeGen/MachineRegisterInfo.h b/include/llvm/CodeGen/MachineRegisterInfo.h
index e5b837aeea28..67583be616c3 100644
--- a/include/llvm/CodeGen/MachineRegisterInfo.h
+++ b/include/llvm/CodeGen/MachineRegisterInfo.h
@@ -95,20 +95,8 @@ private:
return MO->Contents.Reg.Next;
}
- /// UsedRegUnits - This is a bit vector that is computed and set by the
- /// register allocator, and must be kept up to date by passes that run after
- /// register allocation (though most don't modify this). This is used
- /// so that the code generator knows which callee save registers to save and
- /// for other target specific uses.
- /// This vector has bits set for register units that are modified in the
- /// current function. It doesn't include registers clobbered by function
- /// calls with register mask operands.
- BitVector UsedRegUnits;
-
/// UsedPhysRegMask - Additional used physregs including aliases.
/// This bit vector represents all the registers clobbered by function calls.
- /// It can model things that UsedRegUnits can't, such as function calls that
- /// clobber ymm7 but preserve the low half in xmm7.
BitVector UsedPhysRegMask;
/// ReservedRegs - This is a bit vector of reserved registers. The target
@@ -647,40 +635,11 @@ public:
/// deleted during LiveDebugVariables analysis.
void markUsesInDebugValueAsUndef(unsigned Reg) const;
- //===--------------------------------------------------------------------===//
- // Physical Register Use Info
- //===--------------------------------------------------------------------===//
-
- /// isPhysRegUsed - Return true if the specified register is used in this
- /// function. Also check for clobbered aliases and registers clobbered by
- /// function calls with register mask operands.
- ///
- /// This only works after register allocation. It is primarily used by
- /// PrologEpilogInserter to determine which callee-saved registers need
- /// spilling.
- bool isPhysRegUsed(unsigned Reg) const {
- if (UsedPhysRegMask.test(Reg))
- return true;
- for (MCRegUnitIterator Units(Reg, getTargetRegisterInfo());
- Units.isValid(); ++Units)
- if (UsedRegUnits.test(*Units))
- return true;
- return false;
- }
-
- /// Mark the specified register unit as used in this function.
- /// This should only be called during and after register allocation.
- void setRegUnitUsed(unsigned RegUnit) {
- UsedRegUnits.set(RegUnit);
- }
-
- /// setPhysRegUsed - Mark the specified register used in this function.
- /// This should only be called during and after register allocation.
- void setPhysRegUsed(unsigned Reg) {
- for (MCRegUnitIterator Units(Reg, getTargetRegisterInfo());
- Units.isValid(); ++Units)
- UsedRegUnits.set(*Units);
- }
+ /// Return true if the specified register is modified in this function.
+ /// This checks that no defining machine operands exist for the register or
+ /// any of its aliases. Definitions found on functions marked noreturn are
+ /// ignored.
+ bool isPhysRegModified(unsigned PhysReg) const;
/// addPhysRegsUsedFromRegMask - Mark any registers not in RegMask as used.
/// This corresponds to the bit mask attached to register mask operands.
@@ -688,16 +647,6 @@ public:
UsedPhysRegMask.setBitsNotInMask(RegMask);
}
- /// setPhysRegUnused - Mark the specified register unused in this function.
- /// This should only be called during and after register allocation.
- void setPhysRegUnused(unsigned Reg) {
- UsedPhysRegMask.reset(Reg);
- for (MCRegUnitIterator Units(Reg, getTargetRegisterInfo());
- Units.isValid(); ++Units)
- UsedRegUnits.reset(*Units);
- }
-
-
//===--------------------------------------------------------------------===//
// Reserved Register Info
//===--------------------------------------------------------------------===//
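
The old allocator-maintained UsedRegUnits bookkeeping is gone; callers now either ask LiveRegMatrix::isPhysRegUsed (added earlier in this patch) or the new isPhysRegModified query, which derives its answer from the def lists themselves. A hedged sketch of what such a query has to inspect, ignoring the noreturn-function exception mentioned in the comment:

    // Hedged sketch, not the actual implementation: a register counts as
    // modified if a def exists for it or for any overlapping register.
    bool isPhysRegModifiedSketch(const llvm::MachineRegisterInfo &MRI,
                                 const llvm::TargetRegisterInfo &TRI,
                                 unsigned PhysReg) {
      for (llvm::MCRegAliasIterator AI(PhysReg, &TRI, /*IncludeSelf=*/true);
           AI.isValid(); ++AI)
        if (!MRI.def_empty(*AI))
          return true;
      return false;
    }
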
diff --git a/include/llvm/CodeGen/Passes.h b/include/llvm/CodeGen/Passes.h
index 538c995a7b44..5d8292174476 100644
--- a/include/llvm/CodeGen/Passes.h
+++ b/include/llvm/CodeGen/Passes.h
@@ -101,7 +101,7 @@ public:
private:
PassManagerBase *PM;
- AnalysisID StartAfter;
+ AnalysisID StartBefore, StartAfter;
AnalysisID StopAfter;
bool Started;
bool Stopped;
@@ -142,16 +142,24 @@ public:
CodeGenOpt::Level getOptLevel() const { return TM->getOptLevel(); }
- /// setStartStopPasses - Set the StartAfter and StopAfter passes to allow
- /// running only a portion of the normal code-gen pass sequence. If the
- /// Start pass ID is zero, then compilation will begin at the normal point;
- /// otherwise, clear the Started flag to indicate that passes should not be
- /// added until the starting pass is seen. If the Stop pass ID is zero,
- /// then compilation will continue to the end.
- void setStartStopPasses(AnalysisID Start, AnalysisID Stop) {
- StartAfter = Start;
- StopAfter = Stop;
- Started = (StartAfter == nullptr);
+ /// Set the StartAfter, StartBefore and StopAfter passes to allow running only
+ /// a portion of the normal code-gen pass sequence.
+ ///
+ /// If the StartAfter and StartBefore pass IDs are zero, then compilation will
+ /// begin at the normal point; otherwise, clear the Started flag to indicate
+ /// that passes should not be added until the starting pass is seen. If the
+ /// Stop pass ID is zero, then compilation will continue to the end.
+ ///
+ /// This function expects that at least one of the StartAfter or the
+ /// StartBefore pass IDs is null.
+ void setStartStopPasses(AnalysisID StartBefore, AnalysisID StartAfter,
+ AnalysisID StopAfter) {
+ if (StartAfter)
+ assert(!StartBefore && "Start after and start before passes are given");
+ this->StartBefore = StartBefore;
+ this->StartAfter = StartAfter;
+ this->StopAfter = StopAfter;
+ Started = (StartAfter == nullptr) && (StartBefore == nullptr);
}
void setDisableVerify(bool Disable) { setOpt(DisableVerify, Disable); }
@@ -597,7 +605,7 @@ namespace llvm {
/// createSjLjEHPreparePass - This pass adapts exception handling code to use
/// the GCC-style builtin setjmp/longjmp (sjlj) to handling EH control flow.
///
- FunctionPass *createSjLjEHPreparePass(const TargetMachine *TM);
+ FunctionPass *createSjLjEHPreparePass();
/// LocalStackSlotAllocation - This pass assigns local frame indices to stack
/// slots relative to one another and allocates base registers to access them
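
Together with the -run-pass option added to CommandFlags.h above, the extra StartBefore parameter makes it possible to run exactly one pass of the pipeline: start immediately before it and stop right after it. A hedged sketch of how a driver might wire that up (the helper is hypothetical; only the setStartStopPasses signature comes from the patch):

    // Hedged sketch: translate -run-pass=<name> into the new interface.
    void configureSinglePass(llvm::TargetPassConfig &PassConfig,
                             llvm::AnalysisID PassID) {
      PassConfig.setStartStopPasses(/*StartBefore=*/PassID,
                                    /*StartAfter=*/nullptr,
                                    /*StopAfter=*/PassID);
    }
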
diff --git a/include/llvm/CodeGen/RegisterPressure.h b/include/llvm/CodeGen/RegisterPressure.h
index fcb6feed68ca..9d8843d1d74a 100644
--- a/include/llvm/CodeGen/RegisterPressure.h
+++ b/include/llvm/CodeGen/RegisterPressure.h
@@ -135,6 +135,8 @@ public:
void addPressureChange(unsigned RegUnit, bool IsDec,
const MachineRegisterInfo *MRI);
+
+ LLVM_DUMP_METHOD void dump(const TargetRegisterInfo &TRI) const;
};
/// Array of PressureDiffs.
diff --git a/include/llvm/CodeGen/SelectionDAG.h b/include/llvm/CodeGen/SelectionDAG.h
index c2b1243ee26e..1ee92380e690 100644
--- a/include/llvm/CodeGen/SelectionDAG.h
+++ b/include/llvm/CodeGen/SelectionDAG.h
@@ -281,6 +281,7 @@ public:
void clear();
MachineFunction &getMachineFunction() const { return *MF; }
+ const DataLayout &getDataLayout() const { return MF->getDataLayout(); }
const TargetMachine &getTarget() const { return TM; }
const TargetSubtargetInfo &getSubtarget() const { return MF->getSubtarget(); }
const TargetLowering &getTargetLoweringInfo() const { return *TLI; }
@@ -322,6 +323,14 @@ public:
return AllNodes.size();
}
+ iterator_range<allnodes_iterator> allnodes() {
+ return iterator_range<allnodes_iterator>(allnodes_begin(), allnodes_end());
+ }
+ iterator_range<allnodes_const_iterator> allnodes() const {
+ return iterator_range<allnodes_const_iterator>(allnodes_begin(),
+ allnodes_end());
+ }
+
/// Return the root tag of the SelectionDAG.
const SDValue &getRoot() const { return Root; }
diff --git a/include/llvm/CodeGen/SelectionDAGNodes.h b/include/llvm/CodeGen/SelectionDAGNodes.h
index 619119096d20..4821d1aae9e5 100644
--- a/include/llvm/CodeGen/SelectionDAGNodes.h
+++ b/include/llvm/CodeGen/SelectionDAGNodes.h
@@ -140,7 +140,7 @@ public:
}
// Return true if this node is an operand of N.
- bool isOperandOf(SDNode *N) const;
+ bool isOperandOf(const SDNode *N) const;
/// Return the ValueType of the referenced return value.
inline EVT getValueType() const;
@@ -357,9 +357,6 @@ private:
/// The number of entries in the Operand/Value list.
unsigned short NumOperands, NumValues;
- /// Source line information.
- DebugLoc debugLoc;
-
// The ordering of the SDNodes. It roughly corresponds to the ordering of the
// original LLVM instructions.
// This is used for turning off scheduling, because we'll forgo
@@ -367,6 +364,9 @@ private:
// this ordering.
unsigned IROrder;
+ /// Source line information.
+ DebugLoc debugLoc;
+
/// Return a pointer to the specified value type.
static const EVT *getValueTypeList(EVT VT);
@@ -532,10 +532,10 @@ public:
bool hasAnyUseOfValue(unsigned Value) const;
/// Return true if this node is the only use of N.
- bool isOnlyUserOf(SDNode *N) const;
+ bool isOnlyUserOf(const SDNode *N) const;
/// Return true if this node is an operand of N.
- bool isOperandOf(SDNode *N) const;
+ bool isOperandOf(const SDNode *N) const;
/// Return true if this node is a predecessor of N.
/// NOTE: Implemented on top of hasPredecessor and every bit as
@@ -732,7 +732,7 @@ protected:
SubclassData(0), NodeId(-1),
OperandList(Ops.size() ? new SDUse[Ops.size()] : nullptr),
ValueList(VTs.VTs), UseList(nullptr), NumOperands(Ops.size()),
- NumValues(VTs.NumVTs), debugLoc(std::move(dl)), IROrder(Order) {
+ NumValues(VTs.NumVTs), IROrder(Order), debugLoc(std::move(dl)) {
assert(debugLoc.hasTrivialDestructor() && "Expected trivial destructor");
assert(NumOperands == Ops.size() &&
"NumOperands wasn't wide enough for its operands!");
@@ -752,7 +752,7 @@ protected:
: NodeType(Opc), OperandsNeedDelete(false), HasDebugValue(false),
SubclassData(0), NodeId(-1), OperandList(nullptr), ValueList(VTs.VTs),
UseList(nullptr), NumOperands(0), NumValues(VTs.NumVTs),
- debugLoc(std::move(dl)), IROrder(Order) {
+ IROrder(Order), debugLoc(std::move(dl)) {
assert(debugLoc.hasTrivialDestructor() && "Expected trivial destructor");
assert(NumValues == VTs.NumVTs &&
"NumValues wasn't wide enough for its operands!");
diff --git a/include/llvm/CodeGen/StackMaps.h b/include/llvm/CodeGen/StackMaps.h
index 46a773f74aac..fdc1a9143ed2 100644
--- a/include/llvm/CodeGen/StackMaps.h
+++ b/include/llvm/CodeGen/StackMaps.h
@@ -1,5 +1,4 @@
//===------------------- StackMaps.h - StackMaps ----------------*- C++ -*-===//
-
//
// The LLVM Compiler Infrastructure
//
@@ -42,10 +41,12 @@ class PatchPointOpers {
public:
/// Enumerate the meta operands.
enum { IDPos, NBytesPos, TargetPos, NArgPos, CCPos, MetaEnd };
+
private:
const MachineInstr *MI;
bool HasDef;
bool IsAnyReg;
+
public:
explicit PatchPointOpers(const MachineInstr *MI);
@@ -66,8 +67,8 @@ public:
/// Get the operand index of the variable list of non-argument operands.
/// These hold the "live state".
unsigned getVarIdx() const {
- return getMetaIdx() + MetaEnd
- + MI->getOperand(getMetaIdx(NArgPos)).getImm();
+ return getMetaIdx() + MetaEnd +
+ MI->getOperand(getMetaIdx(NArgPos)).getImm();
}
/// Get the index at which stack map locations will be recorded.
@@ -98,15 +99,10 @@ private:
// These values are relative offests from the start of the statepoint meta
// arguments (i.e. the end of the call arguments).
- enum {
- CCOffset = 1,
- FlagsOffset = 3,
- NumVMSArgsOffset = 5
- };
+ enum { CCOffset = 1, FlagsOffset = 3, NumVMSArgsOffset = 5 };
public:
- explicit StatepointOpers(const MachineInstr *MI):
- MI(MI) { }
+ explicit StatepointOpers(const MachineInstr *MI) : MI(MI) {}
/// Get starting index of non call related arguments
/// (calling convention, statepoint flags, vm state and gc state).
@@ -134,31 +130,32 @@ private:
class StackMaps {
public:
struct Location {
- enum LocationType { Unprocessed, Register, Direct, Indirect, Constant,
- ConstantIndex };
- LocationType LocType;
+ enum LocationType {
+ Unprocessed,
+ Register,
+ Direct,
+ Indirect,
+ Constant,
+ ConstantIndex
+ };
+ LocationType Type;
unsigned Size;
unsigned Reg;
int64_t Offset;
- Location() : LocType(Unprocessed), Size(0), Reg(0), Offset(0) {}
- Location(LocationType LocType, unsigned Size, unsigned Reg, int64_t Offset)
- : LocType(LocType), Size(Size), Reg(Reg), Offset(Offset) {}
+ Location() : Type(Unprocessed), Size(0), Reg(0), Offset(0) {}
+ Location(LocationType Type, unsigned Size, unsigned Reg, int64_t Offset)
+ : Type(Type), Size(Size), Reg(Reg), Offset(Offset) {}
};
struct LiveOutReg {
unsigned short Reg;
- unsigned short RegNo;
+ unsigned short DwarfRegNum;
unsigned short Size;
- LiveOutReg() : Reg(0), RegNo(0), Size(0) {}
- LiveOutReg(unsigned short Reg, unsigned short RegNo, unsigned short Size)
- : Reg(Reg), RegNo(RegNo), Size(Size) {}
-
- void MarkInvalid() { Reg = 0; }
-
- // Only sort by the dwarf register number.
- bool operator< (const LiveOutReg &LO) const { return RegNo < LO.RegNo; }
- static bool IsInvalid(const LiveOutReg &LO) { return LO.Reg == 0; }
+ LiveOutReg() : Reg(0), DwarfRegNum(0), Size(0) {}
+ LiveOutReg(unsigned short Reg, unsigned short DwarfRegNum,
+ unsigned short Size)
+ : Reg(Reg), DwarfRegNum(DwarfRegNum), Size(Size) {}
};
// OpTypes are used to encode information about the following logical
@@ -205,8 +202,8 @@ private:
CallsiteInfo() : CSOffsetExpr(nullptr), ID(0) {}
CallsiteInfo(const MCExpr *CSOffsetExpr, uint64_t ID,
LocationVec &&Locations, LiveOutVec &&LiveOuts)
- : CSOffsetExpr(CSOffsetExpr), ID(ID), Locations(std::move(Locations)),
- LiveOuts(std::move(LiveOuts)) {}
+ : CSOffsetExpr(CSOffsetExpr), ID(ID), Locations(std::move(Locations)),
+ LiveOuts(std::move(LiveOuts)) {}
};
typedef std::vector<CallsiteInfo> CallsiteInfoList;
@@ -218,8 +215,8 @@ private:
MachineInstr::const_mop_iterator
parseOperand(MachineInstr::const_mop_iterator MOI,
- MachineInstr::const_mop_iterator MOE,
- LocationVec &Locs, LiveOutVec &LiveOuts) const;
+ MachineInstr::const_mop_iterator MOE, LocationVec &Locs,
+ LiveOutVec &LiveOuts) const;
/// \brief Create a live-out register record for the given register @p Reg.
LiveOutReg createLiveOutReg(unsigned Reg,
@@ -254,7 +251,6 @@ private:
void print(raw_ostream &OS);
void debug() { print(dbgs()); }
};
-
}
#endif
diff --git a/include/llvm/CodeGen/WinEHFuncInfo.h b/include/llvm/CodeGen/WinEHFuncInfo.h
index 291f3905512c..75638a058a30 100644
--- a/include/llvm/CodeGen/WinEHFuncInfo.h
+++ b/include/llvm/CodeGen/WinEHFuncInfo.h
@@ -91,7 +91,7 @@ private:
// When the parseEHActions function is called to populate a vector of
// instances of this class, the ExceptionObjectVar field will be nullptr
// and the ExceptionObjectIndex will be the index of the exception object in
- // the parent function's frameescape block.
+ // the parent function's localescape block.
const Value *ExceptionObjectVar;
int ExceptionObjectIndex;
TinyPtrVector<BasicBlock *> ReturnTargets;
@@ -148,7 +148,7 @@ struct WinEHFuncInfo {
int UnwindHelpFrameOffset = -1;
unsigned NumIPToStateFuncsVisited = 0;
- /// frameescape index of the 32-bit EH registration node. Set by
+ /// localescape index of the 32-bit EH registration node. Set by
/// WinEHStatePass and used indirectly by SEH filter functions of the parent.
int EHRegNodeEscapeIndex = INT_MAX;
diff --git a/include/llvm/ExecutionEngine/ExecutionEngine.h b/include/llvm/ExecutionEngine/ExecutionEngine.h
index e8af601d83b6..821c0181ce83 100644
--- a/include/llvm/ExecutionEngine/ExecutionEngine.h
+++ b/include/llvm/ExecutionEngine/ExecutionEngine.h
@@ -31,6 +31,7 @@
#include <map>
#include <string>
#include <vector>
+#include <functional>
namespace llvm {
@@ -89,6 +90,8 @@ public:
uint64_t RemoveMapping(StringRef Name);
};
+using FunctionCreator = std::function<void *(const std::string &)>;
+
/// \brief Abstract interface for implementation execution of LLVM modules,
/// designed to support both interpreter and just-in-time (JIT) compiler
/// implementations.
@@ -147,7 +150,7 @@ protected:
/// LazyFunctionCreator - If an unknown function is needed, this function
/// pointer is invoked to create it. If this returns null, the JIT will
/// abort.
- void *(*LazyFunctionCreator)(const std::string &);
+ FunctionCreator LazyFunctionCreator;
/// getMangledName - Get mangled name.
std::string getMangledName(const GlobalValue *GV);
@@ -470,8 +473,8 @@ public:
/// InstallLazyFunctionCreator - If an unknown function is needed, the
/// specified function pointer is invoked to create it. If it returns null,
/// the JIT will abort.
- void InstallLazyFunctionCreator(void* (*P)(const std::string &)) {
- LazyFunctionCreator = P;
+ void InstallLazyFunctionCreator(FunctionCreator C) {
+ LazyFunctionCreator = C;
}
protected:
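
Replacing the raw function pointer with std::function means the lazy creator can now carry state, e.g. a captured symbol table; a capturing lambda could not be converted to the old pointer type. A sketch, assuming EE points at an ExecutionEngine built elsewhere and Externals is a hypothetical name-to-address map:

    // Hedged sketch: install a stateful lazy function creator.
    std::map<std::string, void *> Externals;
    EE->InstallLazyFunctionCreator([&Externals](const std::string &Name) -> void * {
      auto It = Externals.find(Name);
      return It == Externals.end() ? nullptr : It->second;
    });
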
diff --git a/include/llvm/ExecutionEngine/RuntimeDyld.h b/include/llvm/ExecutionEngine/RuntimeDyld.h
index 94c4038e8183..a808d9231167 100644
--- a/include/llvm/ExecutionEngine/RuntimeDyld.h
+++ b/include/llvm/ExecutionEngine/RuntimeDyld.h
@@ -153,6 +153,10 @@ public:
/// This method returns the address of the specified function or variable.
/// It is used to resolve symbols during module linking.
+ ///
+ /// If the returned symbol's address is equal to ~0ULL then RuntimeDyld will
+ /// skip all relocations for that symbol, and the client will be responsible
+ /// for handling them manually.
virtual SymbolInfo findSymbol(const std::string &Name) = 0;
/// This method returns the address of the specified symbol if it exists
diff --git a/include/llvm/IR/Attributes.h b/include/llvm/IR/Attributes.h
index 366bf709ab16..4d6d7da1fa5b 100644
--- a/include/llvm/IR/Attributes.h
+++ b/include/llvm/IR/Attributes.h
@@ -98,6 +98,8 @@ public:
OptimizeNone, ///< Function must not be optimized.
ReadNone, ///< Function does not access memory
ReadOnly, ///< Function only reads from memory
+ ArgMemOnly, ///< Function can access memory only using pointers
+ ///< based on its arguments.
Returned, ///< Return value is always equal to this argument
ReturnsTwice, ///< Function can return twice
SExt, ///< Sign extended before/after call
diff --git a/include/llvm/IR/CallSite.h b/include/llvm/IR/CallSite.h
index dd2903e807e1..2841781e8a9e 100644
--- a/include/llvm/IR/CallSite.h
+++ b/include/llvm/IR/CallSite.h
@@ -290,6 +290,15 @@ public:
CALLSITE_DELEGATE_SETTER(setOnlyReadsMemory());
}
+ /// @brief Determine if the call can access memory only using pointers based
+ /// on its arguments.
+ bool onlyAccessesArgMemory() const {
+ CALLSITE_DELEGATE_GETTER(onlyAccessesArgMemory());
+ }
+ void setOnlyAccessesArgMemory() {
+ CALLSITE_DELEGATE_SETTER(setOnlyAccessesArgMemory());
+ }
+
/// @brief Determine if the call cannot return.
bool doesNotReturn() const {
CALLSITE_DELEGATE_GETTER(doesNotReturn());
diff --git a/include/llvm/IR/DIBuilder.h b/include/llvm/IR/DIBuilder.h
index d6296b622aab..aa43c02d5cd8 100644
--- a/include/llvm/IR/DIBuilder.h
+++ b/include/llvm/IR/DIBuilder.h
@@ -47,7 +47,7 @@ namespace llvm {
SmallVector<Metadata *, 4> AllGVs;
SmallVector<TrackingMDNodeRef, 4> AllImportedModules;
- /// \brief Track nodes that may be unresolved.
+ /// Track nodes that may be unresolved.
SmallVector<TrackingMDNodeRef, 4> UnresolvedNodes;
bool AllowUnresolvedNodes;
@@ -57,49 +57,52 @@ namespace llvm {
DIBuilder(const DIBuilder &) = delete;
void operator=(const DIBuilder &) = delete;
- /// \brief Create a temporary.
+ /// Create a temporary.
///
/// Create an \a temporary node and track it in \a UnresolvedNodes.
void trackIfUnresolved(MDNode *N);
public:
- /// \brief Construct a builder for a module.
+ /// Construct a builder for a module.
///
/// If \c AllowUnresolved, collect unresolved nodes attached to the module
/// in order to resolve cycles during \a finalize().
explicit DIBuilder(Module &M, bool AllowUnresolved = true);
enum DebugEmissionKind { FullDebug=1, LineTablesOnly };
- /// finalize - Construct any deferred debug info descriptors.
+ /// Construct any deferred debug info descriptors.
void finalize();
- /// createCompileUnit - A CompileUnit provides an anchor for all debugging
+ /// A CompileUnit provides an anchor for all debugging
/// information generated during this instance of compilation.
- /// @param Lang Source programming language, eg. dwarf::DW_LANG_C99
- /// @param File File name
- /// @param Dir Directory
- /// @param Producer Identify the producer of debugging information and code.
- /// Usually this is a compiler version string.
- /// @param isOptimized A boolean flag which indicates whether optimization
- /// is ON or not.
- /// @param Flags This string lists command line options. This string is
- /// directly embedded in debug info output which may be used
- /// by a tool analyzing generated debugging information.
- /// @param RV This indicates runtime version for languages like
- /// Objective-C.
- /// @param SplitName The name of the file that we'll split debug info out
- /// into.
- /// @param Kind The kind of debug information to generate.
- /// @param DWOId The DWOId if this is a split skeleton compile unit.
- /// @param EmitDebugInfo A boolean flag which indicates whether debug
- /// information should be written to the final
- /// output or not. When this is false, debug
- /// information annotations will be present in
- /// the IL but they are not written to the final
- /// assembly or object file. This supports tracking
- /// source location information in the back end
- /// without actually changing the output (e.g.,
- /// when using optimization remarks).
+ /// \param Lang Source programming language, e.g. dwarf::DW_LANG_C99
+ /// \param File File name
+ /// \param Dir Directory
+ /// \param Producer Identify the producer of debugging information
+ /// and code. Usually this is a compiler
+ /// version string.
+ /// \param isOptimized A boolean flag which indicates whether optimization
+ /// is enabled or not.
+ /// \param Flags This string lists command line options. This
+ /// string is directly embedded in debug info
+ /// output which may be used by a tool
+ /// analyzing generated debugging information.
+ /// \param RV This indicates runtime version for languages like
+ /// Objective-C.
+ /// \param SplitName The name of the file that we'll split debug info
+ /// out into.
+ /// \param Kind The kind of debug information to generate.
+ /// \param DWOId The DWOId if this is a split skeleton compile unit.
+ /// \param EmitDebugInfo A boolean flag which indicates whether
+ /// debug information should be written to
+ /// the final output or not. When this is
+ /// false, debug information annotations will
+ /// be present in the IL but they are not
+ /// written to the final assembly or object
+ /// file. This supports tracking source
+ /// location information in the back end
+ /// without actually changing the output
+ /// (e.g., when using optimization remarks).
DICompileUnit *
createCompileUnit(unsigned Lang, StringRef File, StringRef Dir,
StringRef Producer, bool isOptimized, StringRef Flags,
@@ -107,155 +110,155 @@ namespace llvm {
DebugEmissionKind Kind = FullDebug, uint64_t DWOId = 0,
bool EmitDebugInfo = true);
- /// createFile - Create a file descriptor to hold debugging information
+ /// Create a file descriptor to hold debugging information
/// for a file.
DIFile *createFile(StringRef Filename, StringRef Directory);
- /// createEnumerator - Create a single enumerator value.
+ /// Create a single enumerator value.
DIEnumerator *createEnumerator(StringRef Name, int64_t Val);
- /// \brief Create a DWARF unspecified type.
+ /// Create a DWARF unspecified type.
DIBasicType *createUnspecifiedType(StringRef Name);
- /// \brief Create C++11 nullptr type.
+ /// Create C++11 nullptr type.
DIBasicType *createNullPtrType();
- /// createBasicType - Create debugging information entry for a basic
+ /// Create debugging information entry for a basic
/// type.
- /// @param Name Type name.
- /// @param SizeInBits Size of the type.
- /// @param AlignInBits Type alignment.
- /// @param Encoding DWARF encoding code, e.g. dwarf::DW_ATE_float.
+ /// \param Name Type name.
+ /// \param SizeInBits Size of the type.
+ /// \param AlignInBits Type alignment.
+ /// \param Encoding DWARF encoding code, e.g. dwarf::DW_ATE_float.
DIBasicType *createBasicType(StringRef Name, uint64_t SizeInBits,
uint64_t AlignInBits, unsigned Encoding);
- /// createQualifiedType - Create debugging information entry for a qualified
+ /// Create debugging information entry for a qualified
/// type, e.g. 'const int'.
- /// @param Tag Tag identifing type, e.g. dwarf::TAG_volatile_type
- /// @param FromTy Base Type.
+ /// \param Tag Tag identifying type, e.g. dwarf::TAG_volatile_type
+ /// \param FromTy Base Type.
DIDerivedType *createQualifiedType(unsigned Tag, DIType *FromTy);
- /// createPointerType - Create debugging information entry for a pointer.
- /// @param PointeeTy Type pointed by this pointer.
- /// @param SizeInBits Size.
- /// @param AlignInBits Alignment. (optional)
- /// @param Name Pointer type name. (optional)
+ /// Create debugging information entry for a pointer.
+ /// \param PointeeTy Type pointed by this pointer.
+ /// \param SizeInBits Size.
+ /// \param AlignInBits Alignment. (optional)
+ /// \param Name Pointer type name. (optional)
DIDerivedType *createPointerType(DIType *PointeeTy, uint64_t SizeInBits,
uint64_t AlignInBits = 0,
StringRef Name = "");
- /// \brief Create debugging information entry for a pointer to member.
- /// @param PointeeTy Type pointed to by this pointer.
- /// @param SizeInBits Size.
- /// @param AlignInBits Alignment. (optional)
- /// @param Class Type for which this pointer points to members of.
+ /// Create debugging information entry for a pointer to member.
+ /// \param PointeeTy Type pointed to by this pointer.
+ /// \param SizeInBits Size.
+ /// \param AlignInBits Alignment. (optional)
+ /// \param Class Type for which this pointer points to members of.
DIDerivedType *createMemberPointerType(DIType *PointeeTy, DIType *Class,
uint64_t SizeInBits,
uint64_t AlignInBits = 0);
- /// createReferenceType - Create debugging information entry for a c++
+ /// Create debugging information entry for a c++
/// style reference or rvalue reference type.
DIDerivedType *createReferenceType(unsigned Tag, DIType *RTy);
- /// createTypedef - Create debugging information entry for a typedef.
- /// @param Ty Original type.
- /// @param Name Typedef name.
- /// @param File File where this type is defined.
- /// @param LineNo Line number.
- /// @param Context The surrounding context for the typedef.
+ /// Create debugging information entry for a typedef.
+ /// \param Ty Original type.
+ /// \param Name Typedef name.
+ /// \param File File where this type is defined.
+ /// \param LineNo Line number.
+ /// \param Context The surrounding context for the typedef.
DIDerivedType *createTypedef(DIType *Ty, StringRef Name, DIFile *File,
unsigned LineNo, DIScope *Context);
- /// createFriend - Create debugging information entry for a 'friend'.
+ /// Create debugging information entry for a 'friend'.
DIDerivedType *createFriend(DIType *Ty, DIType *FriendTy);
- /// createInheritance - Create debugging information entry to establish
+ /// Create debugging information entry to establish
/// inheritance relationship between two types.
- /// @param Ty Original type.
- /// @param BaseTy Base type. Ty is inherits from base.
- /// @param BaseOffset Base offset.
- /// @param Flags Flags to describe inheritance attribute,
+ /// \param Ty Original type.
+ /// \param BaseTy Base type. Ty inherits from base.
+ /// \param BaseOffset Base offset.
+ /// \param Flags Flags to describe inheritance attribute,
/// e.g. private
DIDerivedType *createInheritance(DIType *Ty, DIType *BaseTy,
uint64_t BaseOffset, unsigned Flags);
- /// createMemberType - Create debugging information entry for a member.
- /// @param Scope Member scope.
- /// @param Name Member name.
- /// @param File File where this member is defined.
- /// @param LineNo Line number.
- /// @param SizeInBits Member size.
- /// @param AlignInBits Member alignment.
- /// @param OffsetInBits Member offset.
- /// @param Flags Flags to encode member attribute, e.g. private
- /// @param Ty Parent type.
+ /// Create debugging information entry for a member.
+ /// \param Scope Member scope.
+ /// \param Name Member name.
+ /// \param File File where this member is defined.
+ /// \param LineNo Line number.
+ /// \param SizeInBits Member size.
+ /// \param AlignInBits Member alignment.
+ /// \param OffsetInBits Member offset.
+ /// \param Flags Flags to encode member attribute, e.g. private
+ /// \param Ty Parent type.
DIDerivedType *createMemberType(DIScope *Scope, StringRef Name,
DIFile *File, unsigned LineNo,
uint64_t SizeInBits, uint64_t AlignInBits,
uint64_t OffsetInBits, unsigned Flags,
DIType *Ty);
- /// createStaticMemberType - Create debugging information entry for a
+ /// Create debugging information entry for a
/// C++ static data member.
- /// @param Scope Member scope.
- /// @param Name Member name.
- /// @param File File where this member is declared.
- /// @param LineNo Line number.
- /// @param Ty Type of the static member.
- /// @param Flags Flags to encode member attribute, e.g. private.
- /// @param Val Const initializer of the member.
+ /// \param Scope Member scope.
+ /// \param Name Member name.
+ /// \param File File where this member is declared.
+ /// \param LineNo Line number.
+ /// \param Ty Type of the static member.
+ /// \param Flags Flags to encode member attribute, e.g. private.
+ /// \param Val Const initializer of the member.
DIDerivedType *createStaticMemberType(DIScope *Scope, StringRef Name,
DIFile *File, unsigned LineNo,
DIType *Ty, unsigned Flags,
llvm::Constant *Val);
- /// createObjCIVar - Create debugging information entry for Objective-C
+ /// Create debugging information entry for Objective-C
/// instance variable.
- /// @param Name Member name.
- /// @param File File where this member is defined.
- /// @param LineNo Line number.
- /// @param SizeInBits Member size.
- /// @param AlignInBits Member alignment.
- /// @param OffsetInBits Member offset.
- /// @param Flags Flags to encode member attribute, e.g. private
- /// @param Ty Parent type.
- /// @param PropertyNode Property associated with this ivar.
+ /// \param Name Member name.
+ /// \param File File where this member is defined.
+ /// \param LineNo Line number.
+ /// \param SizeInBits Member size.
+ /// \param AlignInBits Member alignment.
+ /// \param OffsetInBits Member offset.
+ /// \param Flags Flags to encode member attribute, e.g. private
+ /// \param Ty Parent type.
+ /// \param PropertyNode Property associated with this ivar.
DIDerivedType *createObjCIVar(StringRef Name, DIFile *File, unsigned LineNo,
uint64_t SizeInBits, uint64_t AlignInBits,
uint64_t OffsetInBits, unsigned Flags,
DIType *Ty, MDNode *PropertyNode);
- /// createObjCProperty - Create debugging information entry for Objective-C
+ /// Create debugging information entry for Objective-C
/// property.
- /// @param Name Property name.
- /// @param File File where this property is defined.
- /// @param LineNumber Line number.
- /// @param GetterName Name of the Objective C property getter selector.
- /// @param SetterName Name of the Objective C property setter selector.
- /// @param PropertyAttributes Objective C property attributes.
- /// @param Ty Type.
+ /// \param Name Property name.
+ /// \param File File where this property is defined.
+ /// \param LineNumber Line number.
+ /// \param GetterName Name of the Objective C property getter selector.
+ /// \param SetterName Name of the Objective C property setter selector.
+ /// \param PropertyAttributes Objective C property attributes.
+ /// \param Ty Type.
DIObjCProperty *createObjCProperty(StringRef Name, DIFile *File,
unsigned LineNumber,
StringRef GetterName,
StringRef SetterName,
unsigned PropertyAttributes, DIType *Ty);
- /// createClassType - Create debugging information entry for a class.
- /// @param Scope Scope in which this class is defined.
- /// @param Name class name.
- /// @param File File where this member is defined.
- /// @param LineNumber Line number.
- /// @param SizeInBits Member size.
- /// @param AlignInBits Member alignment.
- /// @param OffsetInBits Member offset.
- /// @param Flags Flags to encode member attribute, e.g. private
- /// @param Elements class members.
- /// @param VTableHolder Debug info of the base class that contains vtable
+ /// Create debugging information entry for a class.
+ /// \param Scope Scope in which this class is defined.
+ /// \param Name class name.
+ /// \param File File where this member is defined.
+ /// \param LineNumber Line number.
+ /// \param SizeInBits Member size.
+ /// \param AlignInBits Member alignment.
+ /// \param OffsetInBits Member offset.
+ /// \param Flags Flags to encode member attribute, e.g. private
+ /// \param Elements class members.
+ /// \param VTableHolder Debug info of the base class that contains vtable
/// for this type. This is used in
/// DW_AT_containing_type. See DWARF documentation
/// for more info.
- /// @param TemplateParms Template type parameters.
- /// @param UniqueIdentifier A unique identifier for the class.
+ /// \param TemplateParms Template type parameters.
+ /// \param UniqueIdentifier A unique identifier for the class.
DICompositeType *createClassType(DIScope *Scope, StringRef Name,
DIFile *File, unsigned LineNumber,
uint64_t SizeInBits, uint64_t AlignInBits,
@@ -265,34 +268,34 @@ namespace llvm {
MDNode *TemplateParms = nullptr,
StringRef UniqueIdentifier = "");
- /// createStructType - Create debugging information entry for a struct.
- /// @param Scope Scope in which this struct is defined.
- /// @param Name Struct name.
- /// @param File File where this member is defined.
- /// @param LineNumber Line number.
- /// @param SizeInBits Member size.
- /// @param AlignInBits Member alignment.
- /// @param Flags Flags to encode member attribute, e.g. private
- /// @param Elements Struct elements.
- /// @param RunTimeLang Optional parameter, Objective-C runtime version.
- /// @param UniqueIdentifier A unique identifier for the struct.
+ /// Create debugging information entry for a struct.
+ /// \param Scope Scope in which this struct is defined.
+ /// \param Name Struct name.
+ /// \param File File where this member is defined.
+ /// \param LineNumber Line number.
+ /// \param SizeInBits Member size.
+ /// \param AlignInBits Member alignment.
+ /// \param Flags Flags to encode member attribute, e.g. private
+ /// \param Elements Struct elements.
+ /// \param RunTimeLang Optional parameter, Objective-C runtime version.
+ /// \param UniqueIdentifier A unique identifier for the struct.
DICompositeType *createStructType(
DIScope *Scope, StringRef Name, DIFile *File, unsigned LineNumber,
uint64_t SizeInBits, uint64_t AlignInBits, unsigned Flags,
DIType *DerivedFrom, DINodeArray Elements, unsigned RunTimeLang = 0,
DIType *VTableHolder = nullptr, StringRef UniqueIdentifier = "");
- /// createUnionType - Create debugging information entry for an union.
- /// @param Scope Scope in which this union is defined.
- /// @param Name Union name.
- /// @param File File where this member is defined.
- /// @param LineNumber Line number.
- /// @param SizeInBits Member size.
- /// @param AlignInBits Member alignment.
- /// @param Flags Flags to encode member attribute, e.g. private
- /// @param Elements Union elements.
- /// @param RunTimeLang Optional parameter, Objective-C runtime version.
- /// @param UniqueIdentifier A unique identifier for the union.
+ /// Create debugging information entry for a union.
+ /// \param Scope Scope in which this union is defined.
+ /// \param Name Union name.
+ /// \param File File where this member is defined.
+ /// \param LineNumber Line number.
+ /// \param SizeInBits Member size.
+ /// \param AlignInBits Member alignment.
+ /// \param Flags Flags to encode member attribute, e.g. private
+ /// \param Elements Union elements.
+ /// \param RunTimeLang Optional parameter, Objective-C runtime version.
+ /// \param UniqueIdentifier A unique identifier for the union.
DICompositeType *createUnionType(DIScope *Scope, StringRef Name,
DIFile *File, unsigned LineNumber,
uint64_t SizeInBits, uint64_t AlignInBits,
@@ -300,95 +303,95 @@ namespace llvm {
unsigned RunTimeLang = 0,
StringRef UniqueIdentifier = "");
- /// createTemplateTypeParameter - Create debugging information for template
+ /// Create debugging information for template
/// type parameter.
- /// @param Scope Scope in which this type is defined.
- /// @param Name Type parameter name.
- /// @param Ty Parameter type.
+ /// \param Scope Scope in which this type is defined.
+ /// \param Name Type parameter name.
+ /// \param Ty Parameter type.
DITemplateTypeParameter *
createTemplateTypeParameter(DIScope *Scope, StringRef Name, DIType *Ty);
- /// createTemplateValueParameter - Create debugging information for template
+ /// Create debugging information for template
/// value parameter.
- /// @param Scope Scope in which this type is defined.
- /// @param Name Value parameter name.
- /// @param Ty Parameter type.
- /// @param Val Constant parameter value.
+ /// \param Scope Scope in which this type is defined.
+ /// \param Name Value parameter name.
+ /// \param Ty Parameter type.
+ /// \param Val Constant parameter value.
DITemplateValueParameter *createTemplateValueParameter(DIScope *Scope,
StringRef Name,
DIType *Ty,
Constant *Val);
- /// \brief Create debugging information for a template template parameter.
- /// @param Scope Scope in which this type is defined.
- /// @param Name Value parameter name.
- /// @param Ty Parameter type.
- /// @param Val The fully qualified name of the template.
+ /// Create debugging information for a template template parameter.
+ /// \param Scope Scope in which this type is defined.
+ /// \param Name Value parameter name.
+ /// \param Ty Parameter type.
+ /// \param Val The fully qualified name of the template.
DITemplateValueParameter *createTemplateTemplateParameter(DIScope *Scope,
StringRef Name,
DIType *Ty,
StringRef Val);
- /// \brief Create debugging information for a template parameter pack.
- /// @param Scope Scope in which this type is defined.
- /// @param Name Value parameter name.
- /// @param Ty Parameter type.
- /// @param Val An array of types in the pack.
+ /// Create debugging information for a template parameter pack.
+ /// \param Scope Scope in which this type is defined.
+ /// \param Name Value parameter name.
+ /// \param Ty Parameter type.
+ /// \param Val An array of types in the pack.
DITemplateValueParameter *createTemplateParameterPack(DIScope *Scope,
StringRef Name,
DIType *Ty,
DINodeArray Val);
- /// createArrayType - Create debugging information entry for an array.
- /// @param Size Array size.
- /// @param AlignInBits Alignment.
- /// @param Ty Element type.
- /// @param Subscripts Subscripts.
+ /// Create debugging information entry for an array.
+ /// \param Size Array size.
+ /// \param AlignInBits Alignment.
+ /// \param Ty Element type.
+ /// \param Subscripts Subscripts.
DICompositeType *createArrayType(uint64_t Size, uint64_t AlignInBits,
DIType *Ty, DINodeArray Subscripts);
- /// createVectorType - Create debugging information entry for a vector type.
- /// @param Size Array size.
- /// @param AlignInBits Alignment.
- /// @param Ty Element type.
- /// @param Subscripts Subscripts.
+ /// Create debugging information entry for a vector type.
+ /// \param Size Array size.
+ /// \param AlignInBits Alignment.
+ /// \param Ty Element type.
+ /// \param Subscripts Subscripts.
DICompositeType *createVectorType(uint64_t Size, uint64_t AlignInBits,
DIType *Ty, DINodeArray Subscripts);
- /// createEnumerationType - Create debugging information entry for an
+ /// Create debugging information entry for an
/// enumeration.
- /// @param Scope Scope in which this enumeration is defined.
- /// @param Name Union name.
- /// @param File File where this member is defined.
- /// @param LineNumber Line number.
- /// @param SizeInBits Member size.
- /// @param AlignInBits Member alignment.
- /// @param Elements Enumeration elements.
- /// @param UnderlyingType Underlying type of a C++11/ObjC fixed enum.
- /// @param UniqueIdentifier A unique identifier for the enum.
+ /// \param Scope Scope in which this enumeration is defined.
+ /// \param Name Enumeration name.
+ /// \param File File where this member is defined.
+ /// \param LineNumber Line number.
+ /// \param SizeInBits Member size.
+ /// \param AlignInBits Member alignment.
+ /// \param Elements Enumeration elements.
+ /// \param UnderlyingType Underlying type of a C++11/ObjC fixed enum.
+ /// \param UniqueIdentifier A unique identifier for the enum.
DICompositeType *createEnumerationType(
DIScope *Scope, StringRef Name, DIFile *File, unsigned LineNumber,
uint64_t SizeInBits, uint64_t AlignInBits, DINodeArray Elements,
DIType *UnderlyingType, StringRef UniqueIdentifier = "");
- /// createSubroutineType - Create subroutine type.
- /// @param File File in which this subroutine is defined.
- /// @param ParameterTypes An array of subroutine parameter types. This
+ /// Create subroutine type.
+ /// \param File File in which this subroutine is defined.
+ /// \param ParameterTypes An array of subroutine parameter types. This
/// includes return type at 0th index.
- /// @param Flags E.g.: LValueReference.
+ /// \param Flags E.g.: LValueReference.
/// These flags are used to emit dwarf attributes.
DISubroutineType *createSubroutineType(DIFile *File,
DITypeRefArray ParameterTypes,
unsigned Flags = 0);
- /// createArtificialType - Create a new DIType* with "artificial" flag set.
+ /// Create a new DIType* with "artificial" flag set.
DIType *createArtificialType(DIType *Ty);
- /// createObjectPointerType - Create a new DIType* with the "object pointer"
+ /// Create a new DIType* with the "object pointer"
/// flag set.
DIType *createObjectPointerType(DIType *Ty);
- /// \brief Create a permanent forward-declared type.
+ /// Create a permanent forward-declared type.
DICompositeType *createForwardDecl(unsigned Tag, StringRef Name,
DIScope *Scope, DIFile *F, unsigned Line,
unsigned RuntimeLang = 0,
@@ -396,43 +399,43 @@ namespace llvm {
uint64_t AlignInBits = 0,
StringRef UniqueIdentifier = "");
- /// \brief Create a temporary forward-declared type.
+ /// Create a temporary forward-declared type.
DICompositeType *createReplaceableCompositeType(
unsigned Tag, StringRef Name, DIScope *Scope, DIFile *F, unsigned Line,
unsigned RuntimeLang = 0, uint64_t SizeInBits = 0,
uint64_t AlignInBits = 0, unsigned Flags = DINode::FlagFwdDecl,
StringRef UniqueIdentifier = "");
- /// retainType - Retain DIType* in a module even if it is not referenced
+ /// Retain DIType* in a module even if it is not referenced
/// through debug info anchors.
void retainType(DIType *T);
- /// createUnspecifiedParameter - Create unspecified parameter type
+ /// Create unspecified parameter type
/// for a subroutine type.
DIBasicType *createUnspecifiedParameter();
- /// getOrCreateArray - Get a DINodeArray, create one if required.
+ /// Get a DINodeArray, create one if required.
DINodeArray getOrCreateArray(ArrayRef<Metadata *> Elements);
- /// getOrCreateTypeArray - Get a DITypeRefArray, create one if required.
+ /// Get a DITypeRefArray, create one if required.
DITypeRefArray getOrCreateTypeArray(ArrayRef<Metadata *> Elements);
- /// getOrCreateSubrange - Create a descriptor for a value range. This
+ /// Create a descriptor for a value range. This
/// implicitly uniques the values returned.
DISubrange *getOrCreateSubrange(int64_t Lo, int64_t Count);
- /// createGlobalVariable - Create a new descriptor for the specified
+ /// Create a new descriptor for the specified
/// variable.
- /// @param Context Variable scope.
- /// @param Name Name of the variable.
- /// @param LinkageName Mangled name of the variable.
- /// @param File File where this variable is defined.
- /// @param LineNo Line number.
- /// @param Ty Variable Type.
- /// @param isLocalToUnit Boolean flag indicate whether this variable is
+ /// \param Context Variable scope.
+ /// \param Name Name of the variable.
+ /// \param LinkageName Mangled name of the variable.
+ /// \param File File where this variable is defined.
+ /// \param LineNo Line number.
+ /// \param Ty Variable Type.
+ /// \param isLocalToUnit Boolean flag indicating whether this variable is
/// externally visible or not.
- /// @param Val llvm::Value of the variable.
- /// @param Decl Reference to the corresponding declaration.
+ /// \param Val llvm::Value of the variable.
+ /// \param Decl Reference to the corresponding declaration.
DIGlobalVariable *createGlobalVariable(DIScope *Context, StringRef Name,
StringRef LinkageName, DIFile *File,
unsigned LineNo, DIType *Ty,
@@ -440,26 +443,26 @@ namespace llvm {
llvm::Constant *Val,
MDNode *Decl = nullptr);
- /// createTempGlobalVariableFwdDecl - Identical to createGlobalVariable
+ /// Identical to createGlobalVariable
/// except that the resulting DbgNode is temporary and meant to be RAUWed.
DIGlobalVariable *createTempGlobalVariableFwdDecl(
DIScope *Context, StringRef Name, StringRef LinkageName, DIFile *File,
unsigned LineNo, DIType *Ty, bool isLocalToUnit, llvm::Constant *Val,
MDNode *Decl = nullptr);
- /// createLocalVariable - Create a new descriptor for the specified
+ /// Create a new descriptor for the specified
/// local variable.
- /// @param Tag Dwarf TAG. Usually DW_TAG_auto_variable or
+ /// \param Tag Dwarf TAG. Usually DW_TAG_auto_variable or
/// DW_TAG_arg_variable.
- /// @param Scope Variable scope.
- /// @param Name Variable name.
- /// @param File File where this variable is defined.
- /// @param LineNo Line number.
- /// @param Ty Variable Type
- /// @param AlwaysPreserve Boolean. Set to true if debug info for this
+ /// \param Scope Variable scope.
+ /// \param Name Variable name.
+ /// \param File File where this variable is defined.
+ /// \param LineNo Line number.
+ /// \param Ty Variable Type
+ /// \param AlwaysPreserve Boolean. Set to true if debug info for this
/// variable should be preserved in optimized build.
- /// @param Flags Flags, e.g. artificial variable.
- /// @param ArgNo If this variable is an argument then this argument's
+ /// \param Flags Flags, e.g. artificial variable.
+ /// \param ArgNo If this variable is an argument then this argument's
/// number. 1 indicates 1st argument.
DILocalVariable *createLocalVariable(unsigned Tag, DIScope *Scope,
StringRef Name, DIFile *File,
@@ -468,36 +471,36 @@ namespace llvm {
unsigned Flags = 0,
unsigned ArgNo = 0);
- /// createExpression - Create a new descriptor for the specified
+ /// Create a new descriptor for the specified
/// variable which has a complex address expression for its address.
- /// @param Addr An array of complex address operations.
+ /// \param Addr An array of complex address operations.
DIExpression *createExpression(ArrayRef<uint64_t> Addr = None);
DIExpression *createExpression(ArrayRef<int64_t> Addr);
- /// createBitPieceExpression - Create a descriptor to describe one part
+ /// Create a descriptor to describe one part
/// of aggregate variable that is fragmented across multiple Values.
///
- /// @param OffsetInBits Offset of the piece in bits.
- /// @param SizeInBits Size of the piece in bits.
+ /// \param OffsetInBits Offset of the piece in bits.
+ /// \param SizeInBits Size of the piece in bits.
DIExpression *createBitPieceExpression(unsigned OffsetInBits,
unsigned SizeInBits);
- /// createFunction - Create a new descriptor for the specified subprogram.
+ /// Create a new descriptor for the specified subprogram.
/// See comments in DISubprogram* for descriptions of these fields.
- /// @param Scope Function scope.
- /// @param Name Function name.
- /// @param LinkageName Mangled function name.
- /// @param File File where this variable is defined.
- /// @param LineNo Line number.
- /// @param Ty Function type.
- /// @param isLocalToUnit True if this function is not externally visible.
- /// @param isDefinition True if this is a function definition.
- /// @param ScopeLine Set to the beginning of the scope this starts
- /// @param Flags e.g. is this function prototyped or not.
+ /// \param Scope Function scope.
+ /// \param Name Function name.
+ /// \param LinkageName Mangled function name.
+ /// \param File File where this variable is defined.
+ /// \param LineNo Line number.
+ /// \param Ty Function type.
+ /// \param isLocalToUnit True if this function is not externally visible.
+ /// \param isDefinition True if this is a function definition.
+ /// \param ScopeLine Set to the beginning of the scope this starts
+ /// \param Flags e.g. is this function prototyped or not.
/// These flags are used to emit dwarf attributes.
- /// @param isOptimized True if optimization is ON.
- /// @param Fn llvm::Function pointer.
- /// @param TParam Function template parameters.
+ /// \param isOptimized True if optimization is ON.
+ /// \param Fn llvm::Function pointer.
+ /// \param TParam Function template parameters.
DISubprogram *
createFunction(DIScope *Scope, StringRef Name, StringRef LinkageName,
DIFile *File, unsigned LineNo, DISubroutineType *Ty,
@@ -506,7 +509,7 @@ namespace llvm {
Function *Fn = nullptr, MDNode *TParam = nullptr,
MDNode *Decl = nullptr);
- /// createTempFunctionFwdDecl - Identical to createFunction,
+ /// Identical to createFunction,
/// except that the resulting DbgNode is meant to be RAUWed.
DISubprogram *createTempFunctionFwdDecl(
DIScope *Scope, StringRef Name, StringRef LinkageName, DIFile *File,
@@ -525,25 +528,25 @@ namespace llvm {
Function *Fn = nullptr, MDNode *TParam = nullptr,
MDNode *Decl = nullptr);
- /// createMethod - Create a new descriptor for the specified C++ method.
- /// See comments in DISubprogram* for descriptions of these fields.
- /// @param Scope Function scope.
- /// @param Name Function name.
- /// @param LinkageName Mangled function name.
- /// @param File File where this variable is defined.
- /// @param LineNo Line number.
- /// @param Ty Function type.
- /// @param isLocalToUnit True if this function is not externally visible..
- /// @param isDefinition True if this is a function definition.
- /// @param Virtuality Attributes describing virtualness. e.g. pure
+ /// Create a new descriptor for the specified C++ method.
+ /// See comments in \a DISubprogram* for descriptions of these fields.
+ /// \param Scope Function scope.
+ /// \param Name Function name.
+ /// \param LinkageName Mangled function name.
+ /// \param File File where this variable is defined.
+ /// \param LineNo Line number.
+ /// \param Ty Function type.
+ /// \param isLocalToUnit True if this function is not externally visible.
+ /// \param isDefinition True if this is a function definition.
+ /// \param Virtuality Attributes describing virtualness. e.g. pure
/// virtual function.
- /// @param VTableIndex Index no of this method in virtual table.
- /// @param VTableHolder Type that holds vtable.
- /// @param Flags e.g. is this function prototyped or not.
+ /// \param VTableIndex Index no of this method in virtual table.
+ /// \param VTableHolder Type that holds vtable.
+ /// \param Flags e.g. is this function prototyped or not.
/// These flags are used to emit dwarf attributes.
- /// @param isOptimized True if optimization is ON.
- /// @param Fn llvm::Function pointer.
- /// @param TParam Function template parameters.
+ /// \param isOptimized True if optimization is ON.
+ /// \param Fn llvm::Function pointer.
+ /// \param TParam Function template parameters.
DISubprogram *
createMethod(DIScope *Scope, StringRef Name, StringRef LinkageName,
DIFile *File, unsigned LineNo, DISubroutineType *Ty,
@@ -552,131 +555,131 @@ namespace llvm {
unsigned Flags = 0, bool isOptimized = false,
Function *Fn = nullptr, MDNode *TParam = nullptr);
- /// createNameSpace - This creates new descriptor for a namespace
- /// with the specified parent scope.
- /// @param Scope Namespace scope
- /// @param Name Name of this namespace
- /// @param File Source file
- /// @param LineNo Line number
+ /// This creates a new descriptor for a namespace with the specified
+ /// parent scope.
+ /// \param Scope Namespace scope
+ /// \param Name Name of this namespace
+ /// \param File Source file
+ /// \param LineNo Line number
DINamespace *createNameSpace(DIScope *Scope, StringRef Name, DIFile *File,
unsigned LineNo);
- /// createModule - This creates new descriptor for a module
- /// with the specified parent scope.
- /// @param Scope Parent scope
- /// @param Name Name of this module
- /// @param ConfigurationMacros
+ /// This creates a new descriptor for a module with the specified
+ /// parent scope.
+ /// \param Scope Parent scope
+ /// \param Name Name of this module
+ /// \param ConfigurationMacros
/// A space-separated shell-quoted list of -D macro
/// definitions as they would appear on a command line.
- /// @param IncludePath The path to the module map file.
- /// @param ISysRoot The clang system root (value of -isysroot).
+ /// \param IncludePath The path to the module map file.
+ /// \param ISysRoot The clang system root (value of -isysroot).
DIModule *createModule(DIScope *Scope, StringRef Name,
StringRef ConfigurationMacros,
StringRef IncludePath,
StringRef ISysRoot);
- /// createLexicalBlockFile - This creates a descriptor for a lexical
- /// block with a new file attached. This merely extends the existing
+ /// This creates a descriptor for a lexical block with a new file
+ /// attached. This merely extends the existing
/// lexical block as it crosses a file.
- /// @param Scope Lexical block.
- /// @param File Source file.
- /// @param Discriminator DWARF path discriminator value.
+ /// \param Scope Lexical block.
+ /// \param File Source file.
+ /// \param Discriminator DWARF path discriminator value.
DILexicalBlockFile *createLexicalBlockFile(DIScope *Scope, DIFile *File,
unsigned Discriminator = 0);
- /// createLexicalBlock - This creates a descriptor for a lexical block
- /// with the specified parent context.
- /// @param Scope Parent lexical scope.
- /// @param File Source file.
- /// @param Line Line number.
- /// @param Col Column number.
+ /// This creates a descriptor for a lexical block with the
+ /// specified parent context.
+ /// \param Scope Parent lexical scope.
+ /// \param File Source file.
+ /// \param Line Line number.
+ /// \param Col Column number.
DILexicalBlock *createLexicalBlock(DIScope *Scope, DIFile *File,
unsigned Line, unsigned Col);
- /// \brief Create a descriptor for an imported module.
- /// @param Context The scope this module is imported into
- /// @param NS The namespace being imported here
- /// @param Line Line number
+ /// Create a descriptor for an imported module.
+ /// \param Context The scope this module is imported into
+ /// \param NS The namespace being imported here
+ /// \param Line Line number
DIImportedEntity *createImportedModule(DIScope *Context, DINamespace *NS,
unsigned Line);
- /// \brief Create a descriptor for an imported module.
- /// @param Context The scope this module is imported into
- /// @param NS An aliased namespace
- /// @param Line Line number
+ /// Create a descriptor for an imported module.
+ /// \param Context The scope this module is imported into
+ /// \param NS An aliased namespace
+ /// \param Line Line number
DIImportedEntity *createImportedModule(DIScope *Context,
DIImportedEntity *NS, unsigned Line);
- /// \brief Create a descriptor for an imported module.
- /// @param Context The scope this module is imported into
- /// @param M The module being imported here
- /// @param Line Line number
+ /// Create a descriptor for an imported module.
+ /// \param Context The scope this module is imported into
+ /// \param M The module being imported here
+ /// \param Line Line number
DIImportedEntity *createImportedModule(DIScope *Context, DIModule *M,
unsigned Line);
- /// \brief Create a descriptor for an imported function.
- /// @param Context The scope this module is imported into
- /// @param Decl The declaration (or definition) of a function, type, or
+ /// Create a descriptor for an imported function.
+ /// \param Context The scope this module is imported into
+ /// \param Decl The declaration (or definition) of a function, type, or
/// variable
- /// @param Line Line number
+ /// \param Line Line number
DIImportedEntity *createImportedDeclaration(DIScope *Context, DINode *Decl,
unsigned Line,
StringRef Name = "");
- /// insertDeclare - Insert a new llvm.dbg.declare intrinsic call.
- /// @param Storage llvm::Value of the variable
- /// @param VarInfo Variable's debug info descriptor.
- /// @param Expr A complex location expression.
- /// @param DL Debug info location.
- /// @param InsertAtEnd Location for the new intrinsic.
+ /// Insert a new llvm.dbg.declare intrinsic call.
+ /// \param Storage llvm::Value of the variable
+ /// \param VarInfo Variable's debug info descriptor.
+ /// \param Expr A complex location expression.
+ /// \param DL Debug info location.
+ /// \param InsertAtEnd Location for the new intrinsic.
Instruction *insertDeclare(llvm::Value *Storage, DILocalVariable *VarInfo,
DIExpression *Expr, const DILocation *DL,
BasicBlock *InsertAtEnd);
- /// insertDeclare - Insert a new llvm.dbg.declare intrinsic call.
- /// @param Storage llvm::Value of the variable
- /// @param VarInfo Variable's debug info descriptor.
- /// @param Expr A complex location expression.
- /// @param DL Debug info location.
- /// @param InsertBefore Location for the new intrinsic.
+ /// Insert a new llvm.dbg.declare intrinsic call.
+ /// \param Storage llvm::Value of the variable
+ /// \param VarInfo Variable's debug info descriptor.
+ /// \param Expr A complex location expression.
+ /// \param DL Debug info location.
+ /// \param InsertBefore Location for the new intrinsic.
Instruction *insertDeclare(llvm::Value *Storage, DILocalVariable *VarInfo,
DIExpression *Expr, const DILocation *DL,
Instruction *InsertBefore);
- /// insertDbgValueIntrinsic - Insert a new llvm.dbg.value intrinsic call.
- /// @param Val llvm::Value of the variable
- /// @param Offset Offset
- /// @param VarInfo Variable's debug info descriptor.
- /// @param Expr A complex location expression.
- /// @param DL Debug info location.
- /// @param InsertAtEnd Location for the new intrinsic.
+ /// Insert a new llvm.dbg.value intrinsic call.
+ /// \param Val llvm::Value of the variable
+ /// \param Offset Offset
+ /// \param VarInfo Variable's debug info descriptor.
+ /// \param Expr A complex location expression.
+ /// \param DL Debug info location.
+ /// \param InsertAtEnd Location for the new intrinsic.
Instruction *insertDbgValueIntrinsic(llvm::Value *Val, uint64_t Offset,
DILocalVariable *VarInfo,
DIExpression *Expr,
const DILocation *DL,
BasicBlock *InsertAtEnd);
- /// insertDbgValueIntrinsic - Insert a new llvm.dbg.value intrinsic call.
- /// @param Val llvm::Value of the variable
- /// @param Offset Offset
- /// @param VarInfo Variable's debug info descriptor.
- /// @param Expr A complex location expression.
- /// @param DL Debug info location.
- /// @param InsertBefore Location for the new intrinsic.
+ /// Insert a new llvm.dbg.value intrinsic call.
+ /// \param Val llvm::Value of the variable
+ /// \param Offset Offset
+ /// \param VarInfo Variable's debug info descriptor.
+ /// \param Expr A complex location expression.
+ /// \param DL Debug info location.
+ /// \param InsertBefore Location for the new intrinsic.
Instruction *insertDbgValueIntrinsic(llvm::Value *Val, uint64_t Offset,
DILocalVariable *VarInfo,
DIExpression *Expr,
const DILocation *DL,
Instruction *InsertBefore);
- /// \brief Replace the vtable holder in the given composite type.
+ /// Replace the vtable holder in the given composite type.
///
/// If this creates a self reference, it may orphan some unresolved cycles
/// in the operands of \c T, so \a DIBuilder needs to track that.
void replaceVTableHolder(DICompositeType *&T,
DICompositeType *VTableHolder);
- /// \brief Replace arrays on a composite type.
+ /// Replace arrays on a composite type.
///
/// If \c T is resolved, but the arrays aren't -- which can happen if \c T
/// has a self-reference -- \a DIBuilder needs to track the array to
@@ -684,7 +687,7 @@ namespace llvm {
void replaceArrays(DICompositeType *&T, DINodeArray Elements,
DINodeArray TParems = DINodeArray());
- /// \brief Replace a temporary node.
+ /// Replace a temporary node.
///
/// Call \a MDNode::replaceAllUsesWith() on \c N, replacing it with \c
/// Replacement.
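Taken together, the builder methods above compose in the usual way. A brief sketch (not part of the patch) that wires a module up with a compile unit and one basic type, assuming the split-name and later createCompileUnit parameters keep their defaults:

    #include "llvm/IR/DIBuilder.h"
    #include "llvm/IR/Module.h"
    #include "llvm/Support/Dwarf.h"
    using namespace llvm;

    void emitMinimalDebugInfo(Module &M) {
      DIBuilder DIB(M);                    // AllowUnresolved defaults to true
      DIB.createCompileUnit(dwarf::DW_LANG_C99, "t.c", "/tmp", "my compiler",
                            /*isOptimized=*/false, /*Flags=*/"", /*RV=*/0);
      DIB.createBasicType("int", 32, 32, dwarf::DW_ATE_signed);
      DIB.finalize();                      // construct deferred descriptors
    }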
diff --git a/include/llvm/IR/DebugInfoMetadata.h b/include/llvm/IR/DebugInfoMetadata.h
index 5c99300c35c7..9c5a95721d79 100644
--- a/include/llvm/IR/DebugInfoMetadata.h
+++ b/include/llvm/IR/DebugInfoMetadata.h
@@ -1085,10 +1085,10 @@ public:
/// deleted on a uniquing collision. In practice, uniquing collisions on \a
/// DICompileUnit should be fairly rare.
/// @{
- void replaceEnumTypes(DISubprogramArray N) {
+ void replaceEnumTypes(DICompositeTypeArray N) {
replaceOperandWith(4, N.get());
}
- void replaceRetainedTypes(DISubprogramArray N) {
+ void replaceRetainedTypes(DITypeArray N) {
replaceOperandWith(5, N.get());
}
void replaceSubprograms(DISubprogramArray N) {
@@ -1097,7 +1097,7 @@ public:
void replaceGlobalVariables(DIGlobalVariableArray N) {
replaceOperandWith(7, N.get());
}
- void replaceImportedEntities(DIGlobalVariableArray N) {
+ void replaceImportedEntities(DIImportedEntityArray N) {
replaceOperandWith(8, N.get());
}
/// @}
@@ -1650,14 +1650,14 @@ class DIModule : public DIScope {
StorageType Storage, bool ShouldCreate = true) {
return getImpl(Context, Scope, getCanonicalMDString(Context, Name),
getCanonicalMDString(Context, ConfigurationMacros),
- getCanonicalMDString(Context, IncludePath),
- getCanonicalMDString(Context, ISysRoot),
+ getCanonicalMDString(Context, IncludePath),
+ getCanonicalMDString(Context, ISysRoot),
Storage, ShouldCreate);
}
static DIModule *getImpl(LLVMContext &Context, Metadata *Scope,
MDString *Name, MDString *ConfigurationMacros,
- MDString *IncludePath, MDString *ISysRoot,
- StorageType Storage, bool ShouldCreate = true);
+ MDString *IncludePath, MDString *ISysRoot,
+ StorageType Storage, bool ShouldCreate = true);
TempDIModule cloneImpl() const {
return getTemporary(getContext(), getScope(), getName(),
@@ -1667,12 +1667,12 @@ class DIModule : public DIScope {
public:
DEFINE_MDNODE_GET(DIModule, (DIScope *Scope, StringRef Name,
- StringRef ConfigurationMacros, StringRef IncludePath,
- StringRef ISysRoot),
+ StringRef ConfigurationMacros, StringRef IncludePath,
+ StringRef ISysRoot),
(Scope, Name, ConfigurationMacros, IncludePath, ISysRoot))
DEFINE_MDNODE_GET(DIModule,
(Metadata *Scope, MDString *Name, MDString *ConfigurationMacros,
- MDString *IncludePath, MDString *ISysRoot),
+ MDString *IncludePath, MDString *ISysRoot),
(Scope, Name, ConfigurationMacros, IncludePath, ISysRoot))
TempDIModule clone() const { return cloneImpl(); }
diff --git a/include/llvm/IR/Dominators.h b/include/llvm/IR/Dominators.h
index c1f208e3d72f..27d989b0344c 100644
--- a/include/llvm/IR/Dominators.h
+++ b/include/llvm/IR/Dominators.h
@@ -36,18 +36,14 @@ namespace llvm {
template <typename IRUnitT> class AnalysisManager;
class PreservedAnalyses;
-EXTERN_TEMPLATE_INSTANTIATION(class DomTreeNodeBase<BasicBlock>);
-EXTERN_TEMPLATE_INSTANTIATION(class DominatorTreeBase<BasicBlock>);
-
-#define LLVM_COMMA ,
-EXTERN_TEMPLATE_INSTANTIATION(void Calculate<Function LLVM_COMMA BasicBlock *>(
- DominatorTreeBase<GraphTraits<BasicBlock *>::NodeType> &DT LLVM_COMMA
- Function &F));
-EXTERN_TEMPLATE_INSTANTIATION(
- void Calculate<Function LLVM_COMMA Inverse<BasicBlock *> >(
- DominatorTreeBase<GraphTraits<Inverse<BasicBlock *> >::NodeType> &DT
- LLVM_COMMA Function &F));
-#undef LLVM_COMMA
+extern template class DomTreeNodeBase<BasicBlock>;
+extern template class DominatorTreeBase<BasicBlock>;
+
+extern template void Calculate<Function, BasicBlock *>(
+ DominatorTreeBase<GraphTraits<BasicBlock *>::NodeType> &DT, Function &F);
+extern template void Calculate<Function, Inverse<BasicBlock *>>(
+ DominatorTreeBase<GraphTraits<Inverse<BasicBlock *>>::NodeType> &DT,
+ Function &F);
typedef DomTreeNodeBase<BasicBlock> DomTreeNode;
diff --git a/include/llvm/IR/Function.h b/include/llvm/IR/Function.h
index 02ea056de39b..ec9f4cad094a 100644
--- a/include/llvm/IR/Function.h
+++ b/include/llvm/IR/Function.h
@@ -293,6 +293,16 @@ public:
addFnAttr(Attribute::ReadOnly);
}
+ /// @brief Determine if the function can access memory only using pointers based
+ /// on its arguments.
+ bool onlyAccessesArgMemory() const {
+ return AttributeSets.hasAttribute(AttributeSet::FunctionIndex,
+ Attribute::ArgMemOnly);
+ }
+ void setOnlyAccessesArgMemory() {
+ addFnAttr(Attribute::ArgMemOnly);
+ }
+
/// @brief Determine if the function cannot return.
bool doesNotReturn() const {
return AttributeSets.hasAttribute(AttributeSet::FunctionIndex,
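A short sketch of the new attribute in use (illustrative only; the Function and CallSite are assumed to exist already):

    #include "llvm/IR/CallSite.h"
    #include "llvm/IR/Function.h"
    #include <cassert>
    using namespace llvm;

    void markArgMemOnly(Function &F, CallSite CS) {
      F.setOnlyAccessesArgMemory();      // adds Attribute::ArgMemOnly to F
      CS.setOnlyAccessesArgMemory();     // same attribute on the call site
      // Alias analysis may now assume such calls only touch memory reachable
      // from their pointer arguments.
      assert(F.onlyAccessesArgMemory() && CS.onlyAccessesArgMemory());
    }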
diff --git a/include/llvm/IR/GlobalValue.h b/include/llvm/IR/GlobalValue.h
index f2379705d460..2961369a7327 100644
--- a/include/llvm/IR/GlobalValue.h
+++ b/include/llvm/IR/GlobalValue.h
@@ -252,10 +252,9 @@ public:
/// mistake: when working at the IR level use mayBeOverridden instead as it
/// knows about ODR semantics.
static bool isWeakForLinker(LinkageTypes Linkage) {
- return Linkage == AvailableExternallyLinkage || Linkage == WeakAnyLinkage ||
- Linkage == WeakODRLinkage || Linkage == LinkOnceAnyLinkage ||
- Linkage == LinkOnceODRLinkage || Linkage == CommonLinkage ||
- Linkage == ExternalWeakLinkage;
+ return Linkage == WeakAnyLinkage || Linkage == WeakODRLinkage ||
+ Linkage == LinkOnceAnyLinkage || Linkage == LinkOnceODRLinkage ||
+ Linkage == CommonLinkage || Linkage == ExternalWeakLinkage;
}
bool hasExternalLinkage() const { return isExternalLinkage(Linkage); }
@@ -349,6 +348,12 @@ public:
return isDeclaration();
}
+ /// Returns true if this global's definition will be the one chosen by the
+ /// linker.
+ bool isStrongDefinitionForLinker() const {
+ return !(isDeclarationForLinker() || isWeakForLinker());
+ }
+
/// This method unlinks 'this' from the containing module, but does not delete
/// it.
virtual void removeFromParent() = 0;
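A small sketch of the new predicate (not from the patch), walking a module and reporting the definitions the linker is guaranteed to choose:

    #include "llvm/IR/Module.h"
    #include "llvm/Support/raw_ostream.h"
    using namespace llvm;

    void printStrongDefinitions(Module &M) {
      for (Function &F : M)
        if (F.isStrongDefinitionForLinker())
          errs() << F.getName() << " is a strong definition\n";
      for (GlobalVariable &GV : M.globals())
        if (GV.isStrongDefinitionForLinker())
          errs() << GV.getName() << " is a strong definition\n";
    }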
diff --git a/include/llvm/IR/IRBuilder.h b/include/llvm/IR/IRBuilder.h
index e6b5393c3397..6c67c79b6c0e 100644
--- a/include/llvm/IR/IRBuilder.h
+++ b/include/llvm/IR/IRBuilder.h
@@ -1382,47 +1382,61 @@ public:
return CreateICmp(ICmpInst::ICMP_SLE, LHS, RHS, Name);
}
- Value *CreateFCmpOEQ(Value *LHS, Value *RHS, const Twine &Name = "") {
- return CreateFCmp(FCmpInst::FCMP_OEQ, LHS, RHS, Name);
+ Value *CreateFCmpOEQ(Value *LHS, Value *RHS, const Twine &Name = "",
+ MDNode *FPMathTag = nullptr) {
+ return CreateFCmp(FCmpInst::FCMP_OEQ, LHS, RHS, Name, FPMathTag);
}
- Value *CreateFCmpOGT(Value *LHS, Value *RHS, const Twine &Name = "") {
- return CreateFCmp(FCmpInst::FCMP_OGT, LHS, RHS, Name);
+ Value *CreateFCmpOGT(Value *LHS, Value *RHS, const Twine &Name = "",
+ MDNode *FPMathTag = nullptr) {
+ return CreateFCmp(FCmpInst::FCMP_OGT, LHS, RHS, Name, FPMathTag);
}
- Value *CreateFCmpOGE(Value *LHS, Value *RHS, const Twine &Name = "") {
- return CreateFCmp(FCmpInst::FCMP_OGE, LHS, RHS, Name);
+ Value *CreateFCmpOGE(Value *LHS, Value *RHS, const Twine &Name = "",
+ MDNode *FPMathTag = nullptr) {
+ return CreateFCmp(FCmpInst::FCMP_OGE, LHS, RHS, Name, FPMathTag);
}
- Value *CreateFCmpOLT(Value *LHS, Value *RHS, const Twine &Name = "") {
- return CreateFCmp(FCmpInst::FCMP_OLT, LHS, RHS, Name);
+ Value *CreateFCmpOLT(Value *LHS, Value *RHS, const Twine &Name = "",
+ MDNode *FPMathTag = nullptr) {
+ return CreateFCmp(FCmpInst::FCMP_OLT, LHS, RHS, Name, FPMathTag);
}
- Value *CreateFCmpOLE(Value *LHS, Value *RHS, const Twine &Name = "") {
- return CreateFCmp(FCmpInst::FCMP_OLE, LHS, RHS, Name);
+ Value *CreateFCmpOLE(Value *LHS, Value *RHS, const Twine &Name = "",
+ MDNode *FPMathTag = nullptr) {
+ return CreateFCmp(FCmpInst::FCMP_OLE, LHS, RHS, Name, FPMathTag);
}
- Value *CreateFCmpONE(Value *LHS, Value *RHS, const Twine &Name = "") {
- return CreateFCmp(FCmpInst::FCMP_ONE, LHS, RHS, Name);
+ Value *CreateFCmpONE(Value *LHS, Value *RHS, const Twine &Name = "",
+ MDNode *FPMathTag = nullptr) {
+ return CreateFCmp(FCmpInst::FCMP_ONE, LHS, RHS, Name, FPMathTag);
}
- Value *CreateFCmpORD(Value *LHS, Value *RHS, const Twine &Name = "") {
- return CreateFCmp(FCmpInst::FCMP_ORD, LHS, RHS, Name);
+ Value *CreateFCmpORD(Value *LHS, Value *RHS, const Twine &Name = "",
+ MDNode *FPMathTag = nullptr) {
+ return CreateFCmp(FCmpInst::FCMP_ORD, LHS, RHS, Name, FPMathTag);
}
- Value *CreateFCmpUNO(Value *LHS, Value *RHS, const Twine &Name = "") {
- return CreateFCmp(FCmpInst::FCMP_UNO, LHS, RHS, Name);
+ Value *CreateFCmpUNO(Value *LHS, Value *RHS, const Twine &Name = "",
+ MDNode *FPMathTag = nullptr) {
+ return CreateFCmp(FCmpInst::FCMP_UNO, LHS, RHS, Name, FPMathTag);
}
- Value *CreateFCmpUEQ(Value *LHS, Value *RHS, const Twine &Name = "") {
- return CreateFCmp(FCmpInst::FCMP_UEQ, LHS, RHS, Name);
+ Value *CreateFCmpUEQ(Value *LHS, Value *RHS, const Twine &Name = "",
+ MDNode *FPMathTag = nullptr) {
+ return CreateFCmp(FCmpInst::FCMP_UEQ, LHS, RHS, Name, FPMathTag);
}
- Value *CreateFCmpUGT(Value *LHS, Value *RHS, const Twine &Name = "") {
- return CreateFCmp(FCmpInst::FCMP_UGT, LHS, RHS, Name);
+ Value *CreateFCmpUGT(Value *LHS, Value *RHS, const Twine &Name = "",
+ MDNode *FPMathTag = nullptr) {
+ return CreateFCmp(FCmpInst::FCMP_UGT, LHS, RHS, Name, FPMathTag);
}
- Value *CreateFCmpUGE(Value *LHS, Value *RHS, const Twine &Name = "") {
- return CreateFCmp(FCmpInst::FCMP_UGE, LHS, RHS, Name);
+ Value *CreateFCmpUGE(Value *LHS, Value *RHS, const Twine &Name = "",
+ MDNode *FPMathTag = nullptr) {
+ return CreateFCmp(FCmpInst::FCMP_UGE, LHS, RHS, Name, FPMathTag);
}
- Value *CreateFCmpULT(Value *LHS, Value *RHS, const Twine &Name = "") {
- return CreateFCmp(FCmpInst::FCMP_ULT, LHS, RHS, Name);
+ Value *CreateFCmpULT(Value *LHS, Value *RHS, const Twine &Name = "",
+ MDNode *FPMathTag = nullptr) {
+ return CreateFCmp(FCmpInst::FCMP_ULT, LHS, RHS, Name, FPMathTag);
}
- Value *CreateFCmpULE(Value *LHS, Value *RHS, const Twine &Name = "") {
- return CreateFCmp(FCmpInst::FCMP_ULE, LHS, RHS, Name);
+ Value *CreateFCmpULE(Value *LHS, Value *RHS, const Twine &Name = "",
+ MDNode *FPMathTag = nullptr) {
+ return CreateFCmp(FCmpInst::FCMP_ULE, LHS, RHS, Name, FPMathTag);
}
- Value *CreateFCmpUNE(Value *LHS, Value *RHS, const Twine &Name = "") {
- return CreateFCmp(FCmpInst::FCMP_UNE, LHS, RHS, Name);
+ Value *CreateFCmpUNE(Value *LHS, Value *RHS, const Twine &Name = "",
+ MDNode *FPMathTag = nullptr) {
+ return CreateFCmp(FCmpInst::FCMP_UNE, LHS, RHS, Name, FPMathTag);
}
Value *CreateICmp(CmpInst::Predicate P, Value *LHS, Value *RHS,
@@ -1433,11 +1447,12 @@ public:
return Insert(new ICmpInst(P, LHS, RHS), Name);
}
Value *CreateFCmp(CmpInst::Predicate P, Value *LHS, Value *RHS,
- const Twine &Name = "") {
+ const Twine &Name = "", MDNode *FPMathTag = nullptr) {
if (Constant *LC = dyn_cast<Constant>(LHS))
if (Constant *RC = dyn_cast<Constant>(RHS))
return Insert(Folder.CreateFCmp(P, LC, RC), Name);
- return Insert(new FCmpInst(P, LHS, RHS), Name);
+ return Insert(AddFPMathAttributes(new FCmpInst(P, LHS, RHS),
+ FPMathTag, FMF), Name);
}
//===--------------------------------------------------------------------===//
@@ -1449,7 +1464,7 @@ public:
return Insert(PHINode::Create(Ty, NumReservedValues), Name);
}
- CallInst *CreateCall(Value *Callee, ArrayRef<Value *> Args,
+ CallInst *CreateCall(Value *Callee, ArrayRef<Value *> Args = None,
const Twine &Name = "") {
return Insert(CallInst::Create(Callee, Args), Name);
}
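The FCmp helpers above now accept the same optional fp-math metadata node as the arithmetic builders. A minimal sketch (not part of the patch), assuming A and B are float-typed Values:

    #include "llvm/IR/IRBuilder.h"
    #include "llvm/IR/MDBuilder.h"
    using namespace llvm;

    Value *buildTaggedCompare(IRBuilder<> &Builder, Value *A, Value *B) {
      // !fpmath metadata allowing 2.5 ULP of error, attached to the compare.
      MDNode *FPMath = MDBuilder(Builder.getContext()).createFPMath(2.5f);
      return Builder.CreateFCmpOLT(A, B, "cmp", FPMath);
    }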
diff --git a/include/llvm/IR/Instruction.h b/include/llvm/IR/Instruction.h
index 6e3de1f13545..31f363f70a5b 100644
--- a/include/llvm/IR/Instruction.h
+++ b/include/llvm/IR/Instruction.h
@@ -382,7 +382,7 @@ public:
///
/// Note that this does not consider malloc and alloca to have side
/// effects because the newly allocated memory is completely invisible to
- /// instructions which don't used the returned value. For cases where this
+ /// instructions which don't use the returned value. For cases where this
/// matters, isSafeToSpeculativelyExecute may be more appropriate.
bool mayHaveSideEffects() const {
return mayWriteToMemory() || mayThrow() || !mayReturn();
diff --git a/include/llvm/IR/Instructions.h b/include/llvm/IR/Instructions.h
index c5890f01ea70..07d5f111b9e1 100644
--- a/include/llvm/IR/Instructions.h
+++ b/include/llvm/IR/Instructions.h
@@ -990,10 +990,14 @@ public:
Ptr->getType()->getPointerAddressSpace());
// Vector GEP
if (Ptr->getType()->isVectorTy()) {
- unsigned NumElem = cast<VectorType>(Ptr->getType())->getNumElements();
+ unsigned NumElem = Ptr->getType()->getVectorNumElements();
return VectorType::get(PtrTy, NumElem);
}
-
+ for (Value *Index : IdxList)
+ if (Index->getType()->isVectorTy()) {
+ unsigned NumElem = Index->getType()->getVectorNumElements();
+ return VectorType::get(PtrTy, NumElem);
+ }
// Scalar GEP
return PtrTy;
}
@@ -1591,6 +1595,15 @@ public:
addAttribute(AttributeSet::FunctionIndex, Attribute::ReadOnly);
}
+ /// @brief Determine if the call can access memory only using pointers based
+ /// on its arguments.
+ bool onlyAccessesArgMemory() const {
+ return hasFnAttr(Attribute::ArgMemOnly);
+ }
+ void setOnlyAccessesArgMemory() {
+ addAttribute(AttributeSet::FunctionIndex, Attribute::ArgMemOnly);
+ }
+
/// \brief Determine if the call cannot return.
bool doesNotReturn() const { return hasFnAttr(Attribute::NoReturn); }
void setDoesNotReturn() {
@@ -3360,6 +3373,15 @@ public:
addAttribute(AttributeSet::FunctionIndex, Attribute::ReadOnly);
}
+ /// @brief Determine if the call can access memory only using its pointer
+ /// arguments.
+ bool onlyAccessesArgMemory() const {
+ return hasFnAttr(Attribute::ArgMemOnly);
+ }
+ void setOnlyAccessesArgMemory() {
+ addAttribute(AttributeSet::FunctionIndex, Attribute::ArgMemOnly);
+ }
+
/// \brief Determine if the call cannot return.
bool doesNotReturn() const { return hasFnAttr(Attribute::NoReturn); }
void setDoesNotReturn() {
diff --git a/include/llvm/IR/Intrinsics.td b/include/llvm/IR/Intrinsics.td
index e6f6d0ffe8b6..bbae720b4e12 100644
--- a/include/llvm/IR/Intrinsics.td
+++ b/include/llvm/IR/Intrinsics.td
@@ -268,15 +268,23 @@ def int_gcwrite : Intrinsic<[],
//
def int_returnaddress : Intrinsic<[llvm_ptr_ty], [llvm_i32_ty], [IntrNoMem]>;
def int_frameaddress : Intrinsic<[llvm_ptr_ty], [llvm_i32_ty], [IntrNoMem]>;
-def int_frameescape : Intrinsic<[], [llvm_vararg_ty]>;
-def int_framerecover : Intrinsic<[llvm_ptr_ty],
- [llvm_ptr_ty, llvm_ptr_ty, llvm_i32_ty],
- [IntrNoMem]>;
def int_read_register : Intrinsic<[llvm_anyint_ty], [llvm_metadata_ty],
[IntrReadMem], "llvm.read_register">;
def int_write_register : Intrinsic<[], [llvm_metadata_ty, llvm_anyint_ty],
[], "llvm.write_register">;
+// Gets the address of the local variable area. This is typically a copy of the
+// stack, frame, or base pointer depending on the type of prologue.
+def int_localaddress : Intrinsic<[llvm_ptr_ty], [], [IntrNoMem]>;
+
+// Escapes local variables to allow access from other functions.
+def int_localescape : Intrinsic<[], [llvm_vararg_ty]>;
+
+// Given a function and the localaddress of a parent frame, returns a pointer
+// to an escaped allocation indicated by the index.
+def int_localrecover : Intrinsic<[llvm_ptr_ty],
+ [llvm_ptr_ty, llvm_ptr_ty, llvm_i32_ty],
+ [IntrNoMem]>;
// Note: we treat stacksave/stackrestore as writemem because we don't otherwise
// model their dependencies on allocas.
def int_stacksave : Intrinsic<[llvm_ptr_ty]>,
@@ -362,6 +370,8 @@ let Properties = [IntrNoMem] in {
def int_rint : Intrinsic<[llvm_anyfloat_ty], [LLVMMatchType<0>]>;
def int_nearbyint : Intrinsic<[llvm_anyfloat_ty], [LLVMMatchType<0>]>;
def int_round : Intrinsic<[llvm_anyfloat_ty], [LLVMMatchType<0>]>;
+ def int_canonicalize : Intrinsic<[llvm_anyfloat_ty], [LLVMMatchType<0>],
+ [IntrNoMem]>;
}
// NOTE: these are internal interfaces.
@@ -638,3 +648,4 @@ include "llvm/IR/IntrinsicsMips.td"
include "llvm/IR/IntrinsicsAMDGPU.td"
include "llvm/IR/IntrinsicsBPF.td"
include "llvm/IR/IntrinsicsSystemZ.td"
+include "llvm/IR/IntrinsicsWebAssembly.td"
diff --git a/include/llvm/IR/IntrinsicsPowerPC.td b/include/llvm/IR/IntrinsicsPowerPC.td
index 05adc5a757be..eb8f1e6cd079 100644
--- a/include/llvm/IR/IntrinsicsPowerPC.td
+++ b/include/llvm/IR/IntrinsicsPowerPC.td
@@ -694,6 +694,18 @@ def int_ppc_vsx_xvrspip :
def int_ppc_vsx_xvrdpip :
Intrinsic<[llvm_v2f64_ty], [llvm_v2f64_ty], [IntrNoMem]>;
+// Vector reciprocal estimate
+def int_ppc_vsx_xvresp : GCCBuiltin<"__builtin_vsx_xvresp">,
+ Intrinsic<[llvm_v4f32_ty], [llvm_v4f32_ty], [IntrNoMem]>;
+def int_ppc_vsx_xvredp : GCCBuiltin<"__builtin_vsx_xvredp">,
+ Intrinsic<[llvm_v2f64_ty], [llvm_v2f64_ty], [IntrNoMem]>;
+
+// Vector rsqrte
+def int_ppc_vsx_xvrsqrtesp : GCCBuiltin<"__builtin_vsx_xvrsqrtesp">,
+ Intrinsic<[llvm_v4f32_ty], [llvm_v4f32_ty], [IntrNoMem]>;
+def int_ppc_vsx_xvrsqrtedp : GCCBuiltin<"__builtin_vsx_xvrsqrtedp">,
+ Intrinsic<[llvm_v2f64_ty], [llvm_v2f64_ty], [IntrNoMem]>;
+
// Vector compare
def int_ppc_vsx_xvcmpeqdp :
PowerPC_VSX_Intrinsic<"xvcmpeqdp", [llvm_v2i64_ty],
@@ -713,6 +725,9 @@ def int_ppc_vsx_xvcmpgtdp :
def int_ppc_vsx_xvcmpgtsp :
PowerPC_VSX_Intrinsic<"xvcmpgtsp", [llvm_v4i32_ty],
[llvm_v4f32_ty, llvm_v4f32_ty], [IntrNoMem]>;
+def int_ppc_vsx_xxleqv :
+ PowerPC_VSX_Intrinsic<"xxleqv", [llvm_v4i32_ty],
+ [llvm_v4i32_ty, llvm_v4i32_ty], [IntrNoMem]>;
}
//===----------------------------------------------------------------------===//
diff --git a/include/llvm/IR/IntrinsicsWebAssembly.td b/include/llvm/IR/IntrinsicsWebAssembly.td
new file mode 100644
index 000000000000..3ccde4742384
--- /dev/null
+++ b/include/llvm/IR/IntrinsicsWebAssembly.td
@@ -0,0 +1,16 @@
+//===- IntrinsicsWebAssembly.td - Defines wasm intrinsics --*- tablegen -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// \brief This file defines all of the WebAssembly-specific intrinsics.
+///
+//===----------------------------------------------------------------------===//
+
+let TargetPrefix = "wasm" in { // All intrinsics start with "llvm.wasm.".
+}
diff --git a/include/llvm/IR/IntrinsicsX86.td b/include/llvm/IR/IntrinsicsX86.td
index b90825db93cd..a3bc4af84308 100644
--- a/include/llvm/IR/IntrinsicsX86.td
+++ b/include/llvm/IR/IntrinsicsX86.td
@@ -28,7 +28,7 @@ let TargetPrefix = "x86" in {
def int_x86_seh_restoreframe : Intrinsic<[], [], []>;
// Given a pointer to the end of an EH registration object, returns the true
- // parent frame address that can be used with llvm.framerecover.
+ // parent frame address that can be used with llvm.localrecover.
def int_x86_seh_recoverfp : Intrinsic<[llvm_ptr_ty],
[llvm_ptr_ty, llvm_ptr_ty],
[IntrNoMem]>;
@@ -2107,6 +2107,15 @@ let TargetPrefix = "x86" in { // All intrinsics start with "llvm.x86.".
def int_x86_avx2_pmul_hr_sw : GCCBuiltin<"__builtin_ia32_pmulhrsw256">,
Intrinsic<[llvm_v16i16_ty], [llvm_v16i16_ty,
llvm_v16i16_ty], [IntrNoMem, Commutative]>;
+ def int_x86_avx512_mask_pmul_hr_sw_128 : GCCBuiltin<"__builtin_ia32_pmulhrsw128_mask">,
+ Intrinsic<[llvm_v8i16_ty], [llvm_v8i16_ty, llvm_v8i16_ty,
+ llvm_v8i16_ty, llvm_i8_ty], [IntrNoMem]>;
+ def int_x86_avx512_mask_pmul_hr_sw_256 : GCCBuiltin<"__builtin_ia32_pmulhrsw256_mask">,
+ Intrinsic<[llvm_v16i16_ty], [llvm_v16i16_ty, llvm_v16i16_ty,
+ llvm_v16i16_ty, llvm_i16_ty], [IntrNoMem]>;
+ def int_x86_avx512_mask_pmul_hr_sw_512 : GCCBuiltin<"__builtin_ia32_pmulhrsw512_mask">,
+ Intrinsic<[llvm_v32i16_ty], [llvm_v32i16_ty, llvm_v32i16_ty,
+ llvm_v32i16_ty, llvm_i32_ty], [IntrNoMem]>;
}
// Vector sign and zero extend
@@ -4466,6 +4475,24 @@ let TargetPrefix = "x86" in {
def int_x86_avx512_mask_pmull_q_512 : GCCBuiltin<"__builtin_ia32_pmullq512_mask">,
Intrinsic<[llvm_v8i64_ty], [llvm_v8i64_ty, llvm_v8i64_ty,
llvm_v8i64_ty, llvm_i8_ty], [IntrNoMem]>;
+ def int_x86_avx512_mask_pmulhu_w_512 : GCCBuiltin<"__builtin_ia32_pmulhuw512_mask">,
+ Intrinsic<[llvm_v32i16_ty], [llvm_v32i16_ty, llvm_v32i16_ty,
+ llvm_v32i16_ty, llvm_i32_ty], [IntrNoMem]>;
+ def int_x86_avx512_mask_pmulh_w_512 : GCCBuiltin<"__builtin_ia32_pmulhw512_mask">,
+ Intrinsic<[llvm_v32i16_ty], [llvm_v32i16_ty, llvm_v32i16_ty,
+ llvm_v32i16_ty, llvm_i32_ty], [IntrNoMem]>;
+ def int_x86_avx512_mask_pmulhu_w_128 : GCCBuiltin<"__builtin_ia32_pmulhuw128_mask">,
+ Intrinsic<[llvm_v8i16_ty], [llvm_v8i16_ty, llvm_v8i16_ty,
+ llvm_v8i16_ty, llvm_i8_ty], [IntrNoMem]>;
+ def int_x86_avx512_mask_pmulhu_w_256 : GCCBuiltin<"__builtin_ia32_pmulhuw256_mask">,
+ Intrinsic<[llvm_v16i16_ty], [llvm_v16i16_ty, llvm_v16i16_ty,
+ llvm_v16i16_ty, llvm_i16_ty], [IntrNoMem]>;
+ def int_x86_avx512_mask_pmulh_w_128 : GCCBuiltin<"__builtin_ia32_pmulhw128_mask">,
+ Intrinsic<[llvm_v8i16_ty], [llvm_v8i16_ty, llvm_v8i16_ty,
+ llvm_v8i16_ty, llvm_i8_ty], [IntrNoMem]>;
+ def int_x86_avx512_mask_pmulh_w_256 : GCCBuiltin<"__builtin_ia32_pmulhw256_mask">,
+ Intrinsic<[llvm_v16i16_ty], [llvm_v16i16_ty, llvm_v16i16_ty,
+ llvm_v16i16_ty, llvm_i16_ty], [IntrNoMem]>;
def int_x86_avx512_mask_pavg_b_512 : GCCBuiltin<"__builtin_ia32_pavgb512_mask">,
Intrinsic<[llvm_v64i8_ty], [llvm_v64i8_ty, llvm_v64i8_ty,
llvm_v64i8_ty, llvm_i64_ty], [IntrNoMem]>;
diff --git a/include/llvm/IR/Operator.h b/include/llvm/IR/Operator.h
index 1b9102ecc7e4..372b254ab183 100644
--- a/include/llvm/IR/Operator.h
+++ b/include/llvm/IR/Operator.h
@@ -305,7 +305,8 @@ public:
float getFPAccuracy() const;
static inline bool classof(const Instruction *I) {
- return I->getType()->isFPOrFPVectorTy();
+ return I->getType()->isFPOrFPVectorTy() ||
+ I->getOpcode() == Instruction::FCmp;
}
static inline bool classof(const Value *V) {
return isa<Instruction>(V) && classof(cast<Instruction>(V));
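
Because fcmp produces an i1 (or a vector of i1) rather than a floating-point type, the extra opcode check above is what lets it classify as an FPMathOperator at all; fast-math flags then become queryable on comparisons as well. A small sketch:

  #include "llvm/IR/Instruction.h"
  #include "llvm/IR/Operator.h"

  // True if the instruction is an FP operation (now including fcmp) that may
  // assume neither operand is a NaN.
  static bool assumesNoNaNs(const llvm::Instruction &I) {
    if (const auto *FPOp = llvm::dyn_cast<llvm::FPMathOperator>(&I))
      return FPOp->getFastMathFlags().noNaNs();
    return false;
  }
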
diff --git a/include/llvm/IR/Value.h b/include/llvm/IR/Value.h
index 484afc6d232c..17a80c82d1bc 100644
--- a/include/llvm/IR/Value.h
+++ b/include/llvm/IR/Value.h
@@ -104,8 +104,8 @@ protected:
///
/// Note, this should *NOT* be used directly by any class other than User.
/// User uses this value to find the Use list.
- static const unsigned NumUserOperandsBits = 29;
- unsigned NumUserOperands : 29;
+ enum : unsigned { NumUserOperandsBits = 29 };
+ unsigned NumUserOperands : NumUserOperandsBits;
bool IsUsedByMD : 1;
bool HasName : 1;
diff --git a/include/llvm/InitializePasses.h b/include/llvm/InitializePasses.h
index 74fbc0f94b03..e3b9a95f0a3d 100644
--- a/include/llvm/InitializePasses.h
+++ b/include/llvm/InitializePasses.h
@@ -130,6 +130,7 @@ void initializeSanitizerCoverageModulePass(PassRegistry&);
void initializeDataFlowSanitizerPass(PassRegistry&);
void initializeScalarizerPass(PassRegistry&);
void initializeEarlyCSELegacyPassPass(PassRegistry &);
+void initializeEliminateAvailableExternallyPass(PassRegistry&);
void initializeExpandISelPseudosPass(PassRegistry&);
void initializeFunctionAttrsPass(PassRegistry&);
void initializeGCMachineCodeAnalysisPass(PassRegistry&);
@@ -302,6 +303,7 @@ void initializePlaceSafepointsPass(PassRegistry&);
void initializeDwarfEHPreparePass(PassRegistry&);
void initializeFloat2IntPass(PassRegistry&);
void initializeLoopDistributePass(PassRegistry&);
+void initializeSjLjEHPreparePass(PassRegistry&);
}
#endif
diff --git a/include/llvm/LinkAllPasses.h b/include/llvm/LinkAllPasses.h
index 8ac1b212ae5f..cea5530db3b8 100644
--- a/include/llvm/LinkAllPasses.h
+++ b/include/llvm/LinkAllPasses.h
@@ -176,6 +176,7 @@ namespace {
(void) llvm::createStraightLineStrengthReducePass();
(void) llvm::createMemDerefPrinter();
(void) llvm::createFloat2IntPass();
+ (void) llvm::createEliminateAvailableExternallyPass();
(void)new llvm::IntervalPartition();
(void)new llvm::ScalarEvolution();
diff --git a/include/llvm/MC/MCContext.h b/include/llvm/MC/MCContext.h
index 52017fda189b..41169e9a12a0 100644
--- a/include/llvm/MC/MCContext.h
+++ b/include/llvm/MC/MCContext.h
@@ -273,7 +273,7 @@ namespace llvm {
/// Gets a symbol that will be defined to the final stack offset of a local
/// variable after codegen.
///
- /// \param Idx - The index of a local variable passed to @llvm.frameescape.
+ /// \param Idx - The index of a local variable passed to @llvm.localescape.
MCSymbol *getOrCreateFrameAllocSymbol(StringRef FuncName, unsigned Idx);
MCSymbol *getOrCreateParentFrameOffsetSymbol(StringRef FuncName);
diff --git a/include/llvm/MC/MCDwarf.h b/include/llvm/MC/MCDwarf.h
index c7bed8eccda9..1e72dfee4ad1 100644
--- a/include/llvm/MC/MCDwarf.h
+++ b/include/llvm/MC/MCDwarf.h
@@ -54,13 +54,13 @@ struct MCDwarfFile {
/// \brief Instances of this class represent the information from a
/// dwarf .loc directive.
class MCDwarfLoc {
- unsigned FileNum;
- unsigned Line;
- unsigned Column;
+ uint32_t FileNum;
+ uint32_t Line;
+ uint16_t Column;
// Flags (see #define's below)
- unsigned Flags;
- unsigned Isa;
- unsigned Discriminator;
+ uint8_t Flags;
+ uint8_t Isa;
+ uint32_t Discriminator;
// Flag that indicates the initial value of the is_stmt_start flag.
#define DWARF2_LINE_DEFAULT_IS_STMT 1
@@ -107,13 +107,22 @@ public:
void setLine(unsigned line) { Line = line; }
/// \brief Set the Column of this MCDwarfLoc.
- void setColumn(unsigned column) { Column = column; }
+ void setColumn(unsigned column) {
+ assert(column <= UINT16_MAX);
+ Column = column;
+ }
/// \brief Set the Flags of this MCDwarfLoc.
- void setFlags(unsigned flags) { Flags = flags; }
+ void setFlags(unsigned flags) {
+ assert(flags <= UINT8_MAX);
+ Flags = flags;
+ }
/// \brief Set the Isa of this MCDwarfLoc.
- void setIsa(unsigned isa) { Isa = isa; }
+ void setIsa(unsigned isa) {
+ assert(isa <= UINT8_MAX);
+ Isa = isa;
+ }
/// \brief Set the Discriminator of this MCDwarfLoc.
void setDiscriminator(unsigned discriminator) {
diff --git a/include/llvm/MC/MCInstrDesc.h b/include/llvm/MC/MCInstrDesc.h
index 3209a2ce0408..6a582e82d00e 100644
--- a/include/llvm/MC/MCInstrDesc.h
+++ b/include/llvm/MC/MCInstrDesc.h
@@ -154,7 +154,8 @@ public:
// A complex method to determine whether a certain instruction is deprecated
// or not, and return the reason for deprecation.
- bool (*ComplexDeprecationInfo)(MCInst &, MCSubtargetInfo &, std::string &);
+ bool (*ComplexDeprecationInfo)(MCInst &, const MCSubtargetInfo &,
+ std::string &);
/// \brief Returns the value of the specific constraint if
/// it is set. Returns -1 if it is not set.
@@ -170,7 +171,7 @@ public:
/// \brief Returns true if a certain instruction is deprecated and if so
/// returns the reason in \p Info.
- bool getDeprecatedInfo(MCInst &MI, MCSubtargetInfo &STI,
+ bool getDeprecatedInfo(MCInst &MI, const MCSubtargetInfo &STI,
std::string &Info) const;
/// \brief Return the opcode number for this descriptor.
diff --git a/include/llvm/MC/MCSchedule.h b/include/llvm/MC/MCSchedule.h
index 1adfedd2638a..c09791631056 100644
--- a/include/llvm/MC/MCSchedule.h
+++ b/include/llvm/MC/MCSchedule.h
@@ -224,25 +224,9 @@ struct MCSchedModel {
return &SchedClassTable[SchedClassIdx];
}
- // /\brief Returns a default initialized model. Used for unknown processors.
- static MCSchedModel GetDefaultSchedModel() {
- MCSchedModel Ret = { DefaultIssueWidth,
- DefaultMicroOpBufferSize,
- DefaultLoopMicroOpBufferSize,
- DefaultLoadLatency,
- DefaultHighLatency,
- DefaultMispredictPenalty,
- false,
- true,
- 0,
- nullptr,
- nullptr,
- 0,
- 0,
- nullptr
- };
- return Ret;
- }
+ /// Returns the default initialized model.
+ static const MCSchedModel &GetDefaultSchedModel() { return Default; }
+ static const MCSchedModel Default;
};
} // End llvm namespace
diff --git a/include/llvm/MC/MCSubtargetInfo.h b/include/llvm/MC/MCSubtargetInfo.h
index b8ad02fbe696..d5ad4eebf9ef 100644
--- a/include/llvm/MC/MCSubtargetInfo.h
+++ b/include/llvm/MC/MCSubtargetInfo.h
@@ -37,22 +37,26 @@ class MCSubtargetInfo {
const MCWriteProcResEntry *WriteProcResTable;
const MCWriteLatencyEntry *WriteLatencyTable;
const MCReadAdvanceEntry *ReadAdvanceTable;
- MCSchedModel CPUSchedModel;
+ const MCSchedModel *CPUSchedModel;
const InstrStage *Stages; // Instruction itinerary stages
const unsigned *OperandCycles; // Itinerary operand cycles
const unsigned *ForwardingPaths; // Forwarding paths
FeatureBitset FeatureBits; // Feature bits for current CPU + FS
+ MCSubtargetInfo() = delete;
+ MCSubtargetInfo &operator=(MCSubtargetInfo &&) = delete;
+ MCSubtargetInfo &operator=(const MCSubtargetInfo &) = delete;
+
public:
- void InitMCSubtargetInfo(const Triple &TT, StringRef CPU, StringRef FS,
- ArrayRef<SubtargetFeatureKV> PF,
- ArrayRef<SubtargetFeatureKV> PD,
- const SubtargetInfoKV *ProcSched,
- const MCWriteProcResEntry *WPR,
- const MCWriteLatencyEntry *WL,
- const MCReadAdvanceEntry *RA, const InstrStage *IS,
- const unsigned *OC, const unsigned *FP);
+ MCSubtargetInfo(const MCSubtargetInfo &) = default;
+ MCSubtargetInfo(const Triple &TT, StringRef CPU, StringRef FS,
+ ArrayRef<SubtargetFeatureKV> PF,
+ ArrayRef<SubtargetFeatureKV> PD,
+ const SubtargetInfoKV *ProcSched,
+ const MCWriteProcResEntry *WPR, const MCWriteLatencyEntry *WL,
+ const MCReadAdvanceEntry *RA, const InstrStage *IS,
+ const unsigned *OC, const unsigned *FP);
/// getTargetTriple - Return the target triple string.
const Triple &getTargetTriple() const { return TargetTriple; }
@@ -74,12 +78,16 @@ public:
FeatureBits = FeatureBits_;
}
- /// InitMCProcessorInfo - Set or change the CPU (optionally supplemented with
- /// feature string). Recompute feature bits and scheduling model.
+protected:
+ /// Initialize the scheduling model and feature bits.
+ ///
+ /// FIXME: Find a way to stick this in the constructor, since it should only
+ /// be called during initialization.
void InitMCProcessorInfo(StringRef CPU, StringRef FS);
- /// InitCPUSchedModel - Recompute scheduling model based on CPU.
- void InitCPUSchedModel(StringRef CPU);
+public:
+ /// Set the features to the default for the given CPU.
+ void setDefaultFeatures(StringRef CPU);
/// ToggleFeature - Toggle a feature and returns the re-computed feature
/// bits. This version does not change the implied bits.
@@ -99,11 +107,10 @@ public:
/// getSchedModelForCPU - Get the machine model of a CPU.
///
- MCSchedModel getSchedModelForCPU(StringRef CPU) const;
+ const MCSchedModel &getSchedModelForCPU(StringRef CPU) const;
- /// getSchedModel - Get the machine model for this subtarget's CPU.
- ///
- const MCSchedModel &getSchedModel() const { return CPUSchedModel; }
+ /// Get the machine model for this subtarget's CPU.
+ const MCSchedModel &getSchedModel() const { return *CPUSchedModel; }
/// Return an iterator at the first process resource consumed by the given
/// scheduling class.
@@ -151,7 +158,7 @@ public:
void initInstrItins(InstrItineraryData &InstrItins) const;
/// Check whether the CPU string is valid.
- bool isCPUStringValid(StringRef CPU) {
+ bool isCPUStringValid(StringRef CPU) const {
auto Found = std::find_if(ProcDesc.begin(), ProcDesc.end(),
[=](const SubtargetFeatureKV &KV) {
return CPU == KV.Key;
diff --git a/include/llvm/MC/MCSymbol.h b/include/llvm/MC/MCSymbol.h
index 17e6b857cf20..b2910dfccd63 100644
--- a/include/llvm/MC/MCSymbol.h
+++ b/include/llvm/MC/MCSymbol.h
@@ -114,12 +114,12 @@ protected:
/// The alignment is stored as log2(align) + 1. This allows all values from
/// 0 to 2^31 to be stored which is every power of 2 representable by an
/// unsigned.
- static const unsigned NumCommonAlignmentBits = 5;
+ enum : unsigned { NumCommonAlignmentBits = 5 };
unsigned CommonAlignLog2 : NumCommonAlignmentBits;
/// The Flags field is used by object file implementations to store
/// additional per symbol information which is not easily classified.
- static const unsigned NumFlagsBits = 16;
+ enum : unsigned { NumFlagsBits = 16 };
mutable uint32_t Flags : NumFlagsBits;
/// Index field, for use by the object file implementation.
diff --git a/include/llvm/MC/MCSymbolMachO.h b/include/llvm/MC/MCSymbolMachO.h
index 166ae9e755a1..5b0321fe9f73 100644
--- a/include/llvm/MC/MCSymbolMachO.h
+++ b/include/llvm/MC/MCSymbolMachO.h
@@ -7,7 +7,7 @@
//
//===----------------------------------------------------------------------===//
#ifndef LLVM_MC_MCSYMBOLMACHO_H
-#define setIsWeakExternal
+#define LLVM_MC_MCSYMBOLMACHO_H
#include "llvm/MC/MCSymbol.h"
diff --git a/include/llvm/MC/MCTargetOptions.h b/include/llvm/MC/MCTargetOptions.h
index ce28a196e974..7f4f23eda27f 100644
--- a/include/llvm/MC/MCTargetOptions.h
+++ b/include/llvm/MC/MCTargetOptions.h
@@ -55,7 +55,7 @@ inline bool operator==(const MCTargetOptions &LHS, const MCTargetOptions &RHS) {
ARE_EQUAL(ShowMCInst) &&
ARE_EQUAL(AsmVerbose) &&
ARE_EQUAL(DwarfVersion) &&
- ARE_EQUAL(ABIName));
+ ARE_EQUAL(ABIName));
#undef ARE_EQUAL
}
diff --git a/include/llvm/Object/Archive.h b/include/llvm/Object/Archive.h
index 8da6919a4655..597f0d48c118 100644
--- a/include/llvm/Object/Archive.h
+++ b/include/llvm/Object/Archive.h
@@ -94,9 +94,7 @@ public:
/// \return the size in the archive header for this member.
uint64_t getRawSize() const;
- StringRef getBuffer() const {
- return StringRef(Data.data() + StartOfFile, getSize());
- }
+ ErrorOr<StringRef> getBuffer() const;
uint64_t getChildOffset() const;
ErrorOr<MemoryBufferRef> getMemoryBufferRef() const;
@@ -183,6 +181,7 @@ public:
};
Kind kind() const { return (Kind)Format; }
+ bool isThin() const { return IsThin; }
child_iterator child_begin(bool SkipInternal = true) const;
child_iterator child_end() const;
@@ -207,6 +206,11 @@ public:
bool hasSymbolTable() const;
child_iterator getSymbolTableChild() const { return SymbolTable; }
+ StringRef getSymbolTable() const {
+ // We know that the symbol table is not an external file,
+ // so we just assert there is no error.
+ return *SymbolTable->getBuffer();
+ }
uint32_t getNumberOfSymbols() const;
private:
@@ -215,6 +219,7 @@ private:
child_iterator FirstRegular;
unsigned Format : 2;
unsigned IsThin : 1;
+ mutable std::vector<std::unique_ptr<MemoryBuffer>> ThinBuffers;
};
}
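
Because a thin-archive member can live in a separate file, getBuffer can now fail; callers unwrap an ErrorOr instead of taking the StringRef directly. Roughly (a sketch, with error handling abbreviated):

  #include "llvm/Object/Archive.h"

  using namespace llvm;
  using namespace llvm::object;

  static ErrorOr<StringRef> memberContents(const Archive::Child &C) {
    ErrorOr<StringRef> BufOrErr = C.getBuffer();
    if (std::error_code EC = BufOrErr.getError())
      return EC; // e.g. the external file backing a thin-archive member is gone
    return *BufOrErr;
  }
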
diff --git a/include/llvm/Object/ArchiveWriter.h b/include/llvm/Object/ArchiveWriter.h
index 1616e46d3e6f..3648d0c77fb5 100644
--- a/include/llvm/Object/ArchiveWriter.h
+++ b/include/llvm/Object/ArchiveWriter.h
@@ -31,7 +31,6 @@ class NewArchiveIterator {
public:
NewArchiveIterator(object::Archive::child_iterator I, StringRef Name);
NewArchiveIterator(StringRef I, StringRef Name);
- NewArchiveIterator();
bool isNewMember() const;
StringRef getName() const;
@@ -44,8 +43,7 @@ public:
std::pair<StringRef, std::error_code>
writeArchive(StringRef ArcName, std::vector<NewArchiveIterator> &NewMembers,
- bool WriteSymtab);
-
+ bool WriteSymtab, object::Archive::Kind Kind, bool Deterministic);
}
#endif
diff --git a/include/llvm/Object/COFF.h b/include/llvm/Object/COFF.h
index fc605826a8b0..025a9dbc6bc0 100644
--- a/include/llvm/Object/COFF.h
+++ b/include/llvm/Object/COFF.h
@@ -474,7 +474,7 @@ struct coff_import_header {
support::ulittle16_t OrdinalHint;
support::ulittle16_t TypeInfo;
int getType() const { return TypeInfo & 0x3; }
- int getNameType() const { return (TypeInfo & 0x7) >> 2; }
+ int getNameType() const { return (TypeInfo >> 2) & 0x7; }
};
struct coff_import_directory_table_entry {
@@ -648,9 +648,8 @@ public:
protected:
void moveSymbolNext(DataRefImpl &Symb) const override;
ErrorOr<StringRef> getSymbolName(DataRefImpl Symb) const override;
- std::error_code getSymbolAddress(DataRefImpl Symb,
- uint64_t &Res) const override;
- uint64_t getSymbolValue(DataRefImpl Symb) const override;
+ ErrorOr<uint64_t> getSymbolAddress(DataRefImpl Symb) const override;
+ uint64_t getSymbolValueImpl(DataRefImpl Symb) const override;
uint64_t getCommonSymbolSizeImpl(DataRefImpl Symb) const override;
uint32_t getSymbolFlags(DataRefImpl Symb) const override;
SymbolRef::Type getSymbolType(DataRefImpl Symb) const override;
@@ -672,7 +671,6 @@ protected:
relocation_iterator section_rel_end(DataRefImpl Sec) const override;
void moveRelocationNext(DataRefImpl &Rel) const override;
- ErrorOr<uint64_t> getRelocationAddress(DataRefImpl Rel) const override;
uint64_t getRelocationOffset(DataRefImpl Rel) const override;
symbol_iterator getRelocationSymbol(DataRefImpl Rel) const override;
uint64_t getRelocationType(DataRefImpl Rel) const override;
diff --git a/include/llvm/Object/ELF.h b/include/llvm/Object/ELF.h
index 3b0c548ffe15..cc271851e6b0 100644
--- a/include/llvm/Object/ELF.h
+++ b/include/llvm/Object/ELF.h
@@ -16,6 +16,7 @@
#include "llvm/ADT/ArrayRef.h"
#include "llvm/ADT/DenseMap.h"
+#include "llvm/ADT/IntervalMap.h"
#include "llvm/ADT/PointerIntPair.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/StringSwitch.h"
@@ -139,6 +140,7 @@ public:
typedef Elf_Verneed_Impl<ELFT> Elf_Verneed;
typedef Elf_Vernaux_Impl<ELFT> Elf_Vernaux;
typedef Elf_Versym_Impl<ELFT> Elf_Versym;
+ typedef Elf_Hash_Impl<ELFT> Elf_Hash;
typedef ELFEntityIterator<const Elf_Dyn> Elf_Dyn_Iter;
typedef iterator_range<Elf_Dyn_Iter> Elf_Dyn_Range;
typedef ELFEntityIterator<const Elf_Rela> Elf_Rela_Iter;
@@ -174,8 +176,8 @@ private:
StringRef DotShstrtab; // Section header string table.
StringRef DotStrtab; // Symbol header string table.
const Elf_Shdr *dot_symtab_sec = nullptr; // Symbol table section.
- StringRef DynSymStrTab; // Dynnamic symbol string table.
const Elf_Shdr *DotDynSymSec = nullptr; // Dynamic symbol table section.
+ const Elf_Hash *HashTable = nullptr;
const Elf_Shdr *SymbolTableSectionHeaderIndex = nullptr;
DenseMap<const Elf_Sym *, ELF::Elf64_Word> ExtendedSymbolTable;
@@ -197,6 +199,7 @@ private:
DynRegionInfo DynamicRegion;
DynRegionInfo DynHashRegion;
+ DynRegionInfo DynStrRegion;
DynRegionInfo DynRelaRegion;
// Pointer to SONAME entry in dynamic string table
@@ -229,6 +232,8 @@ private:
void LoadVersionNeeds(const Elf_Shdr *ec) const;
void LoadVersionMap() const;
+ void scanDynamicTable();
+
public:
template<typename T>
const T *getEntry(uint32_t Section, uint32_t Entry) const;
@@ -237,6 +242,7 @@ public:
const Elf_Shdr *getDotSymtabSec() const { return dot_symtab_sec; }
const Elf_Shdr *getDotDynSymSec() const { return DotDynSymSec; }
+ const Elf_Hash *getHashTable() const { return HashTable; }
ErrorOr<StringRef> getStringTable(const Elf_Shdr *Section) const;
const char *getDynamicString(uintX_t Offset) const;
@@ -578,8 +584,10 @@ ELFFile<ELFT>::ELFFile(StringRef Object, std::error_code &EC)
Header = reinterpret_cast<const Elf_Ehdr *>(base());
- if (Header->e_shoff == 0)
+ if (Header->e_shoff == 0) {
+ scanDynamicTable();
return;
+ }
const uint64_t SectionTableOffset = Header->e_shoff;
@@ -604,6 +612,13 @@ ELFFile<ELFT>::ELFFile(StringRef Object, std::error_code &EC)
for (const Elf_Shdr &Sec : sections()) {
switch (Sec.sh_type) {
+ case ELF::SHT_HASH:
+ if (HashTable) {
+ EC = object_error::parse_failed;
+ return;
+ }
+ HashTable = reinterpret_cast<const Elf_Hash *>(base() + Sec.sh_offset);
+ break;
case ELF::SHT_SYMTAB_SHNDX:
if (SymbolTableSectionHeaderIndex) {
// More than one .symtab_shndx!
@@ -640,7 +655,9 @@ ELFFile<ELFT>::ELFFile(StringRef Object, std::error_code &EC)
ErrorOr<StringRef> SymtabOrErr = getStringTable(*SectionOrErr);
if ((EC = SymtabOrErr.getError()))
return;
- DynSymStrTab = *SymtabOrErr;
+ DynStrRegion.Addr = SymtabOrErr->data();
+ DynStrRegion.Size = SymtabOrErr->size();
+ DynStrRegion.EntSize = 1;
break;
}
case ELF::SHT_DYNAMIC:
@@ -701,7 +718,23 @@ ELFFile<ELFT>::ELFFile(StringRef Object, std::error_code &EC)
}
}
- // Scan program headers.
+ scanDynamicTable();
+
+ EC = std::error_code();
+}
+
+template <class ELFT>
+void ELFFile<ELFT>::scanDynamicTable() {
+ // Build load-address to file-offset map.
+ typedef IntervalMap<
+ uintX_t, uintptr_t,
+ IntervalMapImpl::NodeSizer<uintX_t, uintptr_t>::LeafSize,
+ IntervalMapHalfOpenInfo<uintX_t>> LoadMapT;
+ typename LoadMapT::Allocator Alloc;
+ // Allocate the IntervalMap on the heap to work around MSVC bug where the
+ // stack doesn't get realigned despite LoadMap having alignment 8 (PR24113).
+ std::unique_ptr<LoadMapT> LoadMap(new LoadMapT(Alloc));
+
for (Elf_Phdr_Iter PhdrI = program_header_begin(),
PhdrE = program_header_end();
PhdrI != PhdrE; ++PhdrI) {
@@ -709,34 +742,44 @@ ELFFile<ELFT>::ELFFile(StringRef Object, std::error_code &EC)
DynamicRegion.Addr = base() + PhdrI->p_offset;
DynamicRegion.Size = PhdrI->p_filesz;
DynamicRegion.EntSize = sizeof(Elf_Dyn);
- break;
+ continue;
}
+ if (PhdrI->p_type != ELF::PT_LOAD)
+ continue;
+ if (PhdrI->p_filesz == 0)
+ continue;
+ LoadMap->insert(PhdrI->p_vaddr, PhdrI->p_vaddr + PhdrI->p_filesz,
+ PhdrI->p_offset);
}
- // Scan dynamic table.
+ auto toMappedAddr = [&](uint64_t VAddr) -> const uint8_t * {
+ auto I = LoadMap->find(VAddr);
+ if (I == LoadMap->end())
+ return nullptr;
+ return this->base() + I.value() + (VAddr - I.start());
+ };
+
for (Elf_Dyn_Iter DynI = dynamic_table_begin(), DynE = dynamic_table_end();
DynI != DynE; ++DynI) {
switch (DynI->d_tag) {
- case ELF::DT_RELA: {
- uint64_t VBase = 0;
- const uint8_t *FBase = nullptr;
- for (Elf_Phdr_Iter PhdrI = program_header_begin(),
- PhdrE = program_header_end();
- PhdrI != PhdrE; ++PhdrI) {
- if (PhdrI->p_type != ELF::PT_LOAD)
- continue;
- if (DynI->getPtr() >= PhdrI->p_vaddr &&
- DynI->getPtr() < PhdrI->p_vaddr + PhdrI->p_memsz) {
- VBase = PhdrI->p_vaddr;
- FBase = base() + PhdrI->p_offset;
- break;
- }
- }
- if (!VBase)
- return;
- DynRelaRegion.Addr = FBase + DynI->getPtr() - VBase;
+ case ELF::DT_HASH:
+ if (HashTable)
+ continue;
+ HashTable =
+ reinterpret_cast<const Elf_Hash *>(toMappedAddr(DynI->getPtr()));
+ break;
+ case ELF::DT_STRTAB:
+ if (!DynStrRegion.Addr)
+ DynStrRegion.Addr = toMappedAddr(DynI->getPtr());
+ break;
+ case ELF::DT_STRSZ:
+ if (!DynStrRegion.Size)
+ DynStrRegion.Size = DynI->getVal();
+ break;
+ case ELF::DT_RELA:
+ if (!DynRelaRegion.Addr)
+ DynRelaRegion.Addr = toMappedAddr(DynI->getPtr());
break;
- }
case ELF::DT_RELASZ:
DynRelaRegion.Size = DynI->getVal();
break;
@@ -744,8 +787,6 @@ ELFFile<ELFT>::ELFFile(StringRef Object, std::error_code &EC)
DynRelaRegion.EntSize = DynI->getVal();
}
}
-
- EC = std::error_code();
}
template <class ELFT>
@@ -868,9 +909,9 @@ ELFFile<ELFT>::getStringTable(const Elf_Shdr *Section) const {
template <class ELFT>
const char *ELFFile<ELFT>::getDynamicString(uintX_t Offset) const {
- if (!DotDynSymSec || Offset >= DynSymStrTab.size())
+ if (Offset >= DynStrRegion.Size)
return nullptr;
- return (const char *)DynSymStrTab.begin() + Offset;
+ return (const char *)DynStrRegion.Addr + Offset;
}
template <class ELFT>
@@ -983,7 +1024,7 @@ ErrorOr<StringRef> ELFFile<ELFT>::getSymbolVersion(const Elf_Shdr *section,
IsDefault = false;
}
- if (name_offset >= DynSymStrTab.size())
+ if (name_offset >= DynStrRegion.Size)
return object_error::parse_failed;
return StringRef(getDynamicString(name_offset));
}
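
scanDynamicTable above is built around an IntervalMap from PT_LOAD virtual-address ranges to file offsets, through which dynamic-table pointers are translated. The same pattern in isolation, with invented segment values (a sketch, not part of the patch):

  #include "llvm/ADT/IntervalMap.h"
  #include <cstdint>

  using namespace llvm;

  // Map half-open [vaddr, vaddr + filesz) ranges to file offsets, then
  // translate one virtual address into a file offset.
  static uint64_t toFileOffset(uint64_t VAddr) {
    typedef IntervalMap<
        uint64_t, uint64_t,
        IntervalMapImpl::NodeSizer<uint64_t, uint64_t>::LeafSize,
        IntervalMapHalfOpenInfo<uint64_t>> LoadMapT;
    LoadMapT::Allocator Alloc;
    LoadMapT LoadMap(Alloc);

    LoadMap.insert(0x400000, 0x401000, 0x0);    // hypothetical segment 1
    LoadMap.insert(0x600000, 0x600800, 0x1000); // hypothetical segment 2

    auto I = LoadMap.find(VAddr);
    if (I == LoadMap.end())
      return ~0ULL; // address is not covered by any segment
    return I.value() + (VAddr - I.start());
  }
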
diff --git a/include/llvm/Object/ELFObjectFile.h b/include/llvm/Object/ELFObjectFile.h
index 5b9b113a2f0b..6e8ace427a20 100644
--- a/include/llvm/Object/ELFObjectFile.h
+++ b/include/llvm/Object/ELFObjectFile.h
@@ -196,9 +196,8 @@ protected:
void moveSymbolNext(DataRefImpl &Symb) const override;
ErrorOr<StringRef> getSymbolName(DataRefImpl Symb) const override;
- std::error_code getSymbolAddress(DataRefImpl Symb,
- uint64_t &Res) const override;
- uint64_t getSymbolValue(DataRefImpl Symb) const override;
+ ErrorOr<uint64_t> getSymbolAddress(DataRefImpl Symb) const override;
+ uint64_t getSymbolValueImpl(DataRefImpl Symb) const override;
uint32_t getSymbolAlignment(DataRefImpl Symb) const override;
uint64_t getCommonSymbolSizeImpl(DataRefImpl Symb) const override;
uint32_t getSymbolFlags(DataRefImpl Symb) const override;
@@ -226,7 +225,6 @@ protected:
section_iterator getRelocatedSection(DataRefImpl Sec) const override;
void moveRelocationNext(DataRefImpl &Rel) const override;
- ErrorOr<uint64_t> getRelocationAddress(DataRefImpl Rel) const override;
uint64_t getRelocationOffset(DataRefImpl Rel) const override;
symbol_iterator getRelocationSymbol(DataRefImpl Rel) const override;
uint64_t getRelocationType(DataRefImpl Rel) const override;
@@ -235,7 +233,6 @@ protected:
uint32_t getSectionType(DataRefImpl Sec) const override;
uint64_t getSectionFlags(DataRefImpl Sec) const override;
- uint64_t getROffset(DataRefImpl Rel) const;
StringRef getRelocationTypeName(uint32_t Type) const;
/// \brief Get the relocation section that contains \a Rel.
@@ -276,11 +273,6 @@ protected:
return DRI;
}
- Elf_Dyn_Iter toELFDynIter(DataRefImpl Dyn) const {
- return Elf_Dyn_Iter(EF.begin_dynamic_table().getEntSize(),
- reinterpret_cast<const char *>(Dyn.p));
- }
-
DataRefImpl toDRI(Elf_Dyn_Iter Dyn) const {
DataRefImpl DRI;
DRI.p = reinterpret_cast<uintptr_t>(Dyn.get());
@@ -378,19 +370,13 @@ uint32_t ELFObjectFile<ELFT>::getSectionType(DataRefImpl Sec) const {
}
template <class ELFT>
-uint64_t ELFObjectFile<ELFT>::getSymbolValue(DataRefImpl Symb) const {
+uint64_t ELFObjectFile<ELFT>::getSymbolValueImpl(DataRefImpl Symb) const {
const Elf_Sym *ESym = getSymbol(Symb);
- switch (ESym->st_shndx) {
- case ELF::SHN_COMMON:
- case ELF::SHN_UNDEF:
- return UnknownAddress;
- case ELF::SHN_ABS:
- return ESym->st_value;
- }
-
- const Elf_Ehdr *Header = EF.getHeader();
uint64_t Ret = ESym->st_value;
+ if (ESym->st_shndx == ELF::SHN_ABS)
+ return Ret;
+ const Elf_Ehdr *Header = EF.getHeader();
// Clear the ARM/Thumb or microMIPS indicator flag.
if ((Header->e_machine == ELF::EM_ARM || Header->e_machine == ELF::EM_MIPS) &&
ESym->getType() == ELF::STT_FUNC)
@@ -400,15 +386,15 @@ uint64_t ELFObjectFile<ELFT>::getSymbolValue(DataRefImpl Symb) const {
}
template <class ELFT>
-std::error_code ELFObjectFile<ELFT>::getSymbolAddress(DataRefImpl Symb,
- uint64_t &Result) const {
- Result = getSymbolValue(Symb);
+ErrorOr<uint64_t>
+ELFObjectFile<ELFT>::getSymbolAddress(DataRefImpl Symb) const {
+ uint64_t Result = getSymbolValue(Symb);
const Elf_Sym *ESym = getSymbol(Symb);
switch (ESym->st_shndx) {
case ELF::SHN_COMMON:
case ELF::SHN_UNDEF:
case ELF::SHN_ABS:
- return std::error_code();
+ return Result;
}
const Elf_Ehdr *Header = EF.getHeader();
@@ -422,7 +408,7 @@ std::error_code ELFObjectFile<ELFT>::getSymbolAddress(DataRefImpl Symb,
Result += Section->sh_addr;
}
- return std::error_code();
+ return Result;
}
template <class ELFT>
@@ -689,31 +675,9 @@ ELFObjectFile<ELFT>::getRelocationSymbol(DataRefImpl Rel) const {
}
template <class ELFT>
-ErrorOr<uint64_t>
-ELFObjectFile<ELFT>::getRelocationAddress(DataRefImpl Rel) const {
- uint64_t ROffset = getROffset(Rel);
- const Elf_Ehdr *Header = EF.getHeader();
-
- if (Header->e_type == ELF::ET_REL) {
- const Elf_Shdr *RelocationSec = getRelSection(Rel);
- ErrorOr<const Elf_Shdr *> RelocatedSec =
- EF.getSection(RelocationSec->sh_info);
- if (std::error_code EC = RelocatedSec.getError())
- return EC;
- return ROffset + (*RelocatedSec)->sh_addr;
- }
- return ROffset;
-}
-
-template <class ELFT>
uint64_t ELFObjectFile<ELFT>::getRelocationOffset(DataRefImpl Rel) const {
assert(EF.getHeader()->e_type == ELF::ET_REL &&
"Only relocatable object files have relocation offsets");
- return getROffset(Rel);
-}
-
-template <class ELFT>
-uint64_t ELFObjectFile<ELFT>::getROffset(DataRefImpl Rel) const {
const Elf_Shdr *sec = getRelSection(Rel);
if (sec->sh_type == ELF::SHT_REL)
return getRel(Rel)->r_offset;
diff --git a/include/llvm/Object/ELFTypes.h b/include/llvm/Object/ELFTypes.h
index 63e13909ae5c..27e987ba2852 100644
--- a/include/llvm/Object/ELFTypes.h
+++ b/include/llvm/Object/ELFTypes.h
@@ -10,6 +10,7 @@
#ifndef LLVM_OBJECT_ELFTYPES_H
#define LLVM_OBJECT_ELFTYPES_H
+#include "llvm/ADT/ArrayRef.h"
#include "llvm/Object/Error.h"
#include "llvm/Support/DataTypes.h"
#include "llvm/Support/ELF.h"
@@ -463,6 +464,23 @@ struct Elf_Phdr_Impl<ELFType<TargetEndianness, true>> {
Elf_Xword p_align; // Segment alignment constraint
};
+// ELFT needed for endianness.
+template <class ELFT>
+struct Elf_Hash_Impl {
+ LLVM_ELF_IMPORT_TYPES_ELFT(ELFT)
+ Elf_Word nbucket;
+ Elf_Word nchain;
+
+ ArrayRef<Elf_Word> buckets() const {
+ return ArrayRef<Elf_Word>(&nbucket + 2, &nbucket + 2 + nbucket);
+ }
+
+ ArrayRef<Elf_Word> chains() const {
+ return ArrayRef<Elf_Word>(&nbucket + 2 + nbucket,
+ &nbucket + 2 + nbucket + nchain);
+ }
+};
+
// MIPS .reginfo section
template <class ELFT>
struct Elf_Mips_RegInfo;
diff --git a/include/llvm/Object/ELFYAML.h b/include/llvm/Object/ELFYAML.h
index b45507920a9c..df0aa500c8a2 100644
--- a/include/llvm/Object/ELFYAML.h
+++ b/include/llvm/Object/ELFYAML.h
@@ -85,7 +85,13 @@ struct SectionOrType {
};
struct Section {
- enum class SectionKind { Group, RawContent, Relocation, MipsABIFlags };
+ enum class SectionKind {
+ Group,
+ RawContent,
+ Relocation,
+ NoBits,
+ MipsABIFlags
+ };
SectionKind Kind;
StringRef Name;
ELF_SHT Type;
@@ -106,6 +112,14 @@ struct RawContentSection : Section {
}
};
+struct NoBitsSection : Section {
+ llvm::yaml::Hex64 Size;
+ NoBitsSection() : Section(SectionKind::NoBits) {}
+ static bool classof(const Section *S) {
+ return S->Kind == SectionKind::NoBits;
+ }
+};
+
struct Group : Section {
// Members of a group contain a flag and a list of section indices
// that are part of the group.
diff --git a/include/llvm/Object/MachO.h b/include/llvm/Object/MachO.h
index f4edfd057303..489ecef5c996 100644
--- a/include/llvm/Object/MachO.h
+++ b/include/llvm/Object/MachO.h
@@ -205,9 +205,7 @@ public:
std::error_code getIndirectName(DataRefImpl Symb, StringRef &Res) const;
unsigned getSectionType(SectionRef Sec) const;
- std::error_code getSymbolAddress(DataRefImpl Symb,
- uint64_t &Res) const override;
- uint64_t getSymbolValue(DataRefImpl Symb) const override;
+ ErrorOr<uint64_t> getSymbolAddress(DataRefImpl Symb) const override;
uint32_t getSymbolAlignment(DataRefImpl Symb) const override;
uint64_t getCommonSymbolSizeImpl(DataRefImpl Symb) const override;
SymbolRef::Type getSymbolType(DataRefImpl Symb) const override;
@@ -233,7 +231,6 @@ public:
relocation_iterator section_rel_end(DataRefImpl Sec) const override;
void moveRelocationNext(DataRefImpl &Rel) const override;
- ErrorOr<uint64_t> getRelocationAddress(DataRefImpl Rel) const override;
uint64_t getRelocationOffset(DataRefImpl Rel) const override;
symbol_iterator getRelocationSymbol(DataRefImpl Rel) const override;
section_iterator getRelocationSection(DataRefImpl Rel) const;
@@ -245,6 +242,8 @@ public:
// MachO specific.
std::error_code getLibraryShortNameByIndex(unsigned Index, StringRef &) const;
+ section_iterator getRelocationRelocatedSection(relocation_iterator Rel) const;
+
// TODO: Would be useful to have an iterator based version
// of the load command interface too.
@@ -425,6 +424,8 @@ public:
}
private:
+ uint64_t getSymbolValueImpl(DataRefImpl Symb) const override;
+
union {
MachO::mach_header_64 Header64;
MachO::mach_header Header;
diff --git a/include/llvm/Object/ObjectFile.h b/include/llvm/Object/ObjectFile.h
index 62eab1066be5..8dd525626218 100644
--- a/include/llvm/Object/ObjectFile.h
+++ b/include/llvm/Object/ObjectFile.h
@@ -50,7 +50,6 @@ public:
void moveNext();
- ErrorOr<uint64_t> getAddress() const;
uint64_t getOffset() const;
symbol_iterator getSymbol() const;
uint64_t getType() const;
@@ -135,7 +134,7 @@ public:
ErrorOr<StringRef> getName() const;
/// Returns the symbol virtual address (i.e. address at which it will be
/// mapped).
- std::error_code getAddress(uint64_t &Result) const;
+ ErrorOr<uint64_t> getAddress() const;
/// Return the value of the symbol; depending on the object, this can be an
/// offset or a virtual address.
@@ -198,9 +197,8 @@ protected:
virtual ErrorOr<StringRef> getSymbolName(DataRefImpl Symb) const = 0;
std::error_code printSymbolName(raw_ostream &OS,
DataRefImpl Symb) const override;
- virtual std::error_code getSymbolAddress(DataRefImpl Symb,
- uint64_t &Res) const = 0;
- virtual uint64_t getSymbolValue(DataRefImpl Symb) const = 0;
+ virtual ErrorOr<uint64_t> getSymbolAddress(DataRefImpl Symb) const = 0;
+ virtual uint64_t getSymbolValueImpl(DataRefImpl Symb) const = 0;
virtual uint32_t getSymbolAlignment(DataRefImpl Symb) const;
virtual uint64_t getCommonSymbolSizeImpl(DataRefImpl Symb) const = 0;
virtual SymbolRef::Type getSymbolType(DataRefImpl Symb) const = 0;
@@ -229,13 +227,14 @@ protected:
// Same as above for RelocationRef.
friend class RelocationRef;
virtual void moveRelocationNext(DataRefImpl &Rel) const = 0;
- virtual ErrorOr<uint64_t> getRelocationAddress(DataRefImpl Rel) const = 0;
virtual uint64_t getRelocationOffset(DataRefImpl Rel) const = 0;
virtual symbol_iterator getRelocationSymbol(DataRefImpl Rel) const = 0;
virtual uint64_t getRelocationType(DataRefImpl Rel) const = 0;
virtual void getRelocationTypeName(DataRefImpl Rel,
SmallVectorImpl<char> &Result) const = 0;
+ uint64_t getSymbolValue(DataRefImpl Symb) const;
+
public:
uint64_t getCommonSymbolSize(DataRefImpl Symb) const {
assert(getSymbolFlags(Symb) & SymbolRef::SF_Common);
@@ -308,8 +307,8 @@ inline ErrorOr<StringRef> SymbolRef::getName() const {
return getObject()->getSymbolName(getRawDataRefImpl());
}
-inline std::error_code SymbolRef::getAddress(uint64_t &Result) const {
- return getObject()->getSymbolAddress(getRawDataRefImpl(), Result);
+inline ErrorOr<uint64_t> SymbolRef::getAddress() const {
+ return getObject()->getSymbolAddress(getRawDataRefImpl());
}
inline uint64_t SymbolRef::getValue() const {
@@ -430,10 +429,6 @@ inline void RelocationRef::moveNext() {
return OwningObject->moveRelocationNext(RelocationPimpl);
}
-inline ErrorOr<uint64_t> RelocationRef::getAddress() const {
- return OwningObject->getRelocationAddress(RelocationPimpl);
-}
-
inline uint64_t RelocationRef::getOffset() const {
return OwningObject->getRelocationOffset(RelocationPimpl);
}
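
Callers that used to pass a uint64_t out-parameter to SymbolRef::getAddress now unwrap an ErrorOr; a typical symbol walk looks roughly like this (a sketch):

  #include "llvm/Object/ObjectFile.h"
  #include "llvm/Support/Format.h"
  #include "llvm/Support/raw_ostream.h"

  using namespace llvm;
  using namespace llvm::object;

  // Print the address of every symbol whose address can be computed.
  static void dumpSymbolAddresses(const ObjectFile &Obj, raw_ostream &OS) {
    for (const SymbolRef &Sym : Obj.symbols()) {
      ErrorOr<uint64_t> AddrOrErr = Sym.getAddress();
      if (std::error_code EC = AddrOrErr.getError()) {
        (void)EC; // skip (or report) symbols with no computable address
        continue;
      }
      OS << format_hex(*AddrOrErr, 18) << "\n";
    }
  }
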
diff --git a/include/llvm/Object/RelocVisitor.h b/include/llvm/Object/RelocVisitor.h
index 950e2ed0e338..d5e4258cb0a7 100644
--- a/include/llvm/Object/RelocVisitor.h
+++ b/include/llvm/Object/RelocVisitor.h
@@ -100,9 +100,9 @@ private:
case Triple::mips64:
switch (RelocType) {
case llvm::ELF::R_MIPS_32:
- return visitELF_MIPS_32(R, Value);
+ return visitELF_MIPS64_32(R, Value);
case llvm::ELF::R_MIPS_64:
- return visitELF_MIPS_64(R, Value);
+ return visitELF_MIPS64_64(R, Value);
default:
HasError = true;
return RelocToApply();
@@ -313,11 +313,18 @@ private:
/// MIPS ELF
RelocToApply visitELF_MIPS_32(RelocationRef R, uint64_t Value) {
- uint32_t Res = (Value)&0xFFFFFFFF;
+ uint32_t Res = Value & 0xFFFFFFFF;
+ return RelocToApply(Res, 4);
+ }
+
+ /// MIPS64 ELF
+ RelocToApply visitELF_MIPS64_32(RelocationRef R, uint64_t Value) {
+ int64_t Addend = getELFAddend(R);
+ uint32_t Res = (Value + Addend) & 0xFFFFFFFF;
return RelocToApply(Res, 4);
}
- RelocToApply visitELF_MIPS_64(RelocationRef R, uint64_t Value) {
+ RelocToApply visitELF_MIPS64_64(RelocationRef R, uint64_t Value) {
int64_t Addend = getELFAddend(R);
uint64_t Res = (Value + Addend);
return RelocToApply(Res, 8);
diff --git a/include/llvm/Object/SymbolicFile.h b/include/llvm/Object/SymbolicFile.h
index 3a3823159c92..537997ac6318 100644
--- a/include/llvm/Object/SymbolicFile.h
+++ b/include/llvm/Object/SymbolicFile.h
@@ -115,8 +115,6 @@ public:
typedef content_iterator<BasicSymbolRef> basic_symbol_iterator;
-const uint64_t UnknownAddress = ~0ULL;
-
class SymbolicFile : public Binary {
public:
~SymbolicFile() override;
diff --git a/include/llvm/Support/COFF.h b/include/llvm/Support/COFF.h
index b26af61a7c70..3c5ee06969d0 100644
--- a/include/llvm/Support/COFF.h
+++ b/include/llvm/Support/COFF.h
@@ -655,6 +655,7 @@ namespace COFF {
};
enum CodeViewIdentifiers {
+ DEBUG_LINE_TABLES_HAVE_COLUMN_RECORDS = 0x1,
DEBUG_SECTION_MAGIC = 0x4,
DEBUG_SYMBOL_SUBSECTION = 0xF1,
DEBUG_LINE_TABLE_SUBSECTION = 0xF2,
diff --git a/include/llvm/Support/CommandLine.h b/include/llvm/Support/CommandLine.h
index ed809211ea97..379d06a65741 100644
--- a/include/llvm/Support/CommandLine.h
+++ b/include/llvm/Support/CommandLine.h
@@ -790,7 +790,7 @@ public:
void anchor() override;
};
-EXTERN_TEMPLATE_INSTANTIATION(class basic_parser<bool>);
+extern template class basic_parser<bool>;
//--------------------------------------------------
// parser<boolOrDefault>
@@ -816,7 +816,7 @@ public:
void anchor() override;
};
-EXTERN_TEMPLATE_INSTANTIATION(class basic_parser<boolOrDefault>);
+extern template class basic_parser<boolOrDefault>;
//--------------------------------------------------
// parser<int>
@@ -838,7 +838,7 @@ public:
void anchor() override;
};
-EXTERN_TEMPLATE_INSTANTIATION(class basic_parser<int>);
+extern template class basic_parser<int>;
//--------------------------------------------------
// parser<unsigned>
@@ -860,7 +860,7 @@ public:
void anchor() override;
};
-EXTERN_TEMPLATE_INSTANTIATION(class basic_parser<unsigned>);
+extern template class basic_parser<unsigned>;
//--------------------------------------------------
// parser<unsigned long long>
@@ -885,7 +885,7 @@ public:
void anchor() override;
};
-EXTERN_TEMPLATE_INSTANTIATION(class basic_parser<unsigned long long>);
+extern template class basic_parser<unsigned long long>;
//--------------------------------------------------
// parser<double>
@@ -907,7 +907,7 @@ public:
void anchor() override;
};
-EXTERN_TEMPLATE_INSTANTIATION(class basic_parser<double>);
+extern template class basic_parser<double>;
//--------------------------------------------------
// parser<float>
@@ -929,7 +929,7 @@ public:
void anchor() override;
};
-EXTERN_TEMPLATE_INSTANTIATION(class basic_parser<float>);
+extern template class basic_parser<float>;
//--------------------------------------------------
// parser<std::string>
@@ -954,7 +954,7 @@ public:
void anchor() override;
};
-EXTERN_TEMPLATE_INSTANTIATION(class basic_parser<std::string>);
+extern template class basic_parser<std::string>;
//--------------------------------------------------
// parser<char>
@@ -979,7 +979,7 @@ public:
void anchor() override;
};
-EXTERN_TEMPLATE_INSTANTIATION(class basic_parser<char>);
+extern template class basic_parser<char>;
//--------------------------------------------------
// PrintOptionDiff
@@ -1254,11 +1254,11 @@ public:
}
};
-EXTERN_TEMPLATE_INSTANTIATION(class opt<unsigned>);
-EXTERN_TEMPLATE_INSTANTIATION(class opt<int>);
-EXTERN_TEMPLATE_INSTANTIATION(class opt<std::string>);
-EXTERN_TEMPLATE_INSTANTIATION(class opt<char>);
-EXTERN_TEMPLATE_INSTANTIATION(class opt<bool>);
+extern template class opt<unsigned>;
+extern template class opt<int>;
+extern template class opt<std::string>;
+extern template class opt<char>;
+extern template class opt<bool>;
//===----------------------------------------------------------------------===//
// list_storage class
diff --git a/include/llvm/Support/Compiler.h b/include/llvm/Support/Compiler.h
index 67ef23d43c99..141639839cc2 100644
--- a/include/llvm/Support/Compiler.h
+++ b/include/llvm/Support/Compiler.h
@@ -174,19 +174,6 @@
#define LLVM_UNLIKELY(EXPR) (EXPR)
#endif
-// C++ doesn't support 'extern template' of template specializations. GCC does,
-// but requires __extension__ before it. In the header, use this:
-// EXTERN_TEMPLATE_INSTANTIATION(class foo<bar>);
-// in the .cpp file, use this:
-// TEMPLATE_INSTANTIATION(class foo<bar>);
-#ifdef __GNUC__
-#define EXTERN_TEMPLATE_INSTANTIATION(X) __extension__ extern template X
-#define TEMPLATE_INSTANTIATION(X) template X
-#else
-#define EXTERN_TEMPLATE_INSTANTIATION(X)
-#define TEMPLATE_INSTANTIATION(X)
-#endif
-
/// LLVM_ATTRIBUTE_NOINLINE - On compilers where we have a directive to do so,
/// mark a method "not for inlining".
#if __has_attribute(noinline) || LLVM_GNUC_PREREQ(3, 4, 0)
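
The removed EXTERN_TEMPLATE_INSTANTIATION/TEMPLATE_INSTANTIATION macros were just the pre-C++11 spelling of explicit instantiation declarations and definitions; in plain C++11 the header/source split looks like this (a generic sketch, not LLVM code):

  // widget.h
  template <typename T> class Widget {
  public:
    T value() const { return T(); }
  };
  // Tell every includer not to instantiate Widget<int> implicitly.
  extern template class Widget<int>;

  // widget.cpp
  // The single explicit instantiation the rest of the program links against.
  template class Widget<int>;
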
diff --git a/include/llvm/Support/OnDiskHashTable.h b/include/llvm/Support/OnDiskHashTable.h
index 0f097f287286..08e277ad5ce1 100644
--- a/include/llvm/Support/OnDiskHashTable.h
+++ b/include/llvm/Support/OnDiskHashTable.h
@@ -280,13 +280,19 @@ public:
};
/// \brief Look up the stored data for a particular key.
- iterator find(const external_key_type &EKey, Info *InfoPtr = 0) {
- if (!InfoPtr)
- InfoPtr = &InfoObj;
-
- using namespace llvm::support;
+ iterator find(const external_key_type &EKey, Info *InfoPtr = nullptr) {
const internal_key_type &IKey = InfoObj.GetInternalKey(EKey);
hash_value_type KeyHash = InfoObj.ComputeHash(IKey);
+ return find_hashed(IKey, KeyHash, InfoPtr);
+ }
+
+ /// \brief Look up the stored data for a particular key with a known hash.
+ iterator find_hashed(const internal_key_type &IKey, hash_value_type KeyHash,
+ Info *InfoPtr = nullptr) {
+ using namespace llvm::support;
+
+ if (!InfoPtr)
+ InfoPtr = &InfoObj;
// Each bucket is just an offset into the hash table file.
offset_type Idx = KeyHash & (NumBuckets - 1);
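
find_hashed lets a caller that already holds the hash (for instance, one stored on disk next to the key) avoid recomputing it. A loose sketch, where TableT stands in for a concrete on-disk hash table instantiation and InfoT for its trait class (both are assumptions, mirroring the interface above):

  #include "llvm/Support/OnDiskHashTable.h"

  // Hash the external key once and reuse the hash for the lookup.
  template <typename TableT, typename InfoT, typename ExternalKeyT>
  static typename TableT::iterator
  lookupOnce(TableT &Table, InfoT &Info, const ExternalKeyT &EKey) {
    auto IKey = Info.GetInternalKey(EKey);
    auto KeyHash = Info.ComputeHash(IKey);          // computed once ...
    return Table.find_hashed(IKey, KeyHash, &Info); // ... reused here
  }
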
diff --git a/include/llvm/Support/TargetRegistry.h b/include/llvm/Support/TargetRegistry.h
index d2e8b95d74f3..40bf6fb20c9f 100644
--- a/include/llvm/Support/TargetRegistry.h
+++ b/include/llvm/Support/TargetRegistry.h
@@ -71,7 +71,7 @@ MCStreamer *createMachOStreamer(MCContext &Ctx, MCAsmBackend &TAB,
MCRelocationInfo *createMCRelocationInfo(const Triple &TT, MCContext &Ctx);
-MCSymbolizer *createMCSymbolizer(StringRef TT, LLVMOpInfoCallback GetOpInfo,
+MCSymbolizer *createMCSymbolizer(const Triple &TT, LLVMOpInfoCallback GetOpInfo,
LLVMSymbolLookupCallback SymbolLookUp,
void *DisInfo, MCContext *Ctx,
std::unique_ptr<MCRelocationInfo> &&RelInfo);
@@ -92,17 +92,18 @@ public:
typedef MCAsmInfo *(*MCAsmInfoCtorFnTy)(const MCRegisterInfo &MRI,
const Triple &TT);
- typedef MCCodeGenInfo *(*MCCodeGenInfoCtorFnTy)(StringRef TT, Reloc::Model RM,
+ typedef MCCodeGenInfo *(*MCCodeGenInfoCtorFnTy)(const Triple &TT,
+ Reloc::Model RM,
CodeModel::Model CM,
CodeGenOpt::Level OL);
typedef MCInstrInfo *(*MCInstrInfoCtorFnTy)(void);
typedef MCInstrAnalysis *(*MCInstrAnalysisCtorFnTy)(const MCInstrInfo *Info);
- typedef MCRegisterInfo *(*MCRegInfoCtorFnTy)(StringRef TT);
+ typedef MCRegisterInfo *(*MCRegInfoCtorFnTy)(const Triple &TT);
typedef MCSubtargetInfo *(*MCSubtargetInfoCtorFnTy)(const Triple &TT,
StringRef CPU,
StringRef Features);
typedef TargetMachine *(*TargetMachineCtorTy)(
- const Target &T, StringRef TT, StringRef CPU, StringRef Features,
+ const Target &T, const Triple &TT, StringRef CPU, StringRef Features,
const TargetOptions &Options, Reloc::Model RM, CodeModel::Model CM,
CodeGenOpt::Level OL);
// If it weren't for layering issues (this header is in llvm/Support, but
@@ -150,7 +151,7 @@ public:
typedef MCRelocationInfo *(*MCRelocationInfoCtorTy)(const Triple &TT,
MCContext &Ctx);
typedef MCSymbolizer *(*MCSymbolizerCtorTy)(
- StringRef TT, LLVMOpInfoCallback GetOpInfo,
+ const Triple &TT, LLVMOpInfoCallback GetOpInfo,
LLVMSymbolLookupCallback SymbolLookUp, void *DisInfo, MCContext *Ctx,
std::unique_ptr<MCRelocationInfo> &&RelInfo);
@@ -300,12 +301,12 @@ public:
/// createMCCodeGenInfo - Create a MCCodeGenInfo implementation.
///
- MCCodeGenInfo *createMCCodeGenInfo(StringRef Triple, Reloc::Model RM,
+ MCCodeGenInfo *createMCCodeGenInfo(StringRef TT, Reloc::Model RM,
CodeModel::Model CM,
CodeGenOpt::Level OL) const {
if (!MCCodeGenInfoCtorFn)
return nullptr;
- return MCCodeGenInfoCtorFn(Triple, RM, CM, OL);
+ return MCCodeGenInfoCtorFn(Triple(TT), RM, CM, OL);
}
/// createMCInstrInfo - Create a MCInstrInfo implementation.
@@ -326,10 +327,10 @@ public:
/// createMCRegInfo - Create a MCRegisterInfo implementation.
///
- MCRegisterInfo *createMCRegInfo(StringRef Triple) const {
+ MCRegisterInfo *createMCRegInfo(StringRef TT) const {
if (!MCRegInfoCtorFn)
return nullptr;
- return MCRegInfoCtorFn(Triple);
+ return MCRegInfoCtorFn(Triple(TT));
}
/// createMCSubtargetInfo - Create a MCSubtargetInfo implementation.
@@ -351,20 +352,20 @@ public:
/// createTargetMachine - Create a target specific machine implementation
/// for the specified \p Triple.
///
- /// \param Triple This argument is used to determine the target machine
+ /// \param TT This argument is used to determine the target machine
/// feature set; it should always be provided. Generally this should be
/// either the target triple from the module, or the target triple of the
/// host if that does not exist.
TargetMachine *
- createTargetMachine(StringRef Triple, StringRef CPU, StringRef Features,
+ createTargetMachine(StringRef TT, StringRef CPU, StringRef Features,
const TargetOptions &Options,
Reloc::Model RM = Reloc::Default,
CodeModel::Model CM = CodeModel::Default,
CodeGenOpt::Level OL = CodeGenOpt::Default) const {
if (!TargetMachineCtorFn)
return nullptr;
- return TargetMachineCtorFn(*this, Triple, CPU, Features, Options, RM, CM,
- OL);
+ return TargetMachineCtorFn(*this, Triple(TT), CPU, Features, Options, RM,
+ CM, OL);
}
/// createMCAsmBackend - Create a target specific assembly parser.
@@ -529,7 +530,8 @@ public:
std::unique_ptr<MCRelocationInfo> &&RelInfo) const {
MCSymbolizerCtorTy Fn =
MCSymbolizerCtorFn ? MCSymbolizerCtorFn : llvm::createMCSymbolizer;
- return Fn(TT, GetOpInfo, SymbolLookUp, DisInfo, Ctx, std::move(RelInfo));
+ return Fn(Triple(TT), GetOpInfo, SymbolLookUp, DisInfo, Ctx,
+ std::move(RelInfo));
}
/// @}
@@ -924,7 +926,7 @@ template <class MCCodeGenInfoImpl> struct RegisterMCCodeGenInfo {
}
private:
- static MCCodeGenInfo *Allocator(StringRef /*TT*/, Reloc::Model /*RM*/,
+ static MCCodeGenInfo *Allocator(const Triple & /*TT*/, Reloc::Model /*RM*/,
CodeModel::Model /*CM*/,
CodeGenOpt::Level /*OL*/) {
return new MCCodeGenInfoImpl();
@@ -1023,7 +1025,7 @@ template <class MCRegisterInfoImpl> struct RegisterMCRegInfo {
}
private:
- static MCRegisterInfo *Allocator(StringRef /*TT*/) {
+ static MCRegisterInfo *Allocator(const Triple & /*TT*/) {
return new MCRegisterInfoImpl();
}
};
@@ -1090,11 +1092,11 @@ template <class TargetMachineImpl> struct RegisterTargetMachine {
}
private:
- static TargetMachine *Allocator(const Target &T, StringRef TT, StringRef CPU,
- StringRef FS, const TargetOptions &Options,
- Reloc::Model RM, CodeModel::Model CM,
- CodeGenOpt::Level OL) {
- return new TargetMachineImpl(T, Triple(TT), CPU, FS, Options, RM, CM, OL);
+ static TargetMachine *Allocator(const Target &T, const Triple &TT,
+ StringRef CPU, StringRef FS,
+ const TargetOptions &Options, Reloc::Model RM,
+ CodeModel::Model CM, CodeGenOpt::Level OL) {
+ return new TargetMachineImpl(T, TT, CPU, FS, Options, RM, CM, OL);
}
};
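
The string-based lookup path is unchanged for callers; the registry itself now builds the Triple before dispatching to the per-target constructor. A sketch of a typical call site (error handling abbreviated; assumes the targets were linked in):

  #include "llvm/Support/TargetRegistry.h"
  #include "llvm/Support/TargetSelect.h"
  #include "llvm/Target/TargetMachine.h"
  #include "llvm/Target/TargetOptions.h"

  using namespace llvm;

  static TargetMachine *makeTargetMachine(const std::string &TripleStr) {
    InitializeAllTargetInfos();
    InitializeAllTargets();
    InitializeAllTargetMCs();
    std::string Error;
    const Target *T = TargetRegistry::lookupTarget(TripleStr, Error);
    if (!T)
      return nullptr; // Error holds the reason
    return T->createTargetMachine(TripleStr, /*CPU=*/"generic",
                                  /*Features=*/"", TargetOptions());
  }
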
diff --git a/include/llvm/Support/raw_ostream.h b/include/llvm/Support/raw_ostream.h
index b59317112c44..28e512c86941 100644
--- a/include/llvm/Support/raw_ostream.h
+++ b/include/llvm/Support/raw_ostream.h
@@ -165,8 +165,10 @@ public:
if (Size > (size_t)(OutBufEnd - OutBufCur))
return write(Str.data(), Size);
- memcpy(OutBufCur, Str.data(), Size);
- OutBufCur += Size;
+ if (Size) {
+ memcpy(OutBufCur, Str.data(), Size);
+ OutBufCur += Size;
+ }
return *this;
}
diff --git a/include/llvm/TableGen/Record.h b/include/llvm/TableGen/Record.h
index 717a2a4ba62a..b4642c991192 100644
--- a/include/llvm/TableGen/Record.h
+++ b/include/llvm/TableGen/Record.h
@@ -1222,11 +1222,11 @@ public:
/// get the corresponding DefInit.
DefInit *getDefInit();
- const std::vector<Init *> &getTemplateArgs() const {
+ ArrayRef<Init *> getTemplateArgs() const {
return TemplateArgs;
}
- const std::vector<RecordVal> &getValues() const { return Values; }
- const std::vector<Record*> &getSuperClasses() const { return SuperClasses; }
+ ArrayRef<RecordVal> getValues() const { return Values; }
+ ArrayRef<Record *> getSuperClasses() const { return SuperClasses; }
ArrayRef<SMRange> getSuperClassRanges() const { return SuperClassRanges; }
bool isTemplateArg(Init *Name) const {
diff --git a/include/llvm/Target/Target.td b/include/llvm/Target/Target.td
index 61234991be44..e0aea181a639 100644
--- a/include/llvm/Target/Target.td
+++ b/include/llvm/Target/Target.td
@@ -872,7 +872,7 @@ def LOAD_STACK_GUARD : Instruction {
let hasSideEffects = 0;
bit isPseudo = 1;
}
-def FRAME_ALLOC : Instruction {
+def LOCAL_ESCAPE : Instruction {
// This instruction is really just a label. It has to be part of the chain so
// that it doesn't get dropped from the DAG, but it produces nothing and has
// no side effects.
@@ -1014,7 +1014,7 @@ class InstAlias<string Asm, dag Result, int Emit = 1> {
// Predicates - Predicates that must be true for this to match.
list<Predicate> Predicates = [];
- // If the instruction specified in Result has defined an AsmMatchConverter
+ // If the instruction specified in Result has defined an AsmMatchConverter
// then setting this to 1 will cause the alias to use the AsmMatchConverter
// function when converting the OperandVector into an MCInst instead of the
// function that is generated by the dag Result.
diff --git a/include/llvm/Target/TargetFrameLowering.h b/include/llvm/Target/TargetFrameLowering.h
index 0e317247a59f..3af2227410f7 100644
--- a/include/llvm/Target/TargetFrameLowering.h
+++ b/include/llvm/Target/TargetFrameLowering.h
@@ -19,6 +19,7 @@
#include <vector>
namespace llvm {
+ class BitVector;
class CalleeSavedInfo;
class MachineFunction;
class RegScavenger;
@@ -226,13 +227,15 @@ public:
return 0;
}
- /// processFunctionBeforeCalleeSavedScan - This method is called immediately
- /// before PrologEpilogInserter scans the physical registers used to determine
- /// what callee saved registers should be spilled. This method is optional.
- virtual void processFunctionBeforeCalleeSavedScan(MachineFunction &MF,
- RegScavenger *RS = nullptr) const {
-
- }
+ /// This method determines which of the registers reported by
+ /// TargetRegisterInfo::getCalleeSavedRegs() should actually get saved.
+ /// The default implementation populates the \p SavedRegs bitset with
+ /// all registers which are modified in the function; targets may override
+ /// this function to save additional registers.
+ /// This method also sets up the register scavenger, ensuring there is a
+ /// free register or a frame index available.
+ virtual void determineCalleeSaves(MachineFunction &MF, BitVector &SavedRegs,
+ RegScavenger *RS = nullptr) const;
/// processFunctionBeforeFrameFinalized - This method is called immediately
/// before the specified function's frame layout (MF.getFrameInfo()) is
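
A backend that needs more than the modified-register default overrides the new hook and chains to the base implementation. A hypothetical target (all names and register numbers are invented):

  #include "llvm/ADT/BitVector.h"
  #include "llvm/Target/TargetFrameLowering.h"

  // Hypothetical XYZ backend: keep the default set, but always save the link
  // register so the epilogue can restore the return address.
  class XYZFrameLowering : public llvm::TargetFrameLowering {
  public:
    XYZFrameLowering()
        : llvm::TargetFrameLowering(StackGrowsDown, /*StackAl=*/8, /*LAO=*/0) {}

    void determineCalleeSaves(llvm::MachineFunction &MF,
                              llvm::BitVector &SavedRegs,
                              llvm::RegScavenger *RS) const override {
      llvm::TargetFrameLowering::determineCalleeSaves(MF, SavedRegs, RS);
      SavedRegs.set(/*XYZ::LR=*/14); // assumption: register 14 is XYZ's LR
    }
    // (Other pure-virtual hooks such as emitPrologue/emitEpilogue omitted.)
  };
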
diff --git a/include/llvm/Target/TargetLowering.h b/include/llvm/Target/TargetLowering.h
index 277487fee6bc..4412d9b3c68e 100644
--- a/include/llvm/Target/TargetLowering.h
+++ b/include/llvm/Target/TargetLowering.h
@@ -161,27 +161,27 @@ protected:
public:
const TargetMachine &getTargetMachine() const { return TM; }
- const DataLayout *getDataLayout() const { return TM.getDataLayout(); }
- bool isBigEndian() const { return !IsLittleEndian; }
- bool isLittleEndian() const { return IsLittleEndian; }
virtual bool useSoftFloat() const { return false; }
/// Return the pointer type for the given address space, defaults to
/// the pointer type from the data layout.
/// FIXME: The default needs to be removed once all the code is updated.
- virtual MVT getPointerTy(uint32_t /*AS*/ = 0) const;
- unsigned getPointerSizeInBits(uint32_t AS = 0) const;
- unsigned getPointerTypeSizeInBits(Type *Ty) const;
- virtual MVT getScalarShiftAmountTy(EVT LHSTy) const;
+ MVT getPointerTy(const DataLayout &DL, uint32_t AS = 0) const {
+ return MVT::getIntegerVT(DL.getPointerSizeInBits(AS));
+ }
+
+ /// EVT is not used in-tree, but is used by out-of-tree targets.
+ /// Documentation for this function would be nice...
+ virtual MVT getScalarShiftAmountTy(const DataLayout &, EVT) const;
- EVT getShiftAmountTy(EVT LHSTy) const;
+ EVT getShiftAmountTy(EVT LHSTy, const DataLayout &DL) const;
/// Returns the type to be used for the index operand of:
/// ISD::INSERT_VECTOR_ELT, ISD::EXTRACT_VECTOR_ELT,
/// ISD::INSERT_SUBVECTOR, and ISD::EXTRACT_SUBVECTOR
- virtual MVT getVectorIdxTy() const {
- return getPointerTy();
+ virtual MVT getVectorIdxTy(const DataLayout &DL) const {
+ return getPointerTy(DL);
}
/// Return true if the select operation is expensive for this target.
@@ -327,7 +327,8 @@ public:
}
/// Return the ValueType of the result of SETCC operations.
- virtual EVT getSetCCResultType(LLVMContext &Context, EVT VT) const;
+ virtual EVT getSetCCResultType(const DataLayout &DL, LLVMContext &Context,
+ EVT VT) const;
/// Return the ValueType for comparison libcalls. Comparison libcalls include
/// floating point comparison calls, and Ordered/Unordered check calls on
@@ -715,17 +716,18 @@ public:
/// operations except for the pointer size. If AllowUnknown is true, this
/// will return MVT::Other for types with no EVT counterpart (e.g. structs),
/// otherwise it will assert.
- EVT getValueType(Type *Ty, bool AllowUnknown = false) const {
+ EVT getValueType(const DataLayout &DL, Type *Ty,
+ bool AllowUnknown = false) const {
// Lower scalar pointers to native pointer types.
if (PointerType *PTy = dyn_cast<PointerType>(Ty))
- return getPointerTy(PTy->getAddressSpace());
+ return getPointerTy(DL, PTy->getAddressSpace());
if (Ty->isVectorTy()) {
VectorType *VTy = cast<VectorType>(Ty);
Type *Elm = VTy->getElementType();
// Lower vectors of pointers to native pointer types.
if (PointerType *PT = dyn_cast<PointerType>(Elm)) {
- EVT PointerTy(getPointerTy(PT->getAddressSpace()));
+ EVT PointerTy(getPointerTy(DL, PT->getAddressSpace()));
Elm = PointerTy.getTypeForEVT(Ty->getContext());
}
@@ -736,14 +738,15 @@ public:
}
/// Return the MVT corresponding to this LLVM type. See getValueType.
- MVT getSimpleValueType(Type *Ty, bool AllowUnknown = false) const {
- return getValueType(Ty, AllowUnknown).getSimpleVT();
+ MVT getSimpleValueType(const DataLayout &DL, Type *Ty,
+ bool AllowUnknown = false) const {
+ return getValueType(DL, Ty, AllowUnknown).getSimpleVT();
}
/// Return the desired alignment for ByVal or InAlloca aggregate function
/// arguments in the caller parameter area. This is the actual alignment, not
/// its logarithm.
- virtual unsigned getByValTypeAlignment(Type *Ty) const;
+ virtual unsigned getByValTypeAlignment(Type *Ty, const DataLayout &DL) const;
/// Return the type of registers that this ValueType will eventually require.
MVT getRegisterType(MVT VT) const {
@@ -818,8 +821,8 @@ public:
/// When splitting a value of the specified type into parts, does the Lo
/// or Hi part come first? This usually follows the endianness, except
/// for ppcf128, where the Hi part always comes first.
- bool hasBigEndianPartOrdering(EVT VT) const {
- return isBigEndian() || VT == MVT::ppcf128;
+ bool hasBigEndianPartOrdering(EVT VT, const DataLayout &DL) const {
+ return DL.isBigEndian() || VT == MVT::ppcf128;
}
/// If true, the target has custom DAG combine transformations that it can
@@ -1006,7 +1009,8 @@ public:
int InstructionOpcodeToISD(unsigned Opcode) const;
/// Estimate the cost of type-legalization and the legalized type.
- std::pair<unsigned, MVT> getTypeLegalizationCost(Type *Ty) const;
+ std::pair<unsigned, MVT> getTypeLegalizationCost(const DataLayout &DL,
+ Type *Ty) const;
/// @}
@@ -1460,8 +1464,8 @@ public:
/// If the address space cannot be determined, it will be -1.
///
/// TODO: Remove default argument
- virtual bool isLegalAddressingMode(const AddrMode &AM, Type *Ty,
- unsigned AddrSpace) const;
+ virtual bool isLegalAddressingMode(const DataLayout &DL, const AddrMode &AM,
+ Type *Ty, unsigned AddrSpace) const;
/// \brief Return the cost of the scaling factor used in the addressing mode
/// represented by AM for this target, for a load/store of the specified type.
@@ -1470,10 +1474,10 @@ public:
/// If the AM is not supported, it returns a negative value.
/// TODO: Handle pre/postinc as well.
/// TODO: Remove default argument
- virtual int getScalingFactorCost(const AddrMode &AM, Type *Ty,
- unsigned AS = 0) const {
+ virtual int getScalingFactorCost(const DataLayout &DL, const AddrMode &AM,
+ Type *Ty, unsigned AS = 0) const {
// Default: assume that any scaling factor used in a legal AM is free.
- if (isLegalAddressingMode(AM, Ty, AS))
+ if (isLegalAddressingMode(DL, AM, Ty, AS))
return 0;
return -1;
}
@@ -1734,9 +1738,6 @@ public:
private:
const TargetMachine &TM;
- /// True if this is a little endian target.
- bool IsLittleEndian;
-
/// Tells the code generator not to expand operations into sequences that use
/// the select operations if possible.
bool SelectIsExpensive;
@@ -2414,6 +2415,7 @@ public:
ArgListTy &getArgs() {
return Args;
}
+
};
/// This function lowers an abstract call to a function into an actual call.
@@ -2485,7 +2487,8 @@ public:
/// Return the register ID of the name passed in. Used by named register
/// global variables extension. There is no target-independent behaviour
/// so the default action is to bail.
- virtual unsigned getRegisterByName(const char* RegName, EVT VT) const {
+ virtual unsigned getRegisterByName(const char* RegName, EVT VT,
+ SelectionDAG &DAG) const {
report_fatal_error("Named registers not implemented for this target");
}
@@ -2657,7 +2660,8 @@ public:
/// specific constraints and their prefixes, and also tie in the associated
/// operand values. If this returns an empty vector, and if the constraint
/// string itself isn't empty, there was an error parsing.
- virtual AsmOperandInfoVector ParseConstraints(const TargetRegisterInfo *TRI,
+ virtual AsmOperandInfoVector ParseConstraints(const DataLayout &DL,
+ const TargetRegisterInfo *TRI,
ImmutableCallSite CS) const;
/// Examine constraint type and operand type and determine a weight value.
@@ -2679,7 +2683,7 @@ public:
SelectionDAG *DAG = nullptr) const;
/// Given a constraint, return the type of constraint it is for this target.
- virtual ConstraintType getConstraintType(const std::string &Constraint) const;
+ virtual ConstraintType getConstraintType(StringRef Constraint) const;
/// Given a physical register constraint (e.g. {edx}), return the register
/// number and the register class for the register.
@@ -2692,10 +2696,9 @@ public:
/// returns a register number of 0 and a null register class pointer.
virtual std::pair<unsigned, const TargetRegisterClass *>
getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI,
- const std::string &Constraint, MVT VT) const;
+ StringRef Constraint, MVT VT) const;
- virtual unsigned
- getInlineAsmMemConstraint(const std::string &ConstraintCode) const {
+ virtual unsigned getInlineAsmMemConstraint(StringRef ConstraintCode) const {
if (ConstraintCode == "i")
return InlineAsm::Constraint_i;
else if (ConstraintCode == "m")
@@ -2823,9 +2826,9 @@ public:
/// Given an LLVM IR type and return type attributes, compute the return value
/// EVTs and flags, and optionally also the offsets, if the return value is
/// being lowered to memory.
-void GetReturnInfo(Type* ReturnType, AttributeSet attr,
+void GetReturnInfo(Type *ReturnType, AttributeSet attr,
SmallVectorImpl<ISD::OutputArg> &Outs,
- const TargetLowering &TLI);
+ const TargetLowering &TLI, const DataLayout &DL);
} // end llvm namespace
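A short sketch of what these TargetLowering signature changes mean for callers: the DataLayout is now threaded through explicitly instead of being fetched from the TargetMachine. The snippet assumes a SelectionDAG reference DAG, a TargetLowering reference TLI, and an IR value V are in scope during lowering, and that the DAG exposes the module's DataLayout.
// Before this change: MVT PtrVT = TLI.getPointerTy();
// After: the caller supplies the DataLayout explicitly.
const DataLayout &DL = DAG.getDataLayout();
MVT PtrVT = TLI.getPointerTy(DL);
EVT ValVT = TLI.getValueType(DL, V->getType());
MVT IdxVT = TLI.getVectorIdxTy(DL);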
diff --git a/include/llvm/Target/TargetMachine.h b/include/llvm/Target/TargetMachine.h
index 64a923b80edf..06a2b13836ed 100644
--- a/include/llvm/Target/TargetMachine.h
+++ b/include/llvm/Target/TargetMachine.h
@@ -212,8 +212,8 @@ public:
/// supported, or false on success.
virtual bool addPassesToEmitFile(
PassManagerBase &, raw_pwrite_stream &, CodeGenFileType,
- bool /*DisableVerify*/ = true, AnalysisID /*StartAfter*/ = nullptr,
- AnalysisID /*StopAfter*/ = nullptr,
+ bool /*DisableVerify*/ = true, AnalysisID /*StartBefore*/ = nullptr,
+ AnalysisID /*StartAfter*/ = nullptr, AnalysisID /*StopAfter*/ = nullptr,
MachineFunctionInitializer * /*MFInitializer*/ = nullptr) {
return true;
}
@@ -260,8 +260,8 @@ public:
/// emitted. Typically this will involve several steps of code generation.
bool addPassesToEmitFile(
PassManagerBase &PM, raw_pwrite_stream &Out, CodeGenFileType FileType,
- bool DisableVerify = true, AnalysisID StartAfter = nullptr,
- AnalysisID StopAfter = nullptr,
+ bool DisableVerify = true, AnalysisID StartBefore = nullptr,
+ AnalysisID StartAfter = nullptr, AnalysisID StopAfter = nullptr,
MachineFunctionInitializer *MFInitializer = nullptr) override;
/// Add passes to the specified pass manager to get machine code emitted with
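A hedged sketch of the extended addPassesToEmitFile interface: the new StartBefore parameter lets a caller resume code generation immediately before a given pass instead of only after one. PM and Out are assumed to be an existing pass manager and output stream, and the pass ID used is only an example.
// Start the codegen pipeline just before the machine scheduler (illustrative).
if (TM->addPassesToEmitFile(PM, Out, TargetMachine::CGFT_ObjectFile,
                            /*DisableVerify=*/true,
                            /*StartBefore=*/&MachineSchedulerID,
                            /*StartAfter=*/nullptr,
                            /*StopAfter=*/nullptr))
  report_fatal_error("target does not support object file emission");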
diff --git a/include/llvm/Target/TargetOpcodes.h b/include/llvm/Target/TargetOpcodes.h
index 1f9a5d4ecaf0..50197191109d 100644
--- a/include/llvm/Target/TargetOpcodes.h
+++ b/include/llvm/Target/TargetOpcodes.h
@@ -118,10 +118,10 @@ enum {
/// collectors and deoptimizations in either the callee or caller.
STATEPOINT = 20,
- /// Instruction that records the offset of a function's frame allocation in a
- /// label. Created by the llvm.frameallocate intrinsic. It has two arguments:
- /// the symbol for the label and the frame index of the stack allocation.
- FRAME_ALLOC = 21,
+ /// Instruction that records the offset of a local stack allocation passed to
+ /// llvm.localescape. It has two arguments: the symbol for the label and the
+ /// frame index of the local stack allocation.
+ LOCAL_ESCAPE = 21,
/// Loading instruction that may page fault, bundled with associated
/// information on how to handle such a page fault. It is intended to support
diff --git a/include/llvm/Target/TargetSelectionDAGInfo.h b/include/llvm/Target/TargetSelectionDAGInfo.h
index bacdd950705b..53db5aa84292 100644
--- a/include/llvm/Target/TargetSelectionDAGInfo.h
+++ b/include/llvm/Target/TargetSelectionDAGInfo.h
@@ -20,8 +20,6 @@
namespace llvm {
-class DataLayout;
-
//===----------------------------------------------------------------------===//
/// TargetSelectionDAGInfo - Targets can subclass this to parameterize the
/// SelectionDAG lowering and instruction selection process.
@@ -30,13 +28,8 @@ class TargetSelectionDAGInfo {
TargetSelectionDAGInfo(const TargetSelectionDAGInfo &) = delete;
void operator=(const TargetSelectionDAGInfo &) = delete;
- const DataLayout *DL;
-
-protected:
- const DataLayout *getDataLayout() const { return DL; }
-
public:
- explicit TargetSelectionDAGInfo(const DataLayout *DL);
+ explicit TargetSelectionDAGInfo() = default;
virtual ~TargetSelectionDAGInfo();
/// EmitTargetCodeForMemcpy - Emit target-specific code that performs a
diff --git a/include/llvm/Target/TargetSubtargetInfo.h b/include/llvm/Target/TargetSubtargetInfo.h
index e42c56add7b1..07c0c66bfa18 100644
--- a/include/llvm/Target/TargetSubtargetInfo.h
+++ b/include/llvm/Target/TargetSubtargetInfo.h
@@ -44,9 +44,17 @@ template <typename T> class SmallVectorImpl;
class TargetSubtargetInfo : public MCSubtargetInfo {
TargetSubtargetInfo(const TargetSubtargetInfo &) = delete;
void operator=(const TargetSubtargetInfo &) = delete;
+ TargetSubtargetInfo() = delete;
protected: // Can only create subclasses...
- TargetSubtargetInfo();
+ TargetSubtargetInfo(const Triple &TT, StringRef CPU, StringRef FS,
+ ArrayRef<SubtargetFeatureKV> PF,
+ ArrayRef<SubtargetFeatureKV> PD,
+ const SubtargetInfoKV *ProcSched,
+ const MCWriteProcResEntry *WPR,
+ const MCWriteLatencyEntry *WL,
+ const MCReadAdvanceEntry *RA, const InstrStage *IS,
+ const unsigned *OC, const unsigned *FP);
public:
// AntiDepBreakMode - Type of anti-dependence breaking that should
diff --git a/include/llvm/Transforms/IPO.h b/include/llvm/Transforms/IPO.h
index fbd999cbc946..2ea47301bb4c 100644
--- a/include/llvm/Transforms/IPO.h
+++ b/include/llvm/Transforms/IPO.h
@@ -71,6 +71,12 @@ ModulePass *createGlobalOptimizerPass();
ModulePass *createGlobalDCEPass();
//===----------------------------------------------------------------------===//
+/// This transform is designed to eliminate available external globals
+/// (functions or global variables).
+///
+ModulePass *createEliminateAvailableExternallyPass();
+
+//===----------------------------------------------------------------------===//
/// createGVExtractionPass - If deleteFn is true, this pass deletes
/// the specified global values. Otherwise, it deletes as much of the module as
/// possible, except for the global values specified.
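A small usage sketch for the new pass: dropping the bodies of available_externally globals once inlining has had a chance to use them. The legacy pass manager setup around it is assumed.
#include "llvm/IR/LegacyPassManager.h"
#include "llvm/Transforms/IPO.h"
void addPostInlinePasses(llvm::legacy::PassManager &PM) {
  // Bodies of available_externally globals are only needed as inlining
  // fodder; a definition is guaranteed to exist elsewhere at link time.
  PM.add(llvm::createEliminateAvailableExternallyPass());
}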
diff --git a/include/llvm/Transforms/IPO/PassManagerBuilder.h b/include/llvm/Transforms/IPO/PassManagerBuilder.h
index 5d574ae0bf0f..1334dd0da23c 100644
--- a/include/llvm/Transforms/IPO/PassManagerBuilder.h
+++ b/include/llvm/Transforms/IPO/PassManagerBuilder.h
@@ -121,6 +121,7 @@ public:
bool VerifyInput;
bool VerifyOutput;
bool MergeFunctions;
+ bool PrepareForLTO;
private:
/// ExtensionList - This is list of all of the extensions that are registered.
diff --git a/include/llvm/Transforms/Utils/Cloning.h b/include/llvm/Transforms/Utils/Cloning.h
index cb187ec103d0..2caa9a2462df 100644
--- a/include/llvm/Transforms/Utils/Cloning.h
+++ b/include/llvm/Transforms/Utils/Cloning.h
@@ -45,6 +45,7 @@ class LoopInfo;
class AllocaInst;
class AliasAnalysis;
class AssumptionCacheTracker;
+class DominatorTree;
/// CloneModule - Return an exact copy of the specified module
///
@@ -233,6 +234,21 @@ bool InlineFunction(InvokeInst *II, InlineFunctionInfo &IFI,
bool InlineFunction(CallSite CS, InlineFunctionInfo &IFI,
bool InsertLifetime = true);
+/// \brief Clones a loop \p OrigLoop. Returns the loop and the blocks in \p
+/// Blocks.
+///
+/// Updates LoopInfo and DominatorTree assuming the loop is dominated by block
+/// \p LoopDomBB. Inserts the new blocks before the block specified in \p Before.
+Loop *cloneLoopWithPreheader(BasicBlock *Before, BasicBlock *LoopDomBB,
+ Loop *OrigLoop, ValueToValueMapTy &VMap,
+ const Twine &NameSuffix, LoopInfo *LI,
+ DominatorTree *DT,
+ SmallVectorImpl<BasicBlock *> &Blocks);
+
+/// \brief Remaps instructions in \p Blocks using the mapping in \p VMap.
+void remapInstructionsInBlocks(const SmallVectorImpl<BasicBlock *> &Blocks,
+ ValueToValueMapTy &VMap);
+
} // End llvm namespace
#endif
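A sketch of how the two new cloning utilities are meant to be paired; OrigLoop, LoopDomBB, InsertBefore, LI, and DT are assumed to be provided by the calling pass.
// Clone OrigLoop (and give it a preheader) before InsertBefore, then rewrite
// the cloned instructions so they refer to their cloned operands.
ValueToValueMapTy VMap;
SmallVector<BasicBlock *, 8> NewBlocks;
Loop *NewLoop = cloneLoopWithPreheader(InsertBefore, LoopDomBB, OrigLoop, VMap,
                                       ".clone", LI, DT, NewBlocks);
remapInstructionsInBlocks(NewBlocks, VMap);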
diff --git a/include/llvm/Transforms/Utils/LoopVersioning.h b/include/llvm/Transforms/Utils/LoopVersioning.h
new file mode 100644
index 000000000000..009fba48c6a3
--- /dev/null
+++ b/include/llvm/Transforms/Utils/LoopVersioning.h
@@ -0,0 +1,100 @@
+//===- LoopVersioning.h - Utility to version a loop -------------*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines a utility class to perform loop versioning. The versioned
+// loop speculates that otherwise may-aliasing memory accesses don't overlap and
+// emits checks to prove this.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_TRANSFORMS_UTILS_LOOPVERSIONING_H
+#define LLVM_TRANSFORMS_UTILS_LOOPVERSIONING_H
+
+#include "llvm/Transforms/Utils/ValueMapper.h"
+
+namespace llvm {
+
+class Loop;
+class LoopAccessInfo;
+class LoopInfo;
+
+/// \brief This class emits a version of the loop where run-time checks ensure
+/// that may-alias pointers can't overlap.
+///
+/// It currently only supports single-exit loops and assumes that the loop
+/// already has a preheader.
+class LoopVersioning {
+public:
+ LoopVersioning(const LoopAccessInfo &LAI, Loop *L, LoopInfo *LI,
+ DominatorTree *DT,
+ const SmallVector<int, 8> *PtrToPartition = nullptr);
+
+ /// \brief Returns true if we need memchecks to disambiguate may-aliasing
+ /// accesses.
+ bool needsRuntimeChecks() const;
+
+ /// \brief Performs the CFG manipulation part of versioning the loop including
+ /// the DominatorTree and LoopInfo updates.
+ ///
+ /// The loop that was used to construct the class will be the "versioned"
+ /// loop, i.e. the loop that will receive control if all the memchecks pass.
+ ///
+ /// This allows the loop transform pass to operate on the same loop regardless
+ /// of whether versioning was necessary or not:
+ ///
+ /// for each loop L:
+ /// analyze L
+ /// if versioning is necessary version L
+ /// transform L
+ void versionLoop(Pass *P);
+
+ /// \brief Adds the necessary PHI nodes for the versioned loops based on the
+ /// loop-defined values used outside of the loop.
+ ///
+ /// This needs to be called after versionLoop if there are defs in the loop
+ /// that are used outside the loop. FIXME: this should be invoked internally
+ /// by versionLoop and made private.
+ void addPHINodes(const SmallVectorImpl<Instruction *> &DefsUsedOutside);
+
+ /// \brief Returns the versioned loop. Control flows here if pointers in the
+ /// loop don't alias (i.e. all memchecks passed). (This loop is actually the
+ /// same as the original loop that the class was constructed with.)
+ Loop *getVersionedLoop() { return VersionedLoop; }
+
+ /// \brief Returns the fall-back loop. Control flows here if pointers in the
+ /// loop may alias (i.e. one of the memchecks failed).
+ Loop *getNonVersionedLoop() { return NonVersionedLoop; }
+
+private:
+ /// \brief The original loop. This becomes the "versioned" one. I.e.,
+ /// control flows here if pointers in the loop don't alias.
+ Loop *VersionedLoop;
+ /// \brief The fall-back loop. I.e. control flows here if pointers in the
+ /// loop may alias (memchecks failed).
+ Loop *NonVersionedLoop;
+
+ /// \brief For each memory pointer, this contains the partition id it is used in.
+ /// If nullptr, no partitioning is used.
+ ///
+ /// The I-th entry corresponds to I-th entry in LAI.getRuntimePointerCheck().
+ /// If the pointer is used in multiple partitions the entry is set to -1.
+ const SmallVector<int, 8> *PtrToPartition;
+
+ /// \brief This maps the instructions from VersionedLoop to their counterpart
+ /// in NonVersionedLoop.
+ ValueToValueMapTy VMap;
+
+ /// \brief Analyses used.
+ const LoopAccessInfo &LAI;
+ LoopInfo *LI;
+ DominatorTree *DT;
+};
+}
+
+#endif
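A minimal usage sketch of the LoopVersioning utility following the flow documented above; LAI, L, LI, DT, the pass pointer P, and DefsUsedOutside are assumed to be computed by the enclosing transform.
LoopVersioning LVer(LAI, L, LI, DT);
if (LVer.needsRuntimeChecks()) {
  // Emit the memchecks and create the fall-back (non-versioned) copy.
  LVer.versionLoop(P);
  // Fix up values defined inside the loop and used after it.
  LVer.addPHINodes(DefsUsedOutside);
}
// Transform LVer.getVersionedLoop() under the no-alias assumption.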
diff --git a/lib/Analysis/AliasAnalysis.cpp b/lib/Analysis/AliasAnalysis.cpp
index ad0727a0e0e5..44d137dffd22 100644
--- a/lib/Analysis/AliasAnalysis.cpp
+++ b/lib/Analysis/AliasAnalysis.cpp
@@ -71,11 +71,6 @@ void AliasAnalysis::deleteValue(Value *V) {
AA->deleteValue(V);
}
-void AliasAnalysis::copyValue(Value *From, Value *To) {
- assert(AA && "AA didn't call InitializeAliasAnalysis in its run method!");
- AA->copyValue(From, To);
-}
-
void AliasAnalysis::addEscapingUse(Use &U) {
assert(AA && "AA didn't call InitializeAliasAnalysis in its run method!");
AA->addEscapingUse(U);
diff --git a/lib/Analysis/AliasDebugger.cpp b/lib/Analysis/AliasDebugger.cpp
index 1ef49fc02fef..e5107b3bc827 100644
--- a/lib/Analysis/AliasDebugger.cpp
+++ b/lib/Analysis/AliasDebugger.cpp
@@ -124,10 +124,6 @@ namespace {
assert(Vals.find(V) != Vals.end() && "Never seen value in AA before");
AliasAnalysis::deleteValue(V);
}
- void copyValue(Value *From, Value *To) override {
- Vals.insert(To);
- AliasAnalysis::copyValue(From, To);
- }
};
}
diff --git a/lib/Analysis/AliasSetTracker.cpp b/lib/Analysis/AliasSetTracker.cpp
index bf8cda1ffaec..54d0f4304e1f 100644
--- a/lib/Analysis/AliasSetTracker.cpp
+++ b/lib/Analysis/AliasSetTracker.cpp
@@ -544,9 +544,6 @@ void AliasSetTracker::deleteValue(Value *PtrVal) {
// the tracker already knows about a value, it will ignore the request.
//
void AliasSetTracker::copyValue(Value *From, Value *To) {
- // Notify the alias analysis implementation that this value is copied.
- AA.copyValue(From, To);
-
// First, look up the PointerRec for this pointer.
PointerMapType::iterator I = PointerMap.find_as(From);
if (I == PointerMap.end())
diff --git a/lib/Analysis/BasicAliasAnalysis.cpp b/lib/Analysis/BasicAliasAnalysis.cpp
index 8e812252fdfe..68f766edb301 100644
--- a/lib/Analysis/BasicAliasAnalysis.cpp
+++ b/lib/Analysis/BasicAliasAnalysis.cpp
@@ -685,6 +685,9 @@ BasicAliasAnalysis::getModRefBehavior(ImmutableCallSite CS) {
if (CS.onlyReadsMemory())
Min = OnlyReadsMemory;
+ if (CS.onlyAccessesArgMemory())
+ Min = ModRefBehavior(Min & OnlyAccessesArgumentPointees);
+
// The AliasAnalysis base class has some smarts, lets use them.
return ModRefBehavior(AliasAnalysis::getModRefBehavior(CS) & Min);
}
@@ -710,6 +713,9 @@ BasicAliasAnalysis::getModRefBehavior(const Function *F) {
if (F->onlyReadsMemory())
Min = OnlyReadsMemory;
+ if (F->onlyAccessesArgMemory())
+ Min = ModRefBehavior(Min & OnlyAccessesArgumentPointees);
+
const TargetLibraryInfo &TLI =
getAnalysis<TargetLibraryInfoWrapperPass>().getTLI();
if (isMemsetPattern16(F, TLI))
diff --git a/lib/Analysis/ConstantFolding.cpp b/lib/Analysis/ConstantFolding.cpp
index 2f4c6a92f9af..02a5aef03223 100644
--- a/lib/Analysis/ConstantFolding.cpp
+++ b/lib/Analysis/ConstantFolding.cpp
@@ -1234,6 +1234,8 @@ bool llvm::canConstantFoldCallTo(const Function *F) {
case Intrinsic::floor:
case Intrinsic::ceil:
case Intrinsic::sqrt:
+ case Intrinsic::sin:
+ case Intrinsic::cos:
case Intrinsic::pow:
case Intrinsic::powi:
case Intrinsic::bswap:
@@ -1450,6 +1452,10 @@ static Constant *ConstantFoldScalarCall(StringRef Name, unsigned IntrinsicID,
return ConstantFoldFP(floor, V, Ty);
case Intrinsic::ceil:
return ConstantFoldFP(ceil, V, Ty);
+ case Intrinsic::sin:
+ return ConstantFoldFP(sin, V, Ty);
+ case Intrinsic::cos:
+ return ConstantFoldFP(cos, V, Ty);
}
if (!TLI)
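To illustrate the effect of the ConstantFolding change: calls to llvm.sin/llvm.cos with constant arguments can now be folded. A hedged sketch, assuming F is the llvm.sin.f64 declaration in the current module and Ctx its LLVMContext.
// sin(1.0) is evaluated at compile time and returned as a ConstantFP.
Constant *Arg = ConstantFP::get(Type::getDoubleTy(Ctx), 1.0);
if (canConstantFoldCallTo(F))
  if (Constant *Folded = ConstantFoldCall(F, {Arg}))
    errs() << "folded to: " << *Folded << "\n";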
diff --git a/lib/Analysis/IPA/GlobalsModRef.cpp b/lib/Analysis/IPA/GlobalsModRef.cpp
index f1ddde252924..18d45dd6a396 100644
--- a/lib/Analysis/IPA/GlobalsModRef.cpp
+++ b/lib/Analysis/IPA/GlobalsModRef.cpp
@@ -42,94 +42,111 @@ STATISTIC(NumReadMemFunctions, "Number of functions that only read memory");
STATISTIC(NumIndirectGlobalVars, "Number of indirect global objects");
namespace {
- /// FunctionRecord - One instance of this structure is stored for every
- /// function in the program. Later, the entries for these functions are
- /// removed if the function is found to call an external function (in which
- /// case we know nothing about it.
- struct FunctionRecord {
- /// GlobalInfo - Maintain mod/ref info for all of the globals without
- /// addresses taken that are read or written (transitively) by this
- /// function.
- std::map<const GlobalValue*, unsigned> GlobalInfo;
-
- /// MayReadAnyGlobal - May read global variables, but it is not known which.
- bool MayReadAnyGlobal;
-
- unsigned getInfoForGlobal(const GlobalValue *GV) const {
- unsigned Effect = MayReadAnyGlobal ? AliasAnalysis::Ref : 0;
- std::map<const GlobalValue*, unsigned>::const_iterator I =
+/// FunctionRecord - One instance of this structure is stored for every
+/// function in the program. Later, the entries for these functions are
+/// removed if the function is found to call an external function (in which
+/// case we know nothing about it).
+struct FunctionRecord {
+ /// GlobalInfo - Maintain mod/ref info for all of the globals without
+ /// addresses taken that are read or written (transitively) by this
+ /// function.
+ std::map<const GlobalValue *, unsigned> GlobalInfo;
+
+ /// MayReadAnyGlobal - May read global variables, but it is not known which.
+ bool MayReadAnyGlobal;
+
+ unsigned getInfoForGlobal(const GlobalValue *GV) const {
+ unsigned Effect = MayReadAnyGlobal ? AliasAnalysis::Ref : 0;
+ std::map<const GlobalValue *, unsigned>::const_iterator I =
GlobalInfo.find(GV);
- if (I != GlobalInfo.end())
- Effect |= I->second;
- return Effect;
- }
+ if (I != GlobalInfo.end())
+ Effect |= I->second;
+ return Effect;
+ }
- /// FunctionEffect - Capture whether or not this function reads or writes to
- /// ANY memory. If not, we can do a lot of aggressive analysis on it.
- unsigned FunctionEffect;
+ /// FunctionEffect - Capture whether or not this function reads or writes to
+ /// ANY memory. If not, we can do a lot of aggressive analysis on it.
+ unsigned FunctionEffect;
- FunctionRecord() : MayReadAnyGlobal (false), FunctionEffect(0) {}
- };
+ FunctionRecord() : MayReadAnyGlobal(false), FunctionEffect(0) {}
+};
- /// GlobalsModRef - The actual analysis pass.
- class GlobalsModRef : public ModulePass, public AliasAnalysis {
- /// NonAddressTakenGlobals - The globals that do not have their addresses
- /// taken.
- std::set<const GlobalValue*> NonAddressTakenGlobals;
+/// GlobalsModRef - The actual analysis pass.
+class GlobalsModRef : public ModulePass, public AliasAnalysis {
+ /// NonAddressTakenGlobals - The globals that do not have their addresses
+ /// taken.
+ std::set<const GlobalValue *> NonAddressTakenGlobals;
- /// IndirectGlobals - The memory pointed to by this global is known to be
- /// 'owned' by the global.
- std::set<const GlobalValue*> IndirectGlobals;
+ /// IndirectGlobals - The memory pointed to by this global is known to be
+ /// 'owned' by the global.
+ std::set<const GlobalValue *> IndirectGlobals;
- /// AllocsForIndirectGlobals - If an instruction allocates memory for an
- /// indirect global, this map indicates which one.
- std::map<const Value*, const GlobalValue*> AllocsForIndirectGlobals;
+ /// AllocsForIndirectGlobals - If an instruction allocates memory for an
+ /// indirect global, this map indicates which one.
+ std::map<const Value *, const GlobalValue *> AllocsForIndirectGlobals;
- /// FunctionInfo - For each function, keep track of what globals are
- /// modified or read.
- std::map<const Function*, FunctionRecord> FunctionInfo;
+ /// FunctionInfo - For each function, keep track of what globals are
+ /// modified or read.
+ std::map<const Function *, FunctionRecord> FunctionInfo;
- public:
- static char ID;
- GlobalsModRef() : ModulePass(ID) {
- initializeGlobalsModRefPass(*PassRegistry::getPassRegistry());
- }
+public:
+ static char ID;
+ GlobalsModRef() : ModulePass(ID) {
+ initializeGlobalsModRefPass(*PassRegistry::getPassRegistry());
+ }
- bool runOnModule(Module &M) override {
- InitializeAliasAnalysis(this, &M.getDataLayout());
+ bool runOnModule(Module &M) override {
+ InitializeAliasAnalysis(this, &M.getDataLayout());
- // Find non-addr taken globals.
- AnalyzeGlobals(M);
+ // Find non-addr taken globals.
+ AnalyzeGlobals(M);
- // Propagate on CG.
- AnalyzeCallGraph(getAnalysis<CallGraphWrapperPass>().getCallGraph(), M);
- return false;
- }
+ // Propagate on CG.
+ AnalyzeCallGraph(getAnalysis<CallGraphWrapperPass>().getCallGraph(), M);
+ return false;
+ }
- void getAnalysisUsage(AnalysisUsage &AU) const override {
- AliasAnalysis::getAnalysisUsage(AU);
- AU.addRequired<CallGraphWrapperPass>();
- AU.setPreservesAll(); // Does not transform code
- }
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
+ AliasAnalysis::getAnalysisUsage(AU);
+ AU.addRequired<CallGraphWrapperPass>();
+ AU.setPreservesAll(); // Does not transform code
+ }
+
+ //------------------------------------------------
+ // Implement the AliasAnalysis API
+ //
+ AliasResult alias(const MemoryLocation &LocA,
+ const MemoryLocation &LocB) override;
+ ModRefResult getModRefInfo(ImmutableCallSite CS,
+ const MemoryLocation &Loc) override;
+ ModRefResult getModRefInfo(ImmutableCallSite CS1,
+ ImmutableCallSite CS2) override {
+ return AliasAnalysis::getModRefInfo(CS1, CS2);
+ }
- //------------------------------------------------
- // Implement the AliasAnalysis API
- //
- AliasResult alias(const MemoryLocation &LocA,
- const MemoryLocation &LocB) override;
- ModRefResult getModRefInfo(ImmutableCallSite CS,
- const MemoryLocation &Loc) override;
- ModRefResult getModRefInfo(ImmutableCallSite CS1,
- ImmutableCallSite CS2) override {
- return AliasAnalysis::getModRefInfo(CS1, CS2);
+ /// getModRefBehavior - Return the behavior of the specified function if
+ /// called from the specified call site. The call site may be null in which
+ /// case the most generic behavior of this function should be returned.
+ ModRefBehavior getModRefBehavior(const Function *F) override {
+ ModRefBehavior Min = UnknownModRefBehavior;
+
+ if (FunctionRecord *FR = getFunctionInfo(F)) {
+ if (FR->FunctionEffect == 0)
+ Min = DoesNotAccessMemory;
+ else if ((FR->FunctionEffect & Mod) == 0)
+ Min = OnlyReadsMemory;
}
- /// getModRefBehavior - Return the behavior of the specified function if
- /// called from the specified call site. The call site may be null in which
- /// case the most generic behavior of this function should be returned.
- ModRefBehavior getModRefBehavior(const Function *F) override {
- ModRefBehavior Min = UnknownModRefBehavior;
+ return ModRefBehavior(AliasAnalysis::getModRefBehavior(F) & Min);
+ }
+
+ /// getModRefBehavior - Return the behavior of the specified function if
+ /// called from the specified call site. The call site may be null in which
+ /// case the most generic behavior of this function should be returned.
+ ModRefBehavior getModRefBehavior(ImmutableCallSite CS) override {
+ ModRefBehavior Min = UnknownModRefBehavior;
+ if (const Function *F = CS.getCalledFunction())
if (FunctionRecord *FR = getFunctionInfo(F)) {
if (FR->FunctionEffect == 0)
Min = DoesNotAccessMemory;
@@ -137,68 +154,50 @@ namespace {
Min = OnlyReadsMemory;
}
- return ModRefBehavior(AliasAnalysis::getModRefBehavior(F) & Min);
- }
-
- /// getModRefBehavior - Return the behavior of the specified function if
- /// called from the specified call site. The call site may be null in which
- /// case the most generic behavior of this function should be returned.
- ModRefBehavior getModRefBehavior(ImmutableCallSite CS) override {
- ModRefBehavior Min = UnknownModRefBehavior;
-
- if (const Function* F = CS.getCalledFunction())
- if (FunctionRecord *FR = getFunctionInfo(F)) {
- if (FR->FunctionEffect == 0)
- Min = DoesNotAccessMemory;
- else if ((FR->FunctionEffect & Mod) == 0)
- Min = OnlyReadsMemory;
- }
+ return ModRefBehavior(AliasAnalysis::getModRefBehavior(CS) & Min);
+ }
- return ModRefBehavior(AliasAnalysis::getModRefBehavior(CS) & Min);
- }
+ void deleteValue(Value *V) override;
+ void addEscapingUse(Use &U) override;
+
+ /// getAdjustedAnalysisPointer - This method is used when a pass implements
+ /// an analysis interface through multiple inheritance. If needed, it
+ /// should override this to adjust the this pointer as needed for the
+ /// specified pass info.
+ void *getAdjustedAnalysisPointer(AnalysisID PI) override {
+ if (PI == &AliasAnalysis::ID)
+ return (AliasAnalysis *)this;
+ return this;
+ }
- void deleteValue(Value *V) override;
- void copyValue(Value *From, Value *To) override;
- void addEscapingUse(Use &U) override;
-
- /// getAdjustedAnalysisPointer - This method is used when a pass implements
- /// an analysis interface through multiple inheritance. If needed, it
- /// should override this to adjust the this pointer as needed for the
- /// specified pass info.
- void *getAdjustedAnalysisPointer(AnalysisID PI) override {
- if (PI == &AliasAnalysis::ID)
- return (AliasAnalysis*)this;
- return this;
- }
-
- private:
- /// getFunctionInfo - Return the function info for the function, or null if
- /// we don't have anything useful to say about it.
- FunctionRecord *getFunctionInfo(const Function *F) {
- std::map<const Function*, FunctionRecord>::iterator I =
+private:
+ /// getFunctionInfo - Return the function info for the function, or null if
+ /// we don't have anything useful to say about it.
+ FunctionRecord *getFunctionInfo(const Function *F) {
+ std::map<const Function *, FunctionRecord>::iterator I =
FunctionInfo.find(F);
- if (I != FunctionInfo.end())
- return &I->second;
- return nullptr;
- }
+ if (I != FunctionInfo.end())
+ return &I->second;
+ return nullptr;
+ }
- void AnalyzeGlobals(Module &M);
- void AnalyzeCallGraph(CallGraph &CG, Module &M);
- bool AnalyzeUsesOfPointer(Value *V, std::vector<Function*> &Readers,
- std::vector<Function*> &Writers,
- GlobalValue *OkayStoreDest = nullptr);
- bool AnalyzeIndirectGlobalMemory(GlobalValue *GV);
- };
+ void AnalyzeGlobals(Module &M);
+ void AnalyzeCallGraph(CallGraph &CG, Module &M);
+ bool AnalyzeUsesOfPointer(Value *V, std::vector<Function *> &Readers,
+ std::vector<Function *> &Writers,
+ GlobalValue *OkayStoreDest = nullptr);
+ bool AnalyzeIndirectGlobalMemory(GlobalValue *GV);
+};
}
char GlobalsModRef::ID = 0;
-INITIALIZE_AG_PASS_BEGIN(GlobalsModRef, AliasAnalysis,
- "globalsmodref-aa", "Simple mod/ref analysis for globals",
- false, true, false)
+INITIALIZE_AG_PASS_BEGIN(GlobalsModRef, AliasAnalysis, "globalsmodref-aa",
+ "Simple mod/ref analysis for globals", false, true,
+ false)
INITIALIZE_PASS_DEPENDENCY(CallGraphWrapperPass)
-INITIALIZE_AG_PASS_END(GlobalsModRef, AliasAnalysis,
- "globalsmodref-aa", "Simple mod/ref analysis for globals",
- false, true, false)
+INITIALIZE_AG_PASS_END(GlobalsModRef, AliasAnalysis, "globalsmodref-aa",
+ "Simple mod/ref analysis for globals", false, true,
+ false)
Pass *llvm::createGlobalsModRefPass() { return new GlobalsModRef(); }
@@ -207,7 +206,7 @@ Pass *llvm::createGlobalsModRefPass() { return new GlobalsModRef(); }
/// (really, their address passed to something nontrivial), record this fact,
/// and record the functions that they are used directly in.
void GlobalsModRef::AnalyzeGlobals(Module &M) {
- std::vector<Function*> Readers, Writers;
+ std::vector<Function *> Readers, Writers;
for (Module::iterator I = M.begin(), E = M.end(); I != E; ++I)
if (I->hasLocalLinkage()) {
if (!AnalyzeUsesOfPointer(I, Readers, Writers)) {
@@ -215,11 +214,12 @@ void GlobalsModRef::AnalyzeGlobals(Module &M) {
NonAddressTakenGlobals.insert(I);
++NumNonAddrTakenFunctions;
}
- Readers.clear(); Writers.clear();
+ Readers.clear();
+ Writers.clear();
}
- for (Module::global_iterator I = M.global_begin(), E = M.global_end();
- I != E; ++I)
+ for (Module::global_iterator I = M.global_begin(), E = M.global_end(); I != E;
+ ++I)
if (I->hasLocalLinkage()) {
if (!AnalyzeUsesOfPointer(I, Readers, Writers)) {
// Remember that we are tracking this global, and the mod/ref fns
@@ -228,7 +228,7 @@ void GlobalsModRef::AnalyzeGlobals(Module &M) {
for (unsigned i = 0, e = Readers.size(); i != e; ++i)
FunctionInfo[Readers[i]].GlobalInfo[I] |= Ref;
- if (!I->isConstant()) // No need to keep track of writers to constants
+ if (!I->isConstant()) // No need to keep track of writers to constants
for (unsigned i = 0, e = Writers.size(); i != e; ++i)
FunctionInfo[Writers[i]].GlobalInfo[I] |= Mod;
++NumNonAddrTakenGlobalVars;
@@ -238,7 +238,8 @@ void GlobalsModRef::AnalyzeGlobals(Module &M) {
AnalyzeIndirectGlobalMemory(I))
++NumIndirectGlobalVars;
}
- Readers.clear(); Writers.clear();
+ Readers.clear();
+ Writers.clear();
}
}
@@ -249,10 +250,11 @@ void GlobalsModRef::AnalyzeGlobals(Module &M) {
///
/// If OkayStoreDest is non-null, stores into this global are allowed.
bool GlobalsModRef::AnalyzeUsesOfPointer(Value *V,
- std::vector<Function*> &Readers,
- std::vector<Function*> &Writers,
+ std::vector<Function *> &Readers,
+ std::vector<Function *> &Writers,
GlobalValue *OkayStoreDest) {
- if (!V->getType()->isPointerTy()) return true;
+ if (!V->getType()->isPointerTy())
+ return true;
for (Use &U : V->uses()) {
User *I = U.getUser();
@@ -262,7 +264,7 @@ bool GlobalsModRef::AnalyzeUsesOfPointer(Value *V,
if (V == SI->getOperand(1)) {
Writers.push_back(SI->getParent()->getParent());
} else if (SI->getOperand(1) != OkayStoreDest) {
- return true; // Storing the pointer
+ return true; // Storing the pointer
}
} else if (Operator::getOpcode(I) == Instruction::GetElementPtr) {
if (AnalyzeUsesOfPointer(I, Readers, Writers))
@@ -282,7 +284,7 @@ bool GlobalsModRef::AnalyzeUsesOfPointer(Value *V,
}
} else if (ICmpInst *ICI = dyn_cast<ICmpInst>(I)) {
if (!isa<ConstantPointerNull>(ICI->getOperand(1)))
- return true; // Allow comparison against null.
+ return true; // Allow comparison against null.
} else {
return true;
}
@@ -301,7 +303,7 @@ bool GlobalsModRef::AnalyzeUsesOfPointer(Value *V,
bool GlobalsModRef::AnalyzeIndirectGlobalMemory(GlobalValue *GV) {
// Keep track of values related to the allocation of the memory, f.e. the
// value produced by the malloc call and any casts.
- std::vector<Value*> AllocRelatedValues;
+ std::vector<Value *> AllocRelatedValues;
// Walk the user list of the global. If we find anything other than a direct
// load or store, bail out.
@@ -310,13 +312,14 @@ bool GlobalsModRef::AnalyzeIndirectGlobalMemory(GlobalValue *GV) {
// The pointer loaded from the global can only be used in simple ways:
// we allow addressing of it and loading storing to it. We do *not* allow
// storing the loaded pointer somewhere else or passing to a function.
- std::vector<Function*> ReadersWriters;
+ std::vector<Function *> ReadersWriters;
if (AnalyzeUsesOfPointer(LI, ReadersWriters, ReadersWriters))
- return false; // Loaded pointer escapes.
+ return false; // Loaded pointer escapes.
// TODO: Could try some IP mod/ref of the loaded pointer.
} else if (StoreInst *SI = dyn_cast<StoreInst>(U)) {
// Storing the global itself.
- if (SI->getOperand(0) == GV) return false;
+ if (SI->getOperand(0) == GV)
+ return false;
// If storing the null pointer, ignore it.
if (isa<ConstantPointerNull>(SI->getOperand(0)))
@@ -327,13 +330,13 @@ bool GlobalsModRef::AnalyzeIndirectGlobalMemory(GlobalValue *GV) {
GV->getParent()->getDataLayout());
if (!isAllocLikeFn(Ptr, TLI))
- return false; // Too hard to analyze.
+ return false; // Too hard to analyze.
// Analyze all uses of the allocation. If any of them are used in a
// non-simple way (e.g. stored to another global) bail out.
- std::vector<Function*> ReadersWriters;
+ std::vector<Function *> ReadersWriters;
if (AnalyzeUsesOfPointer(Ptr, ReadersWriters, ReadersWriters, GV))
- return false; // Loaded pointer escapes.
+ return false; // Loaded pointer escapes.
// Remember that this allocation is related to the indirect global.
AllocRelatedValues.push_back(Ptr);
@@ -360,7 +363,7 @@ bool GlobalsModRef::AnalyzeIndirectGlobalMemory(GlobalValue *GV) {
void GlobalsModRef::AnalyzeCallGraph(CallGraph &CG, Module &M) {
// We do a bottom-up SCC traversal of the call graph. In other words, we
// visit all callees before callers (leaf-first).
- for (scc_iterator<CallGraph*> I = scc_begin(&CG); !I.isAtEnd(); ++I) {
+ for (scc_iterator<CallGraph *> I = scc_begin(&CG); !I.isAtEnd(); ++I) {
const std::vector<CallGraphNode *> &SCC = *I;
assert(!SCC.empty() && "SCC with no functions?");
@@ -437,9 +440,10 @@ void GlobalsModRef::AnalyzeCallGraph(CallGraph &CG, Module &M) {
}
// Scan the function bodies for explicit loads or stores.
- for (unsigned i = 0, e = SCC.size(); i != e && FunctionEffect != ModRef;++i)
+ for (unsigned i = 0, e = SCC.size(); i != e && FunctionEffect != ModRef;
+ ++i)
for (inst_iterator II = inst_begin(SCC[i]->getFunction()),
- E = inst_end(SCC[i]->getFunction());
+ E = inst_end(SCC[i]->getFunction());
II != E && FunctionEffect != ModRef; ++II)
if (LoadInst *LI = dyn_cast<LoadInst>(&*II)) {
FunctionEffect |= Ref;
@@ -474,8 +478,6 @@ void GlobalsModRef::AnalyzeCallGraph(CallGraph &CG, Module &M) {
}
}
-
-
/// alias - If one of the pointers is to a global that we are tracking, and the
/// other is some random pointer, we know there cannot be an alias, because the
/// address of the global isn't taken.
@@ -492,8 +494,10 @@ AliasResult GlobalsModRef::alias(const MemoryLocation &LocA,
if (GV1 || GV2) {
// If the global's address is taken, pretend we don't know it's a pointer to
// the global.
- if (GV1 && !NonAddressTakenGlobals.count(GV1)) GV1 = nullptr;
- if (GV2 && !NonAddressTakenGlobals.count(GV2)) GV2 = nullptr;
+ if (GV1 && !NonAddressTakenGlobals.count(GV1))
+ GV1 = nullptr;
+ if (GV2 && !NonAddressTakenGlobals.count(GV2))
+ GV2 = nullptr;
// If the two pointers are derived from two different non-addr-taken
// globals, or if one is and the other isn't, we know these can't alias.
@@ -554,7 +558,6 @@ GlobalsModRef::getModRefInfo(ImmutableCallSite CS, const MemoryLocation &Loc) {
return ModRefResult(Known & AliasAnalysis::getModRefInfo(CS, Loc));
}
-
//===----------------------------------------------------------------------===//
// Methods to update the analysis as a result of the client transformation.
//
@@ -565,9 +568,10 @@ void GlobalsModRef::deleteValue(Value *V) {
// any AllocRelatedValues for it.
if (IndirectGlobals.erase(GV)) {
// Remove any entries in AllocsForIndirectGlobals for this global.
- for (std::map<const Value*, const GlobalValue*>::iterator
- I = AllocsForIndirectGlobals.begin(),
- E = AllocsForIndirectGlobals.end(); I != E; ) {
+ for (std::map<const Value *, const GlobalValue *>::iterator
+ I = AllocsForIndirectGlobals.begin(),
+ E = AllocsForIndirectGlobals.end();
+ I != E;) {
if (I->second == GV) {
AllocsForIndirectGlobals.erase(I++);
} else {
@@ -585,16 +589,12 @@ void GlobalsModRef::deleteValue(Value *V) {
AliasAnalysis::deleteValue(V);
}
-void GlobalsModRef::copyValue(Value *From, Value *To) {
- AliasAnalysis::copyValue(From, To);
-}
-
void GlobalsModRef::addEscapingUse(Use &U) {
// For the purposes of this analysis, it is conservatively correct to treat
// a newly escaping value equivalently to a deleted one. We could perhaps
// be more precise by processing the new use and attempting to update our
// saved analysis results to accommodate it.
deleteValue(U);
-
+
AliasAnalysis::addEscapingUse(U);
}
diff --git a/lib/Analysis/IPA/InlineCost.cpp b/lib/Analysis/IPA/InlineCost.cpp
index 349b9cac2c2d..c0d2e375cb04 100644
--- a/lib/Analysis/IPA/InlineCost.cpp
+++ b/lib/Analysis/IPA/InlineCost.cpp
@@ -783,7 +783,7 @@ bool CallAnalyzer::visitCallSite(CallSite CS) {
case Intrinsic::memmove:
// SROA can usually chew through these intrinsics, but they aren't free.
return false;
- case Intrinsic::frameescape:
+ case Intrinsic::localescape:
HasFrameEscape = true;
return false;
}
@@ -1424,11 +1424,11 @@ bool InlineCostAnalysis::isInlineViable(Function &F) {
cast<CallInst>(CS.getInstruction())->canReturnTwice())
return false;
- // Disallow inlining functions that call @llvm.frameescape. Doing this
+ // Disallow inlining functions that call @llvm.localescape. Doing this
// correctly would require major changes to the inliner.
if (CS.getCalledFunction() &&
CS.getCalledFunction()->getIntrinsicID() ==
- llvm::Intrinsic::frameescape)
+ llvm::Intrinsic::localescape)
return false;
}
}
diff --git a/lib/Analysis/IVUsers.cpp b/lib/Analysis/IVUsers.cpp
index b88b2496b875..926787d3be91 100644
--- a/lib/Analysis/IVUsers.cpp
+++ b/lib/Analysis/IVUsers.cpp
@@ -12,8 +12,10 @@
//
//===----------------------------------------------------------------------===//
-#include "llvm/Analysis/IVUsers.h"
#include "llvm/ADT/STLExtras.h"
+#include "llvm/Analysis/AssumptionCache.h"
+#include "llvm/Analysis/CodeMetrics.h"
+#include "llvm/Analysis/IVUsers.h"
#include "llvm/Analysis/LoopPass.h"
#include "llvm/Analysis/ScalarEvolutionExpressions.h"
#include "llvm/Analysis/ValueTracking.h"
@@ -34,6 +36,7 @@ using namespace llvm;
char IVUsers::ID = 0;
INITIALIZE_PASS_BEGIN(IVUsers, "iv-users",
"Induction Variable Users", false, true)
+INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker)
INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass)
INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
INITIALIZE_PASS_DEPENDENCY(ScalarEvolution)
@@ -137,6 +140,11 @@ bool IVUsers::AddUsersImpl(Instruction *I,
if (Width > 64 || !DL.isLegalInteger(Width))
return false;
+ // Don't attempt to promote ephemeral values to indvars. They will be removed
+ // later anyway.
+ if (EphValues.count(I))
+ return false;
+
// Get the symbolic expression for this instruction.
const SCEV *ISE = SE->getSCEV(I);
@@ -244,6 +252,7 @@ IVUsers::IVUsers()
}
void IVUsers::getAnalysisUsage(AnalysisUsage &AU) const {
+ AU.addRequired<AssumptionCacheTracker>();
AU.addRequired<LoopInfoWrapperPass>();
AU.addRequired<DominatorTreeWrapperPass>();
AU.addRequired<ScalarEvolution>();
@@ -253,10 +262,16 @@ void IVUsers::getAnalysisUsage(AnalysisUsage &AU) const {
bool IVUsers::runOnLoop(Loop *l, LPPassManager &LPM) {
L = l;
+ AC = &getAnalysis<AssumptionCacheTracker>().getAssumptionCache(
+ *L->getHeader()->getParent());
LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree();
SE = &getAnalysis<ScalarEvolution>();
+ // Collect ephemeral values so that AddUsersIfInteresting skips them.
+ EphValues.clear();
+ CodeMetrics::collectEphemeralValues(L, AC, EphValues);
+
// Find all uses of induction variables in this loop, and categorize
// them by stride. Start by finding all of the PHI nodes in the header for
// this loop. If they are induction variables, inspect their uses.
diff --git a/lib/Analysis/InstructionSimplify.cpp b/lib/Analysis/InstructionSimplify.cpp
index 12e406bb1a2d..fa42b48b6cdb 100644
--- a/lib/Analysis/InstructionSimplify.cpp
+++ b/lib/Analysis/InstructionSimplify.cpp
@@ -24,6 +24,7 @@
#include "llvm/Analysis/ConstantFolding.h"
#include "llvm/Analysis/MemoryBuiltins.h"
#include "llvm/Analysis/ValueTracking.h"
+#include "llvm/Analysis/VectorUtils.h"
#include "llvm/IR/ConstantRange.h"
#include "llvm/IR/DataLayout.h"
#include "llvm/IR/Dominators.h"
@@ -3046,7 +3047,8 @@ Value *llvm::SimplifyICmpInst(unsigned Predicate, Value *LHS, Value *RHS,
/// SimplifyFCmpInst - Given operands for an FCmpInst, see if we can
/// fold the result. If not, this returns null.
static Value *SimplifyFCmpInst(unsigned Predicate, Value *LHS, Value *RHS,
- const Query &Q, unsigned MaxRecurse) {
+ FastMathFlags FMF, const Query &Q,
+ unsigned MaxRecurse) {
CmpInst::Predicate Pred = (CmpInst::Predicate)Predicate;
assert(CmpInst::isFPPredicate(Pred) && "Not an FP compare!");
@@ -3065,6 +3067,14 @@ static Value *SimplifyFCmpInst(unsigned Predicate, Value *LHS, Value *RHS,
if (Pred == FCmpInst::FCMP_TRUE)
return ConstantInt::get(GetCompareTy(LHS), 1);
+ // UNO/ORD predicates can be trivially folded if NaNs are ignored.
+ if (FMF.noNaNs()) {
+ if (Pred == FCmpInst::FCMP_UNO)
+ return ConstantInt::get(GetCompareTy(LHS), 0);
+ if (Pred == FCmpInst::FCMP_ORD)
+ return ConstantInt::get(GetCompareTy(LHS), 1);
+ }
+
// fcmp pred x, undef and fcmp pred undef, x
// fold to true if unordered, false if ordered
if (isa<UndefValue>(LHS) || isa<UndefValue>(RHS)) {
@@ -3151,12 +3161,12 @@ static Value *SimplifyFCmpInst(unsigned Predicate, Value *LHS, Value *RHS,
}
Value *llvm::SimplifyFCmpInst(unsigned Predicate, Value *LHS, Value *RHS,
- const DataLayout &DL,
+ FastMathFlags FMF, const DataLayout &DL,
const TargetLibraryInfo *TLI,
const DominatorTree *DT, AssumptionCache *AC,
const Instruction *CxtI) {
- return ::SimplifyFCmpInst(Predicate, LHS, RHS, Query(DL, TLI, DT, AC, CxtI),
- RecursionLimit);
+ return ::SimplifyFCmpInst(Predicate, LHS, RHS, FMF,
+ Query(DL, TLI, DT, AC, CxtI), RecursionLimit);
}
/// SimplifyWithOpReplaced - See if V simplifies when its operand Op is
@@ -3511,6 +3521,82 @@ Value *llvm::SimplifyInsertValueInst(
RecursionLimit);
}
+/// SimplifyExtractValueInst - Given operands for an ExtractValueInst, see if we
+/// can fold the result. If not, this returns null.
+static Value *SimplifyExtractValueInst(Value *Agg, ArrayRef<unsigned> Idxs,
+ const Query &, unsigned) {
+ if (auto *CAgg = dyn_cast<Constant>(Agg))
+ return ConstantFoldExtractValueInstruction(CAgg, Idxs);
+
+ // extractvalue x, (insertvalue y, elt, n), n -> elt
+ unsigned NumIdxs = Idxs.size();
+ for (auto *IVI = dyn_cast<InsertValueInst>(Agg); IVI != nullptr;
+ IVI = dyn_cast<InsertValueInst>(IVI->getAggregateOperand())) {
+ ArrayRef<unsigned> InsertValueIdxs = IVI->getIndices();
+ unsigned NumInsertValueIdxs = InsertValueIdxs.size();
+ unsigned NumCommonIdxs = std::min(NumInsertValueIdxs, NumIdxs);
+ if (InsertValueIdxs.slice(0, NumCommonIdxs) ==
+ Idxs.slice(0, NumCommonIdxs)) {
+ if (NumIdxs == NumInsertValueIdxs)
+ return IVI->getInsertedValueOperand();
+ break;
+ }
+ }
+
+ return nullptr;
+}
+
+Value *llvm::SimplifyExtractValueInst(Value *Agg, ArrayRef<unsigned> Idxs,
+ const DataLayout &DL,
+ const TargetLibraryInfo *TLI,
+ const DominatorTree *DT,
+ AssumptionCache *AC,
+ const Instruction *CxtI) {
+ return ::SimplifyExtractValueInst(Agg, Idxs, Query(DL, TLI, DT, AC, CxtI),
+ RecursionLimit);
+}
+
+/// SimplifyExtractElementInst - Given operands for an ExtractElementInst, see if we
+/// can fold the result. If not, this returns null.
+static Value *SimplifyExtractElementInst(Value *Vec, Value *Idx, const Query &,
+ unsigned) {
+ if (auto *CVec = dyn_cast<Constant>(Vec)) {
+ if (auto *CIdx = dyn_cast<Constant>(Idx))
+ return ConstantFoldExtractElementInstruction(CVec, CIdx);
+
+ // The index is not relevant if our vector is a splat.
+ if (auto *Splat = CVec->getSplatValue())
+ return Splat;
+
+ if (isa<UndefValue>(Vec))
+ return UndefValue::get(Vec->getType()->getVectorElementType());
+ }
+
+ // If extracting a specified index from the vector, see if we can recursively
+ // find a previously computed scalar that was inserted into the vector.
+ if (auto *IdxC = dyn_cast<ConstantInt>(Idx)) {
+ unsigned IndexVal = IdxC->getZExtValue();
+ unsigned VectorWidth = Vec->getType()->getVectorNumElements();
+
+ // If this is extracting an invalid index, turn this into undef, to avoid
+ // crashing the code below.
+ if (IndexVal >= VectorWidth)
+ return UndefValue::get(Vec->getType()->getVectorElementType());
+
+ if (Value *Elt = findScalarElement(Vec, IndexVal))
+ return Elt;
+ }
+
+ return nullptr;
+}
+
+Value *llvm::SimplifyExtractElementInst(
+ Value *Vec, Value *Idx, const DataLayout &DL, const TargetLibraryInfo *TLI,
+ const DominatorTree *DT, AssumptionCache *AC, const Instruction *CxtI) {
+ return ::SimplifyExtractElementInst(Vec, Idx, Query(DL, TLI, DT, AC, CxtI),
+ RecursionLimit);
+}
+
/// SimplifyPHINode - See if we can fold the given phi. If not, returns null.
static Value *SimplifyPHINode(PHINode *PN, const Query &Q) {
// If all of the PHI's incoming values are the same then replace the PHI node
@@ -3670,7 +3756,7 @@ static Value *SimplifyCmpInst(unsigned Predicate, Value *LHS, Value *RHS,
const Query &Q, unsigned MaxRecurse) {
if (CmpInst::isIntPredicate((CmpInst::Predicate)Predicate))
return SimplifyICmpInst(Predicate, LHS, RHS, Q, MaxRecurse);
- return SimplifyFCmpInst(Predicate, LHS, RHS, Q, MaxRecurse);
+ return SimplifyFCmpInst(Predicate, LHS, RHS, FastMathFlags(), Q, MaxRecurse);
}
Value *llvm::SimplifyCmpInst(unsigned Predicate, Value *LHS, Value *RHS,
@@ -3900,9 +3986,9 @@ Value *llvm::SimplifyInstruction(Instruction *I, const DataLayout &DL,
I->getOperand(1), DL, TLI, DT, AC, I);
break;
case Instruction::FCmp:
- Result =
- SimplifyFCmpInst(cast<FCmpInst>(I)->getPredicate(), I->getOperand(0),
- I->getOperand(1), DL, TLI, DT, AC, I);
+ Result = SimplifyFCmpInst(cast<FCmpInst>(I)->getPredicate(),
+ I->getOperand(0), I->getOperand(1),
+ I->getFastMathFlags(), DL, TLI, DT, AC, I);
break;
case Instruction::Select:
Result = SimplifySelectInst(I->getOperand(0), I->getOperand(1),
@@ -3920,6 +4006,18 @@ Value *llvm::SimplifyInstruction(Instruction *I, const DataLayout &DL,
IV->getIndices(), DL, TLI, DT, AC, I);
break;
}
+ case Instruction::ExtractValue: {
+ auto *EVI = cast<ExtractValueInst>(I);
+ Result = SimplifyExtractValueInst(EVI->getAggregateOperand(),
+ EVI->getIndices(), DL, TLI, DT, AC, I);
+ break;
+ }
+ case Instruction::ExtractElement: {
+ auto *EEI = cast<ExtractElementInst>(I);
+ Result = SimplifyExtractElementInst(
+ EEI->getVectorOperand(), EEI->getIndexOperand(), DL, TLI, DT, AC, I);
+ break;
+ }
case Instruction::PHI:
Result = SimplifyPHINode(cast<PHINode>(I), Query(DL, TLI, DT, AC, I));
break;
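A hedged sketch of a caller using the new FastMathFlags parameter of SimplifyFCmpInst: forwarding an instruction's own flags lets ord/uno compares fold when nnan is set. FC is assumed to be an existing FCmpInst, and DL, TLI, DT, AC the usual analyses.
// With nnan on FC, "fcmp uno x, y" folds to false and "fcmp ord x, y" to true.
if (Value *V = SimplifyFCmpInst(FC->getPredicate(), FC->getOperand(0),
                                FC->getOperand(1), FC->getFastMathFlags(),
                                DL, TLI, DT, AC, FC)) {
  FC->replaceAllUsesWith(V);
  FC->eraseFromParent();
}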
diff --git a/lib/Analysis/LoopAccessAnalysis.cpp b/lib/Analysis/LoopAccessAnalysis.cpp
index b11cd7e84a6d..becbae4c5b50 100644
--- a/lib/Analysis/LoopAccessAnalysis.cpp
+++ b/lib/Analysis/LoopAccessAnalysis.cpp
@@ -48,6 +48,13 @@ static cl::opt<unsigned, true> RuntimeMemoryCheckThreshold(
cl::location(VectorizerParams::RuntimeMemoryCheckThreshold), cl::init(8));
unsigned VectorizerParams::RuntimeMemoryCheckThreshold;
+/// \brief The maximum number of comparisons allowed while merging memory checks
+static cl::opt<unsigned> MemoryCheckMergeThreshold(
+ "memory-check-merge-threshold", cl::Hidden,
+ cl::desc("Maximum number of comparisons done when trying to merge "
+ "runtime memory checks. (default = 100)"),
+ cl::init(100));
+
/// Maximum SIMD width.
const unsigned VectorizerParams::MaxVectorWidth = 64;
@@ -112,35 +119,182 @@ const SCEV *llvm::replaceSymbolicStrideSCEV(ScalarEvolution *SE,
return SE->getSCEV(Ptr);
}
-void LoopAccessInfo::RuntimePointerCheck::insert(
- ScalarEvolution *SE, Loop *Lp, Value *Ptr, bool WritePtr, unsigned DepSetId,
- unsigned ASId, const ValueToValueMap &Strides) {
+void RuntimePointerChecking::insert(Loop *Lp, Value *Ptr, bool WritePtr,
+ unsigned DepSetId, unsigned ASId,
+ const ValueToValueMap &Strides) {
// Get the stride replaced scev.
const SCEV *Sc = replaceSymbolicStrideSCEV(SE, Strides, Ptr);
const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(Sc);
assert(AR && "Invalid addrec expression");
const SCEV *Ex = SE->getBackedgeTakenCount(Lp);
const SCEV *ScEnd = AR->evaluateAtIteration(Ex, *SE);
- Pointers.push_back(Ptr);
- Starts.push_back(AR->getStart());
- Ends.push_back(ScEnd);
- IsWritePtr.push_back(WritePtr);
- DependencySetId.push_back(DepSetId);
- AliasSetId.push_back(ASId);
+ Pointers.emplace_back(Ptr, AR->getStart(), ScEnd, WritePtr, DepSetId, ASId,
+ Sc);
+}
+
+bool RuntimePointerChecking::needsChecking(
+ const CheckingPtrGroup &M, const CheckingPtrGroup &N,
+ const SmallVectorImpl<int> *PtrPartition) const {
+ for (unsigned I = 0, EI = M.Members.size(); EI != I; ++I)
+ for (unsigned J = 0, EJ = N.Members.size(); EJ != J; ++J)
+ if (needsChecking(M.Members[I], N.Members[J], PtrPartition))
+ return true;
+ return false;
+}
+
+/// Compare \p I and \p J and return the minimum.
+/// Return nullptr in case we couldn't find an answer.
+static const SCEV *getMinFromExprs(const SCEV *I, const SCEV *J,
+ ScalarEvolution *SE) {
+ const SCEV *Diff = SE->getMinusSCEV(J, I);
+ const SCEVConstant *C = dyn_cast<const SCEVConstant>(Diff);
+
+ if (!C)
+ return nullptr;
+ if (C->getValue()->isNegative())
+ return J;
+ return I;
+}
+
+bool RuntimePointerChecking::CheckingPtrGroup::addPointer(unsigned Index) {
+ const SCEV *Start = RtCheck.Pointers[Index].Start;
+ const SCEV *End = RtCheck.Pointers[Index].End;
+
+ // Compare the starts and ends with the known minimum and maximum
+ // of this set. We need to know how we compare against the min/max
+ // of the set in order to be able to emit memchecks.
+ const SCEV *Min0 = getMinFromExprs(Start, Low, RtCheck.SE);
+ if (!Min0)
+ return false;
+
+ const SCEV *Min1 = getMinFromExprs(End, High, RtCheck.SE);
+ if (!Min1)
+ return false;
+
+ // Update the low bound expression if we've found a new min value.
+ if (Min0 == Start)
+ Low = Start;
+
+ // Update the high bound expression if we've found a new max value.
+ if (Min1 != End)
+ High = End;
+
+ Members.push_back(Index);
+ return true;
}
-bool LoopAccessInfo::RuntimePointerCheck::needsChecking(
+void RuntimePointerChecking::groupChecks(
+ MemoryDepChecker::DepCandidates &DepCands, bool UseDependencies) {
+ // We build the groups from dependency candidates equivalence classes
+ // because:
+ // - We know that pointers in the same equivalence class share
+ // the same underlying object and therefore there is a chance
+ // that we can compare pointers
+ // - We wouldn't be able to merge two pointers for which we need
+ // to emit a memcheck. The classes in DepCands are already
+ // conveniently built such that no two pointers in the same
+ // class need checking against each other.
+
+ // We use the following (greedy) algorithm to construct the groups
+ // For every pointer in the equivalence class:
+ // For each existing group:
+ // - if the difference between this pointer and the min/max bounds
+ // of the group is a constant, then make the pointer part of the
+ // group and update the min/max bounds of that group as required.
+
+ CheckingGroups.clear();
+
+ // If we don't have the dependency partitions, construct a new
+ // checking pointer group for each pointer.
+ if (!UseDependencies) {
+ for (unsigned I = 0; I < Pointers.size(); ++I)
+ CheckingGroups.push_back(CheckingPtrGroup(I, *this));
+ return;
+ }
+
+ unsigned TotalComparisons = 0;
+
+ DenseMap<Value *, unsigned> PositionMap;
+ for (unsigned Index = 0; Index < Pointers.size(); ++Index)
+ PositionMap[Pointers[Index].PointerValue] = Index;
+
+ // We need to keep track of what pointers we've already seen so we
+ // don't process them twice.
+ SmallSet<unsigned, 2> Seen;
+
+ // Go through all equivalence classes, get the "pointer check groups"
+ // and add them to the overall solution. We use the order in which accesses
+ // appear in 'Pointers' to enforce determinism.
+ for (unsigned I = 0; I < Pointers.size(); ++I) {
+ // We've seen this pointer before, and therefore already processed
+ // its equivalence class.
+ if (Seen.count(I))
+ continue;
+
+ MemoryDepChecker::MemAccessInfo Access(Pointers[I].PointerValue,
+ Pointers[I].IsWritePtr);
+
+ SmallVector<CheckingPtrGroup, 2> Groups;
+ auto LeaderI = DepCands.findValue(DepCands.getLeaderValue(Access));
+
+ // Because DepCands is constructed by visiting accesses in the order in
+ // which they appear in alias sets (which is deterministic) and the
+ // iteration order within an equivalence class member is only dependent on
+ // the order in which unions and insertions are performed on the
+ // equivalence class, the iteration order is deterministic.
+ for (auto MI = DepCands.member_begin(LeaderI), ME = DepCands.member_end();
+ MI != ME; ++MI) {
+ unsigned Pointer = PositionMap[MI->getPointer()];
+ bool Merged = false;
+ // Mark this pointer as seen.
+ Seen.insert(Pointer);
+
+ // Go through all the existing sets and see if we can find one
+ // which can include this pointer.
+ for (CheckingPtrGroup &Group : Groups) {
+ // Don't perform more than a certain number of comparisons.
+ // This should limit the cost of grouping the pointers to something
+ // reasonable. If we do end up hitting this threshold, the algorithm
+ // will create separate groups for all remaining pointers.
+ if (TotalComparisons > MemoryCheckMergeThreshold)
+ break;
+
+ TotalComparisons++;
+
+ if (Group.addPointer(Pointer)) {
+ Merged = true;
+ break;
+ }
+ }
+
+ if (!Merged)
+ // We couldn't add this pointer to any existing set or the threshold
+ // for the number of comparisons has been reached. Create a new group
+ // to hold the current pointer.
+ Groups.push_back(CheckingPtrGroup(Pointer, *this));
+ }
+
+ // We've computed the grouped checks for this partition.
+ // Save the results and continue with the next one.
+ std::copy(Groups.begin(), Groups.end(), std::back_inserter(CheckingGroups));
+ }
+}
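
A standalone way to picture the grouping above is with integer ranges instead of SCEVs. Everything in the sketch below (Range, Group, the "same base object" test) is an assumed simplification for illustration and is not part of this patch: "the difference of two SCEVs is a constant" is modelled as "both ranges belong to the same object", and the MemoryCheckMergeThreshold comparison budget is omitted.

  // Minimal sketch (assumed names, not LLVM API) of greedy check grouping.
  #include <algorithm>
  #include <cstdio>
  #include <vector>

  struct Range { unsigned Obj; long Start, End; };

  struct Group {
    unsigned Obj;
    long Low, High;
    std::vector<unsigned> Members;
    Group(unsigned Idx, const std::vector<Range> &R)
        : Obj(R[Idx].Obj), Low(R[Idx].Start), High(R[Idx].End), Members{Idx} {}
    // Mirrors CheckingPtrGroup::addPointer: refuse if the range cannot be
    // compared against the current bounds, otherwise widen [Low, High).
    bool add(unsigned Idx, const std::vector<Range> &R) {
      if (R[Idx].Obj != Obj)
        return false;
      Low = std::min(Low, R[Idx].Start);
      High = std::max(High, R[Idx].End);
      Members.push_back(Idx);
      return true;
    }
  };

  int main() {
    // Accesses 0 and 1 share an object and collapse into one [Low, High)
    // check; access 2 uses another object and starts its own group.
    std::vector<Range> Ptrs = {{0, 0, 16}, {0, 16, 32}, {1, 0, 16}};
    std::vector<Group> Groups;
    for (unsigned I = 0; I < Ptrs.size(); ++I) {
      bool Merged = false;
      for (Group &G : Groups)
        if ((Merged = G.add(I, Ptrs)))
          break;
      if (!Merged)
        Groups.emplace_back(I, Ptrs);
    }
    for (const Group &G : Groups)
      std::printf("group over obj %u: [%ld, %ld) with %zu members\n", G.Obj,
                  G.Low, G.High, G.Members.size());
  }

The payoff is the same as in groupChecks: ranges that can be bounded together cost one runtime comparison per group pair rather than one per pointer pair.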
+
+bool RuntimePointerChecking::needsChecking(
unsigned I, unsigned J, const SmallVectorImpl<int> *PtrPartition) const {
+ const PointerInfo &PointerI = Pointers[I];
+ const PointerInfo &PointerJ = Pointers[J];
+
// No need to check if two readonly pointers intersect.
- if (!IsWritePtr[I] && !IsWritePtr[J])
+ if (!PointerI.IsWritePtr && !PointerJ.IsWritePtr)
return false;
// Only need to check pointers between two different dependency sets.
- if (DependencySetId[I] == DependencySetId[J])
+ if (PointerI.DependencySetId == PointerJ.DependencySetId)
return false;
// Only need to check pointers in the same alias set.
- if (AliasSetId[I] != AliasSetId[J])
+ if (PointerI.AliasSetId != PointerJ.AliasSetId)
return false;
// If PtrPartition is set omit checks between pointers of the same partition.
@@ -153,45 +307,75 @@ bool LoopAccessInfo::RuntimePointerCheck::needsChecking(
return true;
}
-void LoopAccessInfo::RuntimePointerCheck::print(
+void RuntimePointerChecking::print(
raw_ostream &OS, unsigned Depth,
const SmallVectorImpl<int> *PtrPartition) const {
- unsigned NumPointers = Pointers.size();
- if (NumPointers == 0)
- return;
OS.indent(Depth) << "Run-time memory checks:\n";
+
unsigned N = 0;
- for (unsigned I = 0; I < NumPointers; ++I)
- for (unsigned J = I + 1; J < NumPointers; ++J)
- if (needsChecking(I, J, PtrPartition)) {
- OS.indent(Depth) << N++ << ":\n";
- OS.indent(Depth + 2) << *Pointers[I];
- if (PtrPartition)
- OS << " (Partition: " << (*PtrPartition)[I] << ")";
- OS << "\n";
- OS.indent(Depth + 2) << *Pointers[J];
- if (PtrPartition)
- OS << " (Partition: " << (*PtrPartition)[J] << ")";
- OS << "\n";
+ for (unsigned I = 0; I < CheckingGroups.size(); ++I)
+ for (unsigned J = I + 1; J < CheckingGroups.size(); ++J)
+ if (needsChecking(CheckingGroups[I], CheckingGroups[J], PtrPartition)) {
+ OS.indent(Depth) << "Check " << N++ << ":\n";
+ OS.indent(Depth + 2) << "Comparing group " << I << ":\n";
+
+ for (unsigned K = 0; K < CheckingGroups[I].Members.size(); ++K) {
+ OS.indent(Depth + 2)
+ << *Pointers[CheckingGroups[I].Members[K]].PointerValue << "\n";
+ if (PtrPartition)
+ OS << " (Partition: "
+ << (*PtrPartition)[CheckingGroups[I].Members[K]] << ")"
+ << "\n";
+ }
+
+ OS.indent(Depth + 2) << "Against group " << J << ":\n";
+
+ for (unsigned K = 0; K < CheckingGroups[J].Members.size(); ++K) {
+ OS.indent(Depth + 2)
+ << *Pointers[CheckingGroups[J].Members[K]].PointerValue << "\n";
+ if (PtrPartition)
+ OS << " (Partition: "
+ << (*PtrPartition)[CheckingGroups[J].Members[K]] << ")"
+ << "\n";
+ }
}
+
+ OS.indent(Depth) << "Grouped accesses:\n";
+ for (unsigned I = 0; I < CheckingGroups.size(); ++I) {
+ OS.indent(Depth + 2) << "Group " << I << ":\n";
+ OS.indent(Depth + 4) << "(Low: " << *CheckingGroups[I].Low
+ << " High: " << *CheckingGroups[I].High << ")\n";
+ for (unsigned J = 0; J < CheckingGroups[I].Members.size(); ++J) {
+ OS.indent(Depth + 6) << "Member: "
+ << *Pointers[CheckingGroups[I].Members[J]].Expr
+ << "\n";
+ }
+ }
}
-unsigned LoopAccessInfo::RuntimePointerCheck::getNumberOfChecks(
+unsigned RuntimePointerChecking::getNumberOfChecks(
const SmallVectorImpl<int> *PtrPartition) const {
- unsigned NumPointers = Pointers.size();
+
+ unsigned NumPartitions = CheckingGroups.size();
unsigned CheckCount = 0;
- for (unsigned I = 0; I < NumPointers; ++I)
- for (unsigned J = I + 1; J < NumPointers; ++J)
- if (needsChecking(I, J, PtrPartition))
+ for (unsigned I = 0; I < NumPartitions; ++I)
+ for (unsigned J = I + 1; J < NumPartitions; ++J)
+ if (needsChecking(CheckingGroups[I], CheckingGroups[J], PtrPartition))
CheckCount++;
return CheckCount;
}
-bool LoopAccessInfo::RuntimePointerCheck::needsAnyChecking(
+bool RuntimePointerChecking::needsAnyChecking(
const SmallVectorImpl<int> *PtrPartition) const {
- return getNumberOfChecks(PtrPartition) != 0;
+ unsigned NumPointers = Pointers.size();
+
+ for (unsigned I = 0; I < NumPointers; ++I)
+ for (unsigned J = I + 1; J < NumPointers; ++J)
+ if (needsChecking(I, J, PtrPartition))
+ return true;
+ return false;
}
namespace {
@@ -207,7 +391,8 @@ public:
AccessAnalysis(const DataLayout &Dl, AliasAnalysis *AA, LoopInfo *LI,
MemoryDepChecker::DepCandidates &DA)
- : DL(Dl), AST(*AA), LI(LI), DepCands(DA), IsRTCheckNeeded(false) {}
+ : DL(Dl), AST(*AA), LI(LI), DepCands(DA),
+ IsRTCheckAnalysisNeeded(false) {}
/// \brief Register a load and whether it is only read from.
void addLoad(MemoryLocation &Loc, bool IsReadOnly) {
@@ -226,11 +411,12 @@ public:
}
/// \brief Check whether we can check the pointers at runtime for
- /// non-intersection. Returns true when we have 0 pointers
- /// (a check on 0 pointers for non-intersection will always return true).
- bool canCheckPtrAtRT(LoopAccessInfo::RuntimePointerCheck &RtCheck,
- bool &NeedRTCheck, ScalarEvolution *SE, Loop *TheLoop,
- const ValueToValueMap &Strides,
+ /// non-intersection.
+ ///
+ /// Returns true if we need no check or if we do and we can generate them
+ /// (i.e. the pointers have computable bounds).
+ bool canCheckPtrAtRT(RuntimePointerChecking &RtCheck, ScalarEvolution *SE,
+ Loop *TheLoop, const ValueToValueMap &Strides,
bool ShouldCheckStride = false);
/// \brief Goes over all memory accesses, checks whether a RT check is needed
@@ -239,8 +425,11 @@ public:
processMemAccesses();
}
- bool isRTCheckNeeded() { return IsRTCheckNeeded; }
-
+ /// \brief Initial processing of memory accesses determined that we need to
+ /// perform dependency checking.
+ ///
+ /// Note that this can later be cleared if we retry memcheck analysis without
+ /// dependency checking (i.e. ShouldRetryWithRuntimeCheck).
bool isDependencyCheckNeeded() { return !CheckDeps.empty(); }
/// We decided that no dependence analysis would be used. Reset the state.
@@ -255,7 +444,7 @@ private:
typedef SetVector<MemAccessInfo> PtrAccessSet;
/// \brief Go over all memory access and check whether runtime pointer checks
- /// are needed /// and build sets of dependency check candidates.
+ /// are needed and build sets of dependency check candidates.
void processMemAccesses();
/// Set of all accesses.
@@ -280,7 +469,14 @@ private:
/// dependence check.
MemoryDepChecker::DepCandidates &DepCands;
- bool IsRTCheckNeeded;
+ /// \brief Initial processing of memory accesses determined that we may need
+ /// to add memchecks. Perform the analysis to determine the necessary checks.
+ ///
+ /// Note that this is different from isDependencyCheckNeeded. When we retry
+ /// memcheck analysis without dependency checking
+ /// (i.e. ShouldRetryWithRuntimeCheck), isDependencyCheckNeeded is cleared
+ /// while this remains set if we have potentially dependent accesses.
+ bool IsRTCheckAnalysisNeeded;
};
} // end anonymous namespace
@@ -296,16 +492,16 @@ static bool hasComputableBounds(ScalarEvolution *SE,
return AR->isAffine();
}
-bool AccessAnalysis::canCheckPtrAtRT(
- LoopAccessInfo::RuntimePointerCheck &RtCheck, bool &NeedRTCheck,
- ScalarEvolution *SE, Loop *TheLoop, const ValueToValueMap &StridesMap,
- bool ShouldCheckStride) {
+bool AccessAnalysis::canCheckPtrAtRT(RuntimePointerChecking &RtCheck,
+ ScalarEvolution *SE, Loop *TheLoop,
+ const ValueToValueMap &StridesMap,
+ bool ShouldCheckStride) {
// Find pointers with computable bounds. We are going to use this information
// to place a runtime bound check.
bool CanDoRT = true;
- NeedRTCheck = false;
- if (!IsRTCheckNeeded) return true;
+ bool NeedRTCheck = false;
+ if (!IsRTCheckAnalysisNeeded) return true;
bool IsDepCheckNeeded = isDependencyCheckNeeded();
@@ -313,6 +509,9 @@ bool AccessAnalysis::canCheckPtrAtRT(
// Accesses between different groups don't need to be checked.
unsigned ASId = 1;
for (auto &AS : AST) {
+ int NumReadPtrChecks = 0;
+ int NumWritePtrChecks = 0;
+
// We assign consecutive id to access from different dependence sets.
// Accesses within the same set don't need a runtime check.
unsigned RunningDepId = 1;
@@ -323,6 +522,11 @@ bool AccessAnalysis::canCheckPtrAtRT(
bool IsWrite = Accesses.count(MemAccessInfo(Ptr, true));
MemAccessInfo Access(Ptr, IsWrite);
+ if (IsWrite)
+ ++NumWritePtrChecks;
+ else
+ ++NumReadPtrChecks;
+
if (hasComputableBounds(SE, StridesMap, Ptr) &&
// When we run after a failing dependency check we have to make sure
// we don't have wrapping pointers.
@@ -341,7 +545,7 @@ bool AccessAnalysis::canCheckPtrAtRT(
// Each access has its own dependence set.
DepId = RunningDepId++;
- RtCheck.insert(SE, TheLoop, Ptr, IsWrite, DepId, ASId, StridesMap);
+ RtCheck.insert(TheLoop, Ptr, IsWrite, DepId, ASId, StridesMap);
DEBUG(dbgs() << "LAA: Found a runtime check ptr:" << *Ptr << '\n');
} else {
@@ -350,15 +554,21 @@ bool AccessAnalysis::canCheckPtrAtRT(
}
}
+ // If we have at least two writes or one write and a read then we need to
+ // check them. But there is no need to check if there is only one
+ // dependence set for this alias set.
+ //
+ // Note that this function computes CanDoRT and NeedRTCheck independently.
+ // For example CanDoRT=false, NeedRTCheck=false means that we have a pointer
+ // for which we couldn't find the bounds but we don't actually need to emit
+ // any checks so it does not matter.
+ if (!(IsDepCheckNeeded && CanDoRT && RunningDepId == 2))
+ NeedRTCheck |= (NumWritePtrChecks >= 2 || (NumReadPtrChecks >= 1 &&
+ NumWritePtrChecks >= 1));
+
++ASId;
}
- // We need a runtime check if there are any accesses that need checking.
- // However, some accesses cannot be checked (for example because we
- // can't determine their bounds). In these cases we would need a check
- // but wouldn't be able to add it.
- NeedRTCheck = !CanDoRT || RtCheck.needsAnyChecking(nullptr);
-
// If the pointers that we would use for the bounds comparison have different
// address spaces, assume the values aren't directly comparable, so we can't
// use them for the runtime check. We also have to assume they could
@@ -368,14 +578,15 @@ bool AccessAnalysis::canCheckPtrAtRT(
for (unsigned i = 0; i < NumPointers; ++i) {
for (unsigned j = i + 1; j < NumPointers; ++j) {
// Only need to check pointers between two different dependency sets.
- if (RtCheck.DependencySetId[i] == RtCheck.DependencySetId[j])
+ if (RtCheck.Pointers[i].DependencySetId ==
+ RtCheck.Pointers[j].DependencySetId)
continue;
// Only need to check pointers in the same alias set.
- if (RtCheck.AliasSetId[i] != RtCheck.AliasSetId[j])
+ if (RtCheck.Pointers[i].AliasSetId != RtCheck.Pointers[j].AliasSetId)
continue;
- Value *PtrI = RtCheck.Pointers[i];
- Value *PtrJ = RtCheck.Pointers[j];
+ Value *PtrI = RtCheck.Pointers[i].PointerValue;
+ Value *PtrJ = RtCheck.Pointers[j].PointerValue;
unsigned ASi = PtrI->getType()->getPointerAddressSpace();
unsigned ASj = PtrJ->getType()->getPointerAddressSpace();
@@ -387,7 +598,18 @@ bool AccessAnalysis::canCheckPtrAtRT(
}
}
- return CanDoRT;
+ if (NeedRTCheck && CanDoRT)
+ RtCheck.groupChecks(DepCands, IsDepCheckNeeded);
+
+ DEBUG(dbgs() << "LAA: We need to do " << RtCheck.getNumberOfChecks(nullptr)
+ << " pointer comparisons.\n");
+
+ RtCheck.Need = NeedRTCheck;
+
+ bool CanDoRTIfNeeded = !NeedRTCheck || CanDoRT;
+ if (!CanDoRTIfNeeded)
+ RtCheck.reset();
+ return CanDoRTIfNeeded;
}
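
As a quick restatement of the per-alias-set NeedRTCheck accumulation above, here is a hedged standalone predicate; the helper name is invented for illustration, and SingleDependenceSet stands in for the patch's "IsDepCheckNeeded && CanDoRT && RunningDepId == 2" condition.

  #include <cstdio>

  // Checks are needed when at least two writes, or one write plus one read,
  // target the alias set -- unless all its pointers ended up in a single
  // dependence set.
  static bool aliasSetNeedsRuntimeChecks(int NumReadPtrChecks,
                                         int NumWritePtrChecks,
                                         bool SingleDependenceSet) {
    if (SingleDependenceSet)
      return false;
    return NumWritePtrChecks >= 2 ||
           (NumWritePtrChecks >= 1 && NumReadPtrChecks >= 1);
  }

  int main() {
    std::printf("%d\n", aliasSetNeedsRuntimeChecks(2, 0, false)); // 0: reads only
    std::printf("%d\n", aliasSetNeedsRuntimeChecks(1, 1, false)); // 1: read vs write
    std::printf("%d\n", aliasSetNeedsRuntimeChecks(1, 1, true));  // 0: one dep set
  }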
void AccessAnalysis::processMemAccesses() {
@@ -470,7 +692,7 @@ void AccessAnalysis::processMemAccesses() {
// catch "a[i] = a[i] + " without having to do a dependence check).
if ((IsWrite || IsReadOnlyPtr) && SetHasWrite) {
CheckDeps.insert(Access);
- IsRTCheckNeeded = true;
+ IsRTCheckAnalysisNeeded = true;
}
if (IsWrite)
@@ -600,7 +822,7 @@ int llvm::isStridedPtr(ScalarEvolution *SE, Value *Ptr, const Loop *Lp,
// Check the step is constant.
const SCEV *Step = AR->getStepRecurrence(*SE);
- // Calculate the pointer stride and check if it is consecutive.
+ // Calculate the pointer stride and check if it is constant.
const SCEVConstant *C = dyn_cast<SCEVConstant>(Step);
if (!C) {
DEBUG(dbgs() << "LAA: Bad stride - Not a constant strided " << *Ptr <<
@@ -805,11 +1027,11 @@ MemoryDepChecker::isDependent(const MemAccessInfo &A, unsigned AIdx,
DEBUG(dbgs() << "LAA: Distance for " << *InstMap[AIdx] << " to "
<< *InstMap[BIdx] << ": " << *Dist << "\n");
- // Need consecutive accesses. We don't want to vectorize
+ // Need accesses with constant stride. We don't want to vectorize
// "A[B[i]] += ..." and similar code or pointer arithmetic that could wrap in
// the address space.
if (!StrideAPtr || !StrideBPtr || StrideAPtr != StrideBPtr){
- DEBUG(dbgs() << "Non-consecutive pointer access\n");
+ DEBUG(dbgs() << "Pointer access with non-constant stride\n");
return Dependence::Unknown;
}
@@ -859,8 +1081,10 @@ MemoryDepChecker::isDependent(const MemAccessInfo &A, unsigned AIdx,
unsigned Stride = std::abs(StrideAPtr);
if (Stride > 1 &&
- areStridedAccessesIndependent(Distance, Stride, TypeByteSize))
+ areStridedAccessesIndependent(Distance, Stride, TypeByteSize)) {
+ DEBUG(dbgs() << "LAA: Strided accesses are independent\n");
return Dependence::NoDep;
+ }
// Bail out early if passed-in parameters make vectorization not feasible.
unsigned ForcedFactor = (VectorizerParams::VectorizationFactor ?
@@ -1098,8 +1322,8 @@ void LoopAccessInfo::analyzeLoop(const ValueToValueMap &Strides) {
unsigned NumReads = 0;
unsigned NumReadWrites = 0;
- PtrRtCheck.Pointers.clear();
- PtrRtCheck.Need = false;
+ PtrRtChecking.Pointers.clear();
+ PtrRtChecking.Need = false;
const bool IsAnnotatedParallel = TheLoop->isAnnotatedParallel();
@@ -1258,28 +1482,17 @@ void LoopAccessInfo::analyzeLoop(const ValueToValueMap &Strides) {
// Find pointers with computable bounds. We are going to use this information
// to place a runtime bound check.
- bool NeedRTCheck;
- bool CanDoRT = Accesses.canCheckPtrAtRT(PtrRtCheck,
- NeedRTCheck, SE,
- TheLoop, Strides);
-
- DEBUG(dbgs() << "LAA: We need to do "
- << PtrRtCheck.getNumberOfChecks(nullptr)
- << " pointer comparisons.\n");
-
- // Check that we found the bounds for the pointer.
- if (CanDoRT)
- DEBUG(dbgs() << "LAA: We can perform a memory runtime check if needed.\n");
- else if (NeedRTCheck) {
+ bool CanDoRTIfNeeded =
+ Accesses.canCheckPtrAtRT(PtrRtChecking, SE, TheLoop, Strides);
+ if (!CanDoRTIfNeeded) {
emitAnalysis(LoopAccessReport() << "cannot identify array bounds");
- DEBUG(dbgs() << "LAA: We can't vectorize because we can't find " <<
- "the array bounds.\n");
- PtrRtCheck.reset();
+ DEBUG(dbgs() << "LAA: We can't vectorize because we can't find "
+ << "the array bounds.\n");
CanVecMem = false;
return;
}
- PtrRtCheck.Need = NeedRTCheck;
+ DEBUG(dbgs() << "LAA: We can perform a memory runtime check if needed.\n");
CanVecMem = true;
if (Accesses.isDependencyCheckNeeded()) {
@@ -1290,23 +1503,21 @@ void LoopAccessInfo::analyzeLoop(const ValueToValueMap &Strides) {
if (!CanVecMem && DepChecker.shouldRetryWithRuntimeCheck()) {
DEBUG(dbgs() << "LAA: Retrying with memory checks\n");
- NeedRTCheck = true;
// Clear the dependency checks. We assume they are not needed.
Accesses.resetDepChecks(DepChecker);
- PtrRtCheck.reset();
- PtrRtCheck.Need = true;
+ PtrRtChecking.reset();
+ PtrRtChecking.Need = true;
- CanDoRT = Accesses.canCheckPtrAtRT(PtrRtCheck, NeedRTCheck, SE,
- TheLoop, Strides, true);
+ CanDoRTIfNeeded =
+ Accesses.canCheckPtrAtRT(PtrRtChecking, SE, TheLoop, Strides, true);
// Check that we found the bounds for the pointer.
- if (NeedRTCheck && !CanDoRT) {
+ if (!CanDoRTIfNeeded) {
emitAnalysis(LoopAccessReport()
<< "cannot check memory dependencies at runtime");
DEBUG(dbgs() << "LAA: Can't vectorize with memory checks\n");
- PtrRtCheck.reset();
CanVecMem = false;
return;
}
@@ -1317,8 +1528,8 @@ void LoopAccessInfo::analyzeLoop(const ValueToValueMap &Strides) {
if (CanVecMem)
DEBUG(dbgs() << "LAA: No unsafe dependent memory operations in loop. We"
- << (NeedRTCheck ? "" : " don't")
- << " need a runtime memory check.\n");
+ << (PtrRtChecking.Need ? "" : " don't")
+ << " need runtime memory checks.\n");
else {
emitAnalysis(LoopAccessReport() <<
"unsafe dependent memory operations in loop");
@@ -1357,35 +1568,38 @@ static Instruction *getFirstInst(Instruction *FirstInst, Value *V,
std::pair<Instruction *, Instruction *> LoopAccessInfo::addRuntimeCheck(
Instruction *Loc, const SmallVectorImpl<int> *PtrPartition) const {
- if (!PtrRtCheck.Need)
+ if (!PtrRtChecking.Need)
return std::make_pair(nullptr, nullptr);
- unsigned NumPointers = PtrRtCheck.Pointers.size();
- SmallVector<TrackingVH<Value> , 2> Starts;
- SmallVector<TrackingVH<Value> , 2> Ends;
+ SmallVector<TrackingVH<Value>, 2> Starts;
+ SmallVector<TrackingVH<Value>, 2> Ends;
LLVMContext &Ctx = Loc->getContext();
SCEVExpander Exp(*SE, DL, "induction");
Instruction *FirstInst = nullptr;
- for (unsigned i = 0; i < NumPointers; ++i) {
- Value *Ptr = PtrRtCheck.Pointers[i];
+ for (unsigned i = 0; i < PtrRtChecking.CheckingGroups.size(); ++i) {
+ const RuntimePointerChecking::CheckingPtrGroup &CG =
+ PtrRtChecking.CheckingGroups[i];
+ Value *Ptr = PtrRtChecking.Pointers[CG.Members[0]].PointerValue;
const SCEV *Sc = SE->getSCEV(Ptr);
if (SE->isLoopInvariant(Sc, TheLoop)) {
- DEBUG(dbgs() << "LAA: Adding RT check for a loop invariant ptr:" <<
- *Ptr <<"\n");
+ DEBUG(dbgs() << "LAA: Adding RT check for a loop invariant ptr:" << *Ptr
+ << "\n");
Starts.push_back(Ptr);
Ends.push_back(Ptr);
} else {
- DEBUG(dbgs() << "LAA: Adding RT check for range:" << *Ptr << '\n');
unsigned AS = Ptr->getType()->getPointerAddressSpace();
// Use this type for pointer arithmetic.
Type *PtrArithTy = Type::getInt8PtrTy(Ctx, AS);
+ Value *Start = nullptr, *End = nullptr;
- Value *Start = Exp.expandCodeFor(PtrRtCheck.Starts[i], PtrArithTy, Loc);
- Value *End = Exp.expandCodeFor(PtrRtCheck.Ends[i], PtrArithTy, Loc);
+ DEBUG(dbgs() << "LAA: Adding RT check for range:\n");
+ Start = Exp.expandCodeFor(CG.Low, PtrArithTy, Loc);
+ End = Exp.expandCodeFor(CG.High, PtrArithTy, Loc);
+ DEBUG(dbgs() << "Start: " << *CG.Low << " End: " << *CG.High << "\n");
Starts.push_back(Start);
Ends.push_back(End);
}
@@ -1394,9 +1608,14 @@ std::pair<Instruction *, Instruction *> LoopAccessInfo::addRuntimeCheck(
IRBuilder<> ChkBuilder(Loc);
// Our instructions might fold to a constant.
Value *MemoryRuntimeCheck = nullptr;
- for (unsigned i = 0; i < NumPointers; ++i) {
- for (unsigned j = i+1; j < NumPointers; ++j) {
- if (!PtrRtCheck.needsChecking(i, j, PtrPartition))
+ for (unsigned i = 0; i < PtrRtChecking.CheckingGroups.size(); ++i) {
+ for (unsigned j = i + 1; j < PtrRtChecking.CheckingGroups.size(); ++j) {
+ const RuntimePointerChecking::CheckingPtrGroup &CGI =
+ PtrRtChecking.CheckingGroups[i];
+ const RuntimePointerChecking::CheckingPtrGroup &CGJ =
+ PtrRtChecking.CheckingGroups[j];
+
+ if (!PtrRtChecking.needsChecking(CGI, CGJ, PtrPartition))
continue;
unsigned AS0 = Starts[i]->getType()->getPointerAddressSpace();
@@ -1447,7 +1666,7 @@ LoopAccessInfo::LoopAccessInfo(Loop *L, ScalarEvolution *SE,
const TargetLibraryInfo *TLI, AliasAnalysis *AA,
DominatorTree *DT, LoopInfo *LI,
const ValueToValueMap &Strides)
- : DepChecker(SE, L), TheLoop(L), SE(SE), DL(DL),
+ : PtrRtChecking(SE), DepChecker(SE, L), TheLoop(L), SE(SE), DL(DL),
TLI(TLI), AA(AA), DT(DT), LI(LI), NumLoads(0), NumStores(0),
MaxSafeDepDistBytes(-1U), CanVecMem(false),
StoreToLoopInvariantAddress(false) {
@@ -1457,7 +1676,7 @@ LoopAccessInfo::LoopAccessInfo(Loop *L, ScalarEvolution *SE,
void LoopAccessInfo::print(raw_ostream &OS, unsigned Depth) const {
if (CanVecMem) {
- if (PtrRtCheck.Need)
+ if (PtrRtChecking.Need)
OS.indent(Depth) << "Memory dependences are safe with run-time checks\n";
else
OS.indent(Depth) << "Memory dependences are safe\n";
@@ -1476,7 +1695,7 @@ void LoopAccessInfo::print(raw_ostream &OS, unsigned Depth) const {
OS.indent(Depth) << "Too many interesting dependences, not recorded\n";
// List the pair of accesses need run-time checks to prove independence.
- PtrRtCheck.print(OS, Depth);
+ PtrRtChecking.print(OS, Depth);
OS << "\n";
OS.indent(Depth) << "Store to invariant address was "
diff --git a/lib/Analysis/NoAliasAnalysis.cpp b/lib/Analysis/NoAliasAnalysis.cpp
index 7617622b9ab6..322a9a80de4c 100644
--- a/lib/Analysis/NoAliasAnalysis.cpp
+++ b/lib/Analysis/NoAliasAnalysis.cpp
@@ -72,7 +72,6 @@ namespace {
}
void deleteValue(Value *V) override {}
- void copyValue(Value *From, Value *To) override {}
void addEscapingUse(Use &U) override {}
/// getAdjustedAnalysisPointer - This method is used when a pass implements
diff --git a/lib/Analysis/TargetTransformInfo.cpp b/lib/Analysis/TargetTransformInfo.cpp
index 520d1e5ef87d..7d1c3fbef68a 100644
--- a/lib/Analysis/TargetTransformInfo.cpp
+++ b/lib/Analysis/TargetTransformInfo.cpp
@@ -28,12 +28,12 @@ namespace {
///
/// This is used when no target specific information is available.
struct NoTTIImpl : TargetTransformInfoImplCRTPBase<NoTTIImpl> {
- explicit NoTTIImpl(const DataLayout *DL)
+ explicit NoTTIImpl(const DataLayout &DL)
: TargetTransformInfoImplCRTPBase<NoTTIImpl>(DL) {}
};
}
-TargetTransformInfo::TargetTransformInfo(const DataLayout *DL)
+TargetTransformInfo::TargetTransformInfo(const DataLayout &DL)
: TTIImpl(new Model<NoTTIImpl>(NoTTIImpl(DL))) {}
TargetTransformInfo::~TargetTransformInfo() {}
@@ -304,7 +304,7 @@ TargetIRAnalysis::Result TargetIRAnalysis::run(Function &F) {
char TargetIRAnalysis::PassID;
TargetIRAnalysis::Result TargetIRAnalysis::getDefaultTTI(Function &F) {
- return Result(&F.getParent()->getDataLayout());
+ return Result(F.getParent()->getDataLayout());
}
// Register the basic pass.
diff --git a/lib/Analysis/ValueTracking.cpp b/lib/Analysis/ValueTracking.cpp
index c45005f343d3..fa0d7798cae9 100644
--- a/lib/Analysis/ValueTracking.cpp
+++ b/lib/Analysis/ValueTracking.cpp
@@ -1464,7 +1464,7 @@ void computeKnownBits(Value *V, APInt &KnownZero, APInt &KnownOne,
// If the object is defined in the current Module, we'll be giving
// it the preferred alignment. Otherwise, we have to assume that it
// may only have the minimum ABI alignment.
- if (!GVar->isDeclaration() && !GVar->isWeakForLinker())
+ if (GVar->isStrongDefinitionForLinker())
Align = DL.getPreferredAlignment(GVar);
else
Align = DL.getABITypeAlignment(ObjectType);
diff --git a/lib/Analysis/VectorUtils.cpp b/lib/Analysis/VectorUtils.cpp
index 96fddd103cc5..67f68dc8391e 100644
--- a/lib/Analysis/VectorUtils.cpp
+++ b/lib/Analysis/VectorUtils.cpp
@@ -11,7 +11,13 @@
//
//===----------------------------------------------------------------------===//
+#include "llvm/Analysis/LoopInfo.h"
+#include "llvm/Analysis/ScalarEvolutionExpressions.h"
+#include "llvm/Analysis/ScalarEvolution.h"
#include "llvm/Analysis/VectorUtils.h"
+#include "llvm/IR/GetElementPtrTypeIterator.h"
+#include "llvm/IR/PatternMatch.h"
+#include "llvm/IR/Value.h"
/// \brief Identify if the intrinsic is trivially vectorizable.
/// This method returns true if the intrinsic's argument types are all
@@ -211,3 +217,195 @@ llvm::Intrinsic::ID llvm::getIntrinsicIDForCall(CallInst *CI,
return Intrinsic::not_intrinsic;
}
+
+/// \brief Find the operand of the GEP that should be checked for consecutive
+/// stores. This ignores trailing indices that have no effect on the final
+/// pointer.
+unsigned llvm::getGEPInductionOperand(const GetElementPtrInst *Gep) {
+ const DataLayout &DL = Gep->getModule()->getDataLayout();
+ unsigned LastOperand = Gep->getNumOperands() - 1;
+ unsigned GEPAllocSize = DL.getTypeAllocSize(
+ cast<PointerType>(Gep->getType()->getScalarType())->getElementType());
+
+ // Walk backwards and try to peel off zeros.
+ while (LastOperand > 1 &&
+ match(Gep->getOperand(LastOperand), llvm::PatternMatch::m_Zero())) {
+ // Find the type we're currently indexing into.
+ gep_type_iterator GEPTI = gep_type_begin(Gep);
+ std::advance(GEPTI, LastOperand - 1);
+
+ // If it's a type with the same allocation size as the result of the GEP we
+ // can peel off the zero index.
+ if (DL.getTypeAllocSize(*GEPTI) != GEPAllocSize)
+ break;
+ --LastOperand;
+ }
+
+ return LastOperand;
+}
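
A simplified model of the peeling loop above, with assumptions not taken from the patch: Sizes[I] stands for the alloc size of the type that GEP operand I indexes into (entry 0 is the unused pointer operand), and GEPAllocSize is the alloc size of the GEP's result element type.

  #include <cstdio>
  #include <vector>

  // Peel trailing zero indices that index into a type of the same alloc size
  // as the GEP result; such indices do not move the pointer.
  static unsigned inductionOperand(const std::vector<long> &Idx,
                                   const std::vector<unsigned> &Sizes,
                                   unsigned GEPAllocSize) {
    unsigned LastOperand = Idx.size() - 1;
    while (LastOperand > 1 && Idx[LastOperand] == 0 &&
           Sizes[LastOperand] == GEPAllocSize)
      --LastOperand;
    return LastOperand;
  }

  int main() {
    // Something like "gep %struct.S, %struct.S* %p, i64 %i, i32 0" where
    // %struct.S wraps a single float: the trailing zero is peeled and the
    // induction operand is operand 1.
    std::printf("%u\n", inductionOperand({0, 7, 0}, {0, 4, 4}, 4));
  }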
+
+/// \brief If the argument is a GEP, then returns the operand identified by
+/// getGEPInductionOperand. However, if there is some other non-loop-invariant
+/// operand, it returns that instead.
+llvm::Value *llvm::stripGetElementPtr(llvm::Value *Ptr, ScalarEvolution *SE,
+ Loop *Lp) {
+ GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(Ptr);
+ if (!GEP)
+ return Ptr;
+
+ unsigned InductionOperand = getGEPInductionOperand(GEP);
+
+ // Check that all of the gep indices are uniform except for our induction
+ // operand.
+ for (unsigned i = 0, e = GEP->getNumOperands(); i != e; ++i)
+ if (i != InductionOperand &&
+ !SE->isLoopInvariant(SE->getSCEV(GEP->getOperand(i)), Lp))
+ return Ptr;
+ return GEP->getOperand(InductionOperand);
+}
+
+/// \brief If a value has only one user that is a CastInst, return it.
+llvm::Value *llvm::getUniqueCastUse(llvm::Value *Ptr, Loop *Lp, Type *Ty) {
+ llvm::Value *UniqueCast = nullptr;
+ for (User *U : Ptr->users()) {
+ CastInst *CI = dyn_cast<CastInst>(U);
+ if (CI && CI->getType() == Ty) {
+ if (!UniqueCast)
+ UniqueCast = CI;
+ else
+ return nullptr;
+ }
+ }
+ return UniqueCast;
+}
+
+/// \brief Get the stride of a pointer access in a loop. Looks for symbolic
+/// strides "a[i*stride]". Returns the symbolic stride, or null otherwise.
+llvm::Value *llvm::getStrideFromPointer(llvm::Value *Ptr, ScalarEvolution *SE,
+ Loop *Lp) {
+ const PointerType *PtrTy = dyn_cast<PointerType>(Ptr->getType());
+ if (!PtrTy || PtrTy->isAggregateType())
+ return nullptr;
+
+ // Try to remove a gep instruction to make the pointer (actually index at this
+ // point) easier to analyze. If OrigPtr is equal to Ptr, we are analyzing the
+ // pointer; otherwise, we are analyzing the index.
+ llvm::Value *OrigPtr = Ptr;
+
+ // The size of the pointer access.
+ int64_t PtrAccessSize = 1;
+
+ Ptr = stripGetElementPtr(Ptr, SE, Lp);
+ const SCEV *V = SE->getSCEV(Ptr);
+
+ if (Ptr != OrigPtr)
+ // Strip off casts.
+ while (const SCEVCastExpr *C = dyn_cast<SCEVCastExpr>(V))
+ V = C->getOperand();
+
+ const SCEVAddRecExpr *S = dyn_cast<SCEVAddRecExpr>(V);
+ if (!S)
+ return nullptr;
+
+ V = S->getStepRecurrence(*SE);
+ if (!V)
+ return nullptr;
+
+ // Strip off the size of access multiplication if we are still analyzing the
+ // pointer.
+ if (OrigPtr == Ptr) {
+ const DataLayout &DL = Lp->getHeader()->getModule()->getDataLayout();
+ DL.getTypeAllocSize(PtrTy->getElementType());
+ if (const SCEVMulExpr *M = dyn_cast<SCEVMulExpr>(V)) {
+ if (M->getOperand(0)->getSCEVType() != scConstant)
+ return nullptr;
+
+ const APInt &APStepVal =
+ cast<SCEVConstant>(M->getOperand(0))->getValue()->getValue();
+
+ // Huge step value - give up.
+ if (APStepVal.getBitWidth() > 64)
+ return nullptr;
+
+ int64_t StepVal = APStepVal.getSExtValue();
+ if (PtrAccessSize != StepVal)
+ return nullptr;
+ V = M->getOperand(1);
+ }
+ }
+
+ // Strip off casts.
+ Type *StripedOffRecurrenceCast = nullptr;
+ if (const SCEVCastExpr *C = dyn_cast<SCEVCastExpr>(V)) {
+ StripedOffRecurrenceCast = C->getType();
+ V = C->getOperand();
+ }
+
+ // Look for the loop invariant symbolic value.
+ const SCEVUnknown *U = dyn_cast<SCEVUnknown>(V);
+ if (!U)
+ return nullptr;
+
+ llvm::Value *Stride = U->getValue();
+ if (!Lp->isLoopInvariant(Stride))
+ return nullptr;
+
+ // If we have stripped off the recurrence cast we have to make sure that we
+ // return the value that is used in this loop so that we can replace it later.
+ if (StripedOffRecurrenceCast)
+ Stride = getUniqueCastUse(Stride, Lp, StripedOffRecurrenceCast);
+
+ return Stride;
+}
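
For reference, the access pattern this helper recognizes looks like the loop below when written in plain C++ (an illustrative snippet, not taken from the LLVM test suite): Stride is loop-invariant but unknown at compile time, so the returned symbolic stride is what a later runtime "Stride == 1" check would be emitted against.

  #include <cstddef>

  void scale(float *A, std::size_t N, std::size_t Stride, float K) {
    for (std::size_t I = 0; I < N; ++I)
      A[I * Stride] *= K; // "a[i*stride]": symbolic, not constant, stride
  }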
+
+/// \brief Given a vector and an element number, see if the scalar value is
+/// already around as a register, for example if it were inserted then extracted
+/// from the vector.
+llvm::Value *llvm::findScalarElement(llvm::Value *V, unsigned EltNo) {
+ assert(V->getType()->isVectorTy() && "Not looking at a vector?");
+ VectorType *VTy = cast<VectorType>(V->getType());
+ unsigned Width = VTy->getNumElements();
+ if (EltNo >= Width) // Out of range access.
+ return UndefValue::get(VTy->getElementType());
+
+ if (Constant *C = dyn_cast<Constant>(V))
+ return C->getAggregateElement(EltNo);
+
+ if (InsertElementInst *III = dyn_cast<InsertElementInst>(V)) {
+ // If this is an insert to a variable element, we don't know what it is.
+ if (!isa<ConstantInt>(III->getOperand(2)))
+ return nullptr;
+ unsigned IIElt = cast<ConstantInt>(III->getOperand(2))->getZExtValue();
+
+ // If this is an insert to the element we are looking for, return the
+ // inserted value.
+ if (EltNo == IIElt)
+ return III->getOperand(1);
+
+ // Otherwise, the insertelement doesn't modify the value, recurse on its
+ // vector input.
+ return findScalarElement(III->getOperand(0), EltNo);
+ }
+
+ if (ShuffleVectorInst *SVI = dyn_cast<ShuffleVectorInst>(V)) {
+ unsigned LHSWidth = SVI->getOperand(0)->getType()->getVectorNumElements();
+ int InEl = SVI->getMaskValue(EltNo);
+ if (InEl < 0)
+ return UndefValue::get(VTy->getElementType());
+ if (InEl < (int)LHSWidth)
+ return findScalarElement(SVI->getOperand(0), InEl);
+ return findScalarElement(SVI->getOperand(1), InEl - LHSWidth);
+ }
+
+ // Extract a value from a vector add operation with a constant zero.
+ Value *Val = nullptr; Constant *Con = nullptr;
+ if (match(V,
+ llvm::PatternMatch::m_Add(llvm::PatternMatch::m_Value(Val),
+ llvm::PatternMatch::m_Constant(Con)))) {
+ if (Con->getAggregateElement(EltNo)->isNullValue())
+ return findScalarElement(Val, EltNo);
+ }
+
+ // Otherwise, we don't know.
+ return nullptr;
+}
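
The InsertElementInst case above can be mimicked with a toy value representation; VecVal and the recursion over Base are assumed simplifications for illustration, not LLVM's data structures (the shufflevector and add-with-zero cases are left out).

  #include <cstdio>
  #include <memory>
  #include <vector>

  // A vector value is either a concrete list of lanes or an
  // "insertelement"-like node that replaces one lane of a base value.
  struct VecVal {
    std::vector<int> Lanes;            // used when Base is null
    std::shared_ptr<VecVal> Base;      // base vector being inserted into
    unsigned Idx = 0;                  // lane replaced by Scalar
    int Scalar = 0;
  };

  static int findScalar(const VecVal &V, unsigned EltNo) {
    if (!V.Base)
      return V.Lanes[EltNo];           // concrete vector: read the lane
    if (EltNo == V.Idx)
      return V.Scalar;                 // the insert provides this lane
    return findScalar(*V.Base, EltNo); // otherwise recurse into the base
  }

  int main() {
    auto Base = std::make_shared<VecVal>();
    Base->Lanes = {10, 20, 30, 40};
    VecVal Ins;                        // insertelement Base, 99, lane 2
    Ins.Base = Base;
    Ins.Idx = 2;
    Ins.Scalar = 99;
    std::printf("%d %d\n", findScalar(Ins, 2), findScalar(Ins, 1)); // 99 20
  }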
diff --git a/lib/AsmParser/LLLexer.cpp b/lib/AsmParser/LLLexer.cpp
index 88f359d4fd5c..5c4bab734b2b 100644
--- a/lib/AsmParser/LLLexer.cpp
+++ b/lib/AsmParser/LLLexer.cpp
@@ -593,6 +593,7 @@ lltok::Kind LLLexer::LexIdentifier() {
KEYWORD(attributes);
KEYWORD(alwaysinline);
+ KEYWORD(argmemonly);
KEYWORD(builtin);
KEYWORD(byval);
KEYWORD(inalloca);
diff --git a/lib/AsmParser/LLParser.cpp b/lib/AsmParser/LLParser.cpp
index b3c7fa087d40..1c6e7bd18d0e 100644
--- a/lib/AsmParser/LLParser.cpp
+++ b/lib/AsmParser/LLParser.cpp
@@ -946,35 +946,42 @@ bool LLParser::ParseFnAttributeValuePairs(AttrBuilder &B,
B.addStackAlignmentAttr(Alignment);
continue;
}
- case lltok::kw_alwaysinline: B.addAttribute(Attribute::AlwaysInline); break;
- case lltok::kw_builtin: B.addAttribute(Attribute::Builtin); break;
- case lltok::kw_cold: B.addAttribute(Attribute::Cold); break;
- case lltok::kw_convergent: B.addAttribute(Attribute::Convergent); break;
- case lltok::kw_inlinehint: B.addAttribute(Attribute::InlineHint); break;
- case lltok::kw_jumptable: B.addAttribute(Attribute::JumpTable); break;
- case lltok::kw_minsize: B.addAttribute(Attribute::MinSize); break;
- case lltok::kw_naked: B.addAttribute(Attribute::Naked); break;
- case lltok::kw_nobuiltin: B.addAttribute(Attribute::NoBuiltin); break;
- case lltok::kw_noduplicate: B.addAttribute(Attribute::NoDuplicate); break;
- case lltok::kw_noimplicitfloat: B.addAttribute(Attribute::NoImplicitFloat); break;
- case lltok::kw_noinline: B.addAttribute(Attribute::NoInline); break;
- case lltok::kw_nonlazybind: B.addAttribute(Attribute::NonLazyBind); break;
- case lltok::kw_noredzone: B.addAttribute(Attribute::NoRedZone); break;
- case lltok::kw_noreturn: B.addAttribute(Attribute::NoReturn); break;
- case lltok::kw_nounwind: B.addAttribute(Attribute::NoUnwind); break;
- case lltok::kw_optnone: B.addAttribute(Attribute::OptimizeNone); break;
- case lltok::kw_optsize: B.addAttribute(Attribute::OptimizeForSize); break;
- case lltok::kw_readnone: B.addAttribute(Attribute::ReadNone); break;
- case lltok::kw_readonly: B.addAttribute(Attribute::ReadOnly); break;
- case lltok::kw_returns_twice: B.addAttribute(Attribute::ReturnsTwice); break;
- case lltok::kw_ssp: B.addAttribute(Attribute::StackProtect); break;
- case lltok::kw_sspreq: B.addAttribute(Attribute::StackProtectReq); break;
- case lltok::kw_sspstrong: B.addAttribute(Attribute::StackProtectStrong); break;
- case lltok::kw_safestack: B.addAttribute(Attribute::SafeStack); break;
- case lltok::kw_sanitize_address: B.addAttribute(Attribute::SanitizeAddress); break;
- case lltok::kw_sanitize_thread: B.addAttribute(Attribute::SanitizeThread); break;
- case lltok::kw_sanitize_memory: B.addAttribute(Attribute::SanitizeMemory); break;
- case lltok::kw_uwtable: B.addAttribute(Attribute::UWTable); break;
+ case lltok::kw_alwaysinline: B.addAttribute(Attribute::AlwaysInline); break;
+ case lltok::kw_argmemonly: B.addAttribute(Attribute::ArgMemOnly); break;
+ case lltok::kw_builtin: B.addAttribute(Attribute::Builtin); break;
+ case lltok::kw_cold: B.addAttribute(Attribute::Cold); break;
+ case lltok::kw_convergent: B.addAttribute(Attribute::Convergent); break;
+ case lltok::kw_inlinehint: B.addAttribute(Attribute::InlineHint); break;
+ case lltok::kw_jumptable: B.addAttribute(Attribute::JumpTable); break;
+ case lltok::kw_minsize: B.addAttribute(Attribute::MinSize); break;
+ case lltok::kw_naked: B.addAttribute(Attribute::Naked); break;
+ case lltok::kw_nobuiltin: B.addAttribute(Attribute::NoBuiltin); break;
+ case lltok::kw_noduplicate: B.addAttribute(Attribute::NoDuplicate); break;
+ case lltok::kw_noimplicitfloat:
+ B.addAttribute(Attribute::NoImplicitFloat); break;
+ case lltok::kw_noinline: B.addAttribute(Attribute::NoInline); break;
+ case lltok::kw_nonlazybind: B.addAttribute(Attribute::NonLazyBind); break;
+ case lltok::kw_noredzone: B.addAttribute(Attribute::NoRedZone); break;
+ case lltok::kw_noreturn: B.addAttribute(Attribute::NoReturn); break;
+ case lltok::kw_nounwind: B.addAttribute(Attribute::NoUnwind); break;
+ case lltok::kw_optnone: B.addAttribute(Attribute::OptimizeNone); break;
+ case lltok::kw_optsize: B.addAttribute(Attribute::OptimizeForSize); break;
+ case lltok::kw_readnone: B.addAttribute(Attribute::ReadNone); break;
+ case lltok::kw_readonly: B.addAttribute(Attribute::ReadOnly); break;
+ case lltok::kw_returns_twice:
+ B.addAttribute(Attribute::ReturnsTwice); break;
+ case lltok::kw_ssp: B.addAttribute(Attribute::StackProtect); break;
+ case lltok::kw_sspreq: B.addAttribute(Attribute::StackProtectReq); break;
+ case lltok::kw_sspstrong:
+ B.addAttribute(Attribute::StackProtectStrong); break;
+ case lltok::kw_safestack: B.addAttribute(Attribute::SafeStack); break;
+ case lltok::kw_sanitize_address:
+ B.addAttribute(Attribute::SanitizeAddress); break;
+ case lltok::kw_sanitize_thread:
+ B.addAttribute(Attribute::SanitizeThread); break;
+ case lltok::kw_sanitize_memory:
+ B.addAttribute(Attribute::SanitizeMemory); break;
+ case lltok::kw_uwtable: B.addAttribute(Attribute::UWTable); break;
// Error handling.
case lltok::kw_inreg:
@@ -1258,6 +1265,7 @@ bool LLParser::ParseOptionalParamAttrs(AttrBuilder &B) {
case lltok::kw_alignstack:
case lltok::kw_alwaysinline:
+ case lltok::kw_argmemonly:
case lltok::kw_builtin:
case lltok::kw_inlinehint:
case lltok::kw_jumptable:
@@ -1334,6 +1342,7 @@ bool LLParser::ParseOptionalReturnAttrs(AttrBuilder &B) {
case lltok::kw_alignstack:
case lltok::kw_alwaysinline:
+ case lltok::kw_argmemonly:
case lltok::kw_builtin:
case lltok::kw_cold:
case lltok::kw_inlinehint:
@@ -2873,8 +2882,8 @@ bool LLParser::ParseValID(ValID &ID, PerFunctionState *PFS) {
if (ValTy->isVectorTy() != BaseType->isVectorTy())
return Error(ID.Loc, "getelementptr index type missmatch");
if (ValTy->isVectorTy()) {
- unsigned ValNumEl = cast<VectorType>(ValTy)->getNumElements();
- unsigned PtrNumEl = cast<VectorType>(BaseType)->getNumElements();
+ unsigned ValNumEl = ValTy->getVectorNumElements();
+ unsigned PtrNumEl = BaseType->getVectorNumElements();
if (ValNumEl != PtrNumEl)
return Error(
ID.Loc,
@@ -4534,8 +4543,17 @@ int LLParser::ParseInstruction(Instruction *&Inst, BasicBlock *BB,
case lltok::kw_and:
case lltok::kw_or:
case lltok::kw_xor: return ParseLogical(Inst, PFS, KeywordVal);
- case lltok::kw_icmp:
- case lltok::kw_fcmp: return ParseCompare(Inst, PFS, KeywordVal);
+ case lltok::kw_icmp: return ParseCompare(Inst, PFS, KeywordVal);
+ case lltok::kw_fcmp: {
+ FastMathFlags FMF = EatFastMathFlagsIfPresent();
+ int Res = ParseCompare(Inst, PFS, KeywordVal);
+ if (Res != 0)
+ return Res;
+ if (FMF.any())
+ Inst->setFastMathFlags(FMF);
+ return 0;
+ }
+
// Casts.
case lltok::kw_trunc:
case lltok::kw_zext:
@@ -5572,6 +5590,11 @@ int LLParser::ParseGetElementPtr(Instruction *&Inst, PerFunctionState &PFS) {
SmallVector<Value*, 16> Indices;
bool AteExtraComma = false;
+ // GEP returns a vector of pointers if at least one of parameters is a vector.
+ // All vector parameters should have the same vector width.
+ unsigned GEPWidth = BaseType->isVectorTy() ?
+ BaseType->getVectorNumElements() : 0;
+
while (EatIfPresent(lltok::comma)) {
if (Lex.getKind() == lltok::MetadataVar) {
AteExtraComma = true;
@@ -5580,14 +5603,13 @@ int LLParser::ParseGetElementPtr(Instruction *&Inst, PerFunctionState &PFS) {
if (ParseTypeAndValue(Val, EltLoc, PFS)) return true;
if (!Val->getType()->getScalarType()->isIntegerTy())
return Error(EltLoc, "getelementptr index must be an integer");
- if (Val->getType()->isVectorTy() != Ptr->getType()->isVectorTy())
- return Error(EltLoc, "getelementptr index type missmatch");
+
if (Val->getType()->isVectorTy()) {
- unsigned ValNumEl = cast<VectorType>(Val->getType())->getNumElements();
- unsigned PtrNumEl = cast<VectorType>(Ptr->getType())->getNumElements();
- if (ValNumEl != PtrNumEl)
+ unsigned ValNumEl = Val->getType()->getVectorNumElements();
+ if (GEPWidth && GEPWidth != ValNumEl)
return Error(EltLoc,
"getelementptr vector index has a wrong number of elements");
+ GEPWidth = ValNumEl;
}
Indices.push_back(Val);
}
diff --git a/lib/AsmParser/LLToken.h b/lib/AsmParser/LLToken.h
index 2487d1208133..691f085f0c9f 100644
--- a/lib/AsmParser/LLToken.h
+++ b/lib/AsmParser/LLToken.h
@@ -100,6 +100,7 @@ namespace lltok {
// Attributes:
kw_attributes,
kw_alwaysinline,
+ kw_argmemonly,
kw_sanitize_address,
kw_builtin,
kw_byval,
diff --git a/lib/Bitcode/Reader/BitcodeReader.cpp b/lib/Bitcode/Reader/BitcodeReader.cpp
index 09f0b689bdc3..c04e8b9f1f37 100644
--- a/lib/Bitcode/Reader/BitcodeReader.cpp
+++ b/lib/Bitcode/Reader/BitcodeReader.cpp
@@ -697,6 +697,21 @@ static Comdat::SelectionKind getDecodedComdatSelectionKind(unsigned Val) {
}
}
+static FastMathFlags getDecodedFastMathFlags(unsigned Val) {
+ FastMathFlags FMF;
+ if (0 != (Val & FastMathFlags::UnsafeAlgebra))
+ FMF.setUnsafeAlgebra();
+ if (0 != (Val & FastMathFlags::NoNaNs))
+ FMF.setNoNaNs();
+ if (0 != (Val & FastMathFlags::NoInfs))
+ FMF.setNoInfs();
+ if (0 != (Val & FastMathFlags::NoSignedZeros))
+ FMF.setNoSignedZeros();
+ if (0 != (Val & FastMathFlags::AllowReciprocal))
+ FMF.setAllowReciprocal();
+ return FMF;
+}
+
static void upgradeDLLImportExportLinkage(llvm::GlobalValue *GV, unsigned Val) {
switch (Val) {
case 5: GV->setDLLStorageClass(GlobalValue::DLLImportStorageClass); break;
@@ -1075,6 +1090,8 @@ static Attribute::AttrKind getAttrFromCode(uint64_t Code) {
return Attribute::Alignment;
case bitc::ATTR_KIND_ALWAYS_INLINE:
return Attribute::AlwaysInline;
+ case bitc::ATTR_KIND_ARGMEMONLY:
+ return Attribute::ArgMemOnly;
case bitc::ATTR_KIND_BUILTIN:
return Attribute::Builtin;
case bitc::ATTR_KIND_BY_VAL:
@@ -3472,17 +3489,7 @@ std::error_code BitcodeReader::parseFunctionBody(Function *F) {
if (Record[OpNum] & (1 << bitc::PEO_EXACT))
cast<BinaryOperator>(I)->setIsExact(true);
} else if (isa<FPMathOperator>(I)) {
- FastMathFlags FMF;
- if (0 != (Record[OpNum] & FastMathFlags::UnsafeAlgebra))
- FMF.setUnsafeAlgebra();
- if (0 != (Record[OpNum] & FastMathFlags::NoNaNs))
- FMF.setNoNaNs();
- if (0 != (Record[OpNum] & FastMathFlags::NoInfs))
- FMF.setNoInfs();
- if (0 != (Record[OpNum] & FastMathFlags::NoSignedZeros))
- FMF.setNoSignedZeros();
- if (0 != (Record[OpNum] & FastMathFlags::AllowReciprocal))
- FMF.setAllowReciprocal();
+ FastMathFlags FMF = getDecodedFastMathFlags(Record[OpNum]);
if (FMF.any())
I->setFastMathFlags(FMF);
}
@@ -3739,14 +3746,25 @@ std::error_code BitcodeReader::parseFunctionBody(Function *F) {
unsigned OpNum = 0;
Value *LHS, *RHS;
if (getValueTypePair(Record, OpNum, NextValueNo, LHS) ||
- popValue(Record, OpNum, NextValueNo, LHS->getType(), RHS) ||
- OpNum+1 != Record.size())
+ popValue(Record, OpNum, NextValueNo, LHS->getType(), RHS))
+ return error("Invalid record");
+
+ unsigned PredVal = Record[OpNum];
+ bool IsFP = LHS->getType()->isFPOrFPVectorTy();
+ FastMathFlags FMF;
+ if (IsFP && Record.size() > OpNum+1)
+ FMF = getDecodedFastMathFlags(Record[++OpNum]);
+
+ if (OpNum+1 != Record.size())
return error("Invalid record");
if (LHS->getType()->isFPOrFPVectorTy())
- I = new FCmpInst((FCmpInst::Predicate)Record[OpNum], LHS, RHS);
+ I = new FCmpInst((FCmpInst::Predicate)PredVal, LHS, RHS);
else
- I = new ICmpInst((ICmpInst::Predicate)Record[OpNum], LHS, RHS);
+ I = new ICmpInst((ICmpInst::Predicate)PredVal, LHS, RHS);
+
+ if (FMF.any())
+ I->setFastMathFlags(FMF);
InstructionList.push_back(I);
break;
}
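
The reader now accepts an optional trailing fast-math-flags word on FUNC_CODE_INST_CMP2 records, matching the writer change further down. A toy encode/decode pair with an assumed record layout of plain integers (not the real bitstream format):

  #include <cstdio>
  #include <vector>

  // Record shape: [lhs, rhs, predicate] plus a flags word only when non-zero.
  static std::vector<unsigned> encodeCmp(unsigned LHS, unsigned RHS,
                                         unsigned Pred, unsigned Flags) {
    std::vector<unsigned> Record = {LHS, RHS, Pred};
    if (Flags != 0)
      Record.push_back(Flags);
    return Record;
  }

  // Flags are only consumed for floating-point compares; anything left over
  // after that makes the record invalid, as in the patched reader.
  static bool decodeCmp(const std::vector<unsigned> &Record, bool IsFP,
                        unsigned &Pred, unsigned &Flags) {
    unsigned OpNum = 2; // the two operands have already been consumed
    Pred = Record[OpNum];
    Flags = 0;
    if (IsFP && Record.size() > OpNum + 1)
      Flags = Record[++OpNum];
    return OpNum + 1 == Record.size();
  }

  int main() {
    unsigned Pred, Flags;
    bool Ok = decodeCmp(encodeCmp(1, 2, 4, 0x1F), true, Pred, Flags);
    std::printf("ok=%d pred=%u flags=%#x\n", Ok, Pred, Flags);
  }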
@@ -4458,14 +4476,11 @@ std::error_code BitcodeReader::materialize(GlobalValue *GV) {
// Upgrade any old intrinsic calls in the function.
for (auto &I : UpgradedIntrinsics) {
- if (I.first != I.second) {
- for (auto UI = I.first->user_begin(), UE = I.first->user_end();
- UI != UE;) {
- User *U = *UI;
- ++UI;
- if (CallInst *CI = dyn_cast<CallInst>(U))
- UpgradeIntrinsicCall(CI, I.second);
- }
+ for (auto UI = I.first->user_begin(), UE = I.first->user_end(); UI != UE;) {
+ User *U = *UI;
+ ++UI;
+ if (CallInst *CI = dyn_cast<CallInst>(U))
+ UpgradeIntrinsicCall(CI, I.second);
}
}
@@ -4533,15 +4548,13 @@ std::error_code BitcodeReader::materializeModule(Module *M) {
// module is materialized because there could always be another function body
// with calls to the old function.
for (auto &I : UpgradedIntrinsics) {
- if (I.first != I.second) {
- for (auto *U : I.first->users()) {
- if (CallInst *CI = dyn_cast<CallInst>(U))
- UpgradeIntrinsicCall(CI, I.second);
- }
- if (!I.first->use_empty())
- I.first->replaceAllUsesWith(I.second);
- I.first->eraseFromParent();
+ for (auto *U : I.first->users()) {
+ if (CallInst *CI = dyn_cast<CallInst>(U))
+ UpgradeIntrinsicCall(CI, I.second);
}
+ if (!I.first->use_empty())
+ I.first->replaceAllUsesWith(I.second);
+ I.first->eraseFromParent();
}
UpgradedIntrinsics.clear();
diff --git a/lib/Bitcode/Writer/BitcodeWriter.cpp b/lib/Bitcode/Writer/BitcodeWriter.cpp
index 622f7eaf0784..1a70ba5ac127 100644
--- a/lib/Bitcode/Writer/BitcodeWriter.cpp
+++ b/lib/Bitcode/Writer/BitcodeWriter.cpp
@@ -162,6 +162,8 @@ static uint64_t getAttrKindEncoding(Attribute::AttrKind Kind) {
return bitc::ATTR_KIND_ALIGNMENT;
case Attribute::AlwaysInline:
return bitc::ATTR_KIND_ALWAYS_INLINE;
+ case Attribute::ArgMemOnly:
+ return bitc::ATTR_KIND_ARGMEMONLY;
case Attribute::Builtin:
return bitc::ATTR_KIND_BUILTIN;
case Attribute::ByVal:
@@ -1759,13 +1761,17 @@ static void WriteInstruction(const Instruction &I, unsigned InstID,
pushValue(I.getOperand(2), InstID, Vals, VE);
break;
case Instruction::ICmp:
- case Instruction::FCmp:
+ case Instruction::FCmp: {
// compare returning Int1Ty or vector of Int1Ty
Code = bitc::FUNC_CODE_INST_CMP2;
PushValueAndType(I.getOperand(0), InstID, Vals, VE);
pushValue(I.getOperand(1), InstID, Vals, VE);
Vals.push_back(cast<CmpInst>(I).getPredicate());
+ uint64_t Flags = GetOptimizationFlags(&I);
+ if (Flags != 0)
+ Vals.push_back(Flags);
break;
+ }
case Instruction::Ret:
{
diff --git a/lib/CodeGen/Analysis.cpp b/lib/CodeGen/Analysis.cpp
index 3224fac25cb4..98d4c8afc7b9 100644
--- a/lib/CodeGen/Analysis.cpp
+++ b/lib/CodeGen/Analysis.cpp
@@ -81,27 +81,27 @@ unsigned llvm::ComputeLinearIndex(Type *Ty,
/// If Offsets is non-null, it points to a vector to be filled in
/// with the in-memory offsets of each of the individual values.
///
-void llvm::ComputeValueVTs(const TargetLowering &TLI, Type *Ty,
- SmallVectorImpl<EVT> &ValueVTs,
+void llvm::ComputeValueVTs(const TargetLowering &TLI, const DataLayout &DL,
+ Type *Ty, SmallVectorImpl<EVT> &ValueVTs,
SmallVectorImpl<uint64_t> *Offsets,
uint64_t StartingOffset) {
// Given a struct type, recursively traverse the elements.
if (StructType *STy = dyn_cast<StructType>(Ty)) {
- const StructLayout *SL = TLI.getDataLayout()->getStructLayout(STy);
+ const StructLayout *SL = DL.getStructLayout(STy);
for (StructType::element_iterator EB = STy->element_begin(),
EI = EB,
EE = STy->element_end();
EI != EE; ++EI)
- ComputeValueVTs(TLI, *EI, ValueVTs, Offsets,
+ ComputeValueVTs(TLI, DL, *EI, ValueVTs, Offsets,
StartingOffset + SL->getElementOffset(EI - EB));
return;
}
// Given an array type, recursively traverse the elements.
if (ArrayType *ATy = dyn_cast<ArrayType>(Ty)) {
Type *EltTy = ATy->getElementType();
- uint64_t EltSize = TLI.getDataLayout()->getTypeAllocSize(EltTy);
+ uint64_t EltSize = DL.getTypeAllocSize(EltTy);
for (unsigned i = 0, e = ATy->getNumElements(); i != e; ++i)
- ComputeValueVTs(TLI, EltTy, ValueVTs, Offsets,
+ ComputeValueVTs(TLI, DL, EltTy, ValueVTs, Offsets,
StartingOffset + i * EltSize);
return;
}
@@ -109,7 +109,7 @@ void llvm::ComputeValueVTs(const TargetLowering &TLI, Type *Ty,
if (Ty->isVoidTy())
return;
// Base case: we can get an EVT for this LLVM IR type.
- ValueVTs.push_back(TLI.getValueType(Ty));
+ ValueVTs.push_back(TLI.getValueType(DL, Ty));
if (Offsets)
Offsets->push_back(StartingOffset);
}
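
The recursion in ComputeValueVTs can be sketched with a toy type model; Ty, sizeOf, and the packed layout below are assumptions made for illustration (the real code asks DataLayout's StructLayout for struct offsets and produces EVTs rather than raw sizes).

  #include <cstdio>
  #include <memory>
  #include <vector>

  struct Ty {
    unsigned Size = 0;                     // scalar alloc size; 0 for aggregates
    std::vector<std::shared_ptr<Ty>> Elts; // aggregate element types
  };

  static unsigned sizeOf(const Ty &T) {
    if (T.Elts.empty())
      return T.Size;
    unsigned S = 0;
    for (const auto &E : T.Elts)
      S += sizeOf(*E);
    return S;
  }

  // Mirrors the recursion: emit a (size, offset) pair for every scalar leaf.
  static void flatten(const Ty &T,
                      std::vector<std::pair<unsigned, unsigned>> &Out,
                      unsigned StartingOffset = 0) {
    if (T.Elts.empty()) {
      Out.push_back({T.Size, StartingOffset});
      return;
    }
    unsigned Offset = StartingOffset;
    for (const auto &E : T.Elts) {
      flatten(*E, Out, Offset);
      Offset += sizeOf(*E);                // packed layout stands in for StructLayout
    }
  }

  int main() {
    // { i32, { i64, i16 } } flattens to (4,0) (8,4) (2,12) under packed layout.
    auto I32 = std::make_shared<Ty>(); I32->Size = 4;
    auto I64 = std::make_shared<Ty>(); I64->Size = 8;
    auto I16 = std::make_shared<Ty>(); I16->Size = 2;
    auto Inner = std::make_shared<Ty>(); Inner->Elts = {I64, I16};
    Ty Outer; Outer.Elts = {I32, Inner};
    std::vector<std::pair<unsigned, unsigned>> Out;
    flatten(Outer, Out);
    for (auto &P : Out)
      std::printf("size %u at offset %u\n", P.first, P.second);
  }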
@@ -233,7 +233,8 @@ static bool isNoopBitcast(Type *T1, Type *T2,
static const Value *getNoopInput(const Value *V,
SmallVectorImpl<unsigned> &ValLoc,
unsigned &DataBits,
- const TargetLoweringBase &TLI) {
+ const TargetLoweringBase &TLI,
+ const DataLayout &DL) {
while (true) {
// Try to look through V1; if V1 is not an instruction, it can't be looked
// through.
@@ -255,16 +256,16 @@ static const Value *getNoopInput(const Value *V,
// Make sure this isn't a truncating or extending cast. We could
// support this eventually, but don't bother for now.
if (!isa<VectorType>(I->getType()) &&
- TLI.getPointerTy().getSizeInBits() ==
- cast<IntegerType>(Op->getType())->getBitWidth())
+ DL.getPointerSizeInBits() ==
+ cast<IntegerType>(Op->getType())->getBitWidth())
NoopInput = Op;
} else if (isa<PtrToIntInst>(I)) {
// Look through ptrtoint.
// Make sure this isn't a truncating or extending cast. We could
// support this eventually, but don't bother for now.
if (!isa<VectorType>(I->getType()) &&
- TLI.getPointerTy().getSizeInBits() ==
- cast<IntegerType>(I->getType())->getBitWidth())
+ DL.getPointerSizeInBits() ==
+ cast<IntegerType>(I->getType())->getBitWidth())
NoopInput = Op;
} else if (isa<TruncInst>(I) &&
TLI.allowTruncateForTailCall(Op->getType(), I->getType())) {
@@ -331,14 +332,15 @@ static bool slotOnlyDiscardsData(const Value *RetVal, const Value *CallVal,
SmallVectorImpl<unsigned> &RetIndices,
SmallVectorImpl<unsigned> &CallIndices,
bool AllowDifferingSizes,
- const TargetLoweringBase &TLI) {
+ const TargetLoweringBase &TLI,
+ const DataLayout &DL) {
// Trace the sub-value needed by the return value as far back up the graph as
// possible, in the hope that it will intersect with the value produced by the
// call. In the simple case with no "returned" attribute, the hope is actually
// that we end up back at the tail call instruction itself.
unsigned BitsRequired = UINT_MAX;
- RetVal = getNoopInput(RetVal, RetIndices, BitsRequired, TLI);
+ RetVal = getNoopInput(RetVal, RetIndices, BitsRequired, TLI, DL);
// If this slot in the value returned is undef, it doesn't matter what the
// call puts there, it'll be fine.
@@ -350,7 +352,7 @@ static bool slotOnlyDiscardsData(const Value *RetVal, const Value *CallVal,
// a "returned" attribute, the search will be blocked immediately and the loop
// a Noop.
unsigned BitsProvided = UINT_MAX;
- CallVal = getNoopInput(CallVal, CallIndices, BitsProvided, TLI);
+ CallVal = getNoopInput(CallVal, CallIndices, BitsProvided, TLI, DL);
// There's no hope if we can't actually trace them to (the same part of!) the
// same value.
@@ -606,7 +608,8 @@ bool llvm::returnTypeIsEligibleForTailCall(const Function *F,
// Finally, we can check whether the value produced by the tail call at this
// index is compatible with the value we return.
if (!slotOnlyDiscardsData(RetVal, CallVal, TmpRetPath, TmpCallPath,
- AllowDifferingSizes, TLI))
+ AllowDifferingSizes, TLI,
+ F->getParent()->getDataLayout()))
return false;
CallEmpty = !nextRealType(CallSubTypes, CallPath);
diff --git a/lib/CodeGen/AsmPrinter/ARMException.cpp b/lib/CodeGen/AsmPrinter/ARMException.cpp
index 4cb460a7bbfc..0bad7954b980 100644
--- a/lib/CodeGen/AsmPrinter/ARMException.cpp
+++ b/lib/CodeGen/AsmPrinter/ARMException.cpp
@@ -69,24 +69,32 @@ void ARMException::beginFunction(const MachineFunction *MF) {
///
void ARMException::endFunction(const MachineFunction *MF) {
ARMTargetStreamer &ATS = getTargetStreamer();
+ const Function *F = MF->getFunction();
+ const Function *Per = nullptr;
+ if (F->hasPersonalityFn())
+ Per = dyn_cast<Function>(F->getPersonalityFn()->stripPointerCasts());
+ assert(!MMI->getPersonality() || Per == MMI->getPersonality());
+ bool forceEmitPersonality =
+ F->hasPersonalityFn() && !isNoOpWithoutInvoke(classifyEHPersonality(Per)) &&
+ F->needsUnwindTableEntry();
+ bool shouldEmitPersonality = forceEmitPersonality ||
+ !MMI->getLandingPads().empty();
if (!Asm->MF->getFunction()->needsUnwindTableEntry() &&
- MMI->getLandingPads().empty())
+ !shouldEmitPersonality)
ATS.emitCantUnwind();
- else {
- if (!MMI->getLandingPads().empty()) {
- // Emit references to personality.
- if (const Function *Personality = MMI->getPersonality()) {
- MCSymbol *PerSym = Asm->getSymbol(Personality);
- Asm->OutStreamer->EmitSymbolAttribute(PerSym, MCSA_Global);
- ATS.emitPersonality(PerSym);
- }
-
- // Emit .handlerdata directive.
- ATS.emitHandlerData();
-
- // Emit actual exception table
- emitExceptionTable();
+ else if (shouldEmitPersonality) {
+ // Emit references to personality.
+ if (Per) {
+ MCSymbol *PerSym = Asm->getSymbol(Per);
+ Asm->OutStreamer->EmitSymbolAttribute(PerSym, MCSA_Global);
+ ATS.emitPersonality(PerSym);
}
+
+ // Emit .handlerdata directive.
+ ATS.emitHandlerData();
+
+ // Emit actual exception table
+ emitExceptionTable();
}
if (Asm->MAI->getExceptionHandlingType() == ExceptionHandling::ARM)
diff --git a/lib/CodeGen/AsmPrinter/AsmPrinter.cpp b/lib/CodeGen/AsmPrinter/AsmPrinter.cpp
index 8a7e9f991611..125047e7bbb5 100644
--- a/lib/CodeGen/AsmPrinter/AsmPrinter.cpp
+++ b/lib/CodeGen/AsmPrinter/AsmPrinter.cpp
@@ -19,7 +19,6 @@
#include "llvm/ADT/SmallString.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/Analysis/ConstantFolding.h"
-#include "llvm/Analysis/JumpInstrTableInfo.h"
#include "llvm/CodeGen/Analysis.h"
#include "llvm/CodeGen/GCMetadataPrinter.h"
#include "llvm/CodeGen/MachineConstantPool.h"
@@ -820,7 +819,7 @@ void AsmPrinter::EmitFunctionBody() {
emitCFIInstruction(MI);
break;
- case TargetOpcode::FRAME_ALLOC:
+ case TargetOpcode::LOCAL_ESCAPE:
emitFrameAlloc(MI);
break;
@@ -1024,7 +1023,7 @@ bool AsmPrinter::doFinalization(Module &M) {
// Emit visibility info for declarations
for (const Function &F : M) {
- if (!F.isDeclaration())
+ if (!F.isDeclarationForLinker())
continue;
GlobalValue::VisibilityTypes V = F.getVisibility();
if (V == GlobalValue::DefaultVisibility)
diff --git a/lib/CodeGen/AsmPrinter/DwarfCFIException.cpp b/lib/CodeGen/AsmPrinter/DwarfCFIException.cpp
index 0bc873e326be..2c212c7ecee1 100644
--- a/lib/CodeGen/AsmPrinter/DwarfCFIException.cpp
+++ b/lib/CodeGen/AsmPrinter/DwarfCFIException.cpp
@@ -89,6 +89,7 @@ void DwarfCFIException::endModule() {
void DwarfCFIException::beginFunction(const MachineFunction *MF) {
shouldEmitMoves = shouldEmitPersonality = shouldEmitLSDA = false;
+ const Function *F = MF->getFunction();
// If any landing pads survive, we need an EH table.
bool hasLandingPads = !MMI->getLandingPads().empty();
@@ -104,10 +105,24 @@ void DwarfCFIException::beginFunction(const MachineFunction *MF) {
const TargetLoweringObjectFile &TLOF = Asm->getObjFileLowering();
unsigned PerEncoding = TLOF.getPersonalityEncoding();
- const Function *Per = MMI->getPersonality();
-
- shouldEmitPersonality = hasLandingPads &&
- PerEncoding != dwarf::DW_EH_PE_omit && Per;
+ const Function *Per = nullptr;
+ if (F->hasPersonalityFn())
+ Per = dyn_cast<Function>(F->getPersonalityFn()->stripPointerCasts());
+ assert(!MMI->getPersonality() || Per == MMI->getPersonality());
+
+ // Emit a personality function even when there are no landing pads
+ bool forceEmitPersonality =
+ // ...if a personality function is explicitly specified
+ F->hasPersonalityFn() &&
+ // ... and it's not known to be a noop in the absence of invokes
+ !isNoOpWithoutInvoke(classifyEHPersonality(Per)) &&
+ // ... and we're not explicitly asked not to emit it
+ F->needsUnwindTableEntry();
+
+ shouldEmitPersonality =
+ (forceEmitPersonality ||
+ (hasLandingPads && PerEncoding != dwarf::DW_EH_PE_omit)) &&
+ Per;
unsigned LSDAEncoding = TLOF.getLSDAEncoding();
shouldEmitLSDA = shouldEmitPersonality &&
@@ -123,6 +138,11 @@ void DwarfCFIException::beginFunction(const MachineFunction *MF) {
if (!shouldEmitPersonality)
return;
+ // If we are forced to emit this personality, make sure to record
+ // it because it might not appear in any landingpad
+ if (forceEmitPersonality)
+ MMI->addPersonality(Per);
+
const MCSymbol *Sym =
TLOF.getCFIPersonalitySymbol(Per, *Asm->Mang, Asm->TM, MMI);
Asm->OutStreamer->EmitCFIPersonality(Sym, PerEncoding);
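
The shouldEmitPersonality computation above reduces to a small predicate; the helper below is an assumed restatement for illustration, not code from the patch.

  #include <cstdio>

  // Emit personality info when an explicit, non-no-op personality is present
  // on a function that needs an unwind table, or when there are landing pads
  // and the encoding is not DW_EH_PE_omit -- but only if we actually have a
  // personality function to reference.
  static bool shouldEmitPersonality(bool HasPersonalityFn, bool PersonalityIsNoOp,
                                    bool NeedsUnwindTable, bool HasLandingPads,
                                    bool EncodingOmitted, bool HavePersonality) {
    bool Force = HasPersonalityFn && !PersonalityIsNoOp && NeedsUnwindTable;
    return (Force || (HasLandingPads && !EncodingOmitted)) && HavePersonality;
  }

  int main() {
    // Explicit, non-trivial personality and an unwind table entry: emit even
    // without landing pads.
    std::printf("%d\n", shouldEmitPersonality(true, false, true, false, false, true));
    // No explicit personality and no landing pads: nothing to emit.
    std::printf("%d\n", shouldEmitPersonality(false, false, true, false, false, false));
  }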
diff --git a/lib/CodeGen/AsmPrinter/DwarfDebug.h b/lib/CodeGen/AsmPrinter/DwarfDebug.h
index 1c3e2aec64ab..01f34c6eb81c 100644
--- a/lib/CodeGen/AsmPrinter/DwarfDebug.h
+++ b/lib/CodeGen/AsmPrinter/DwarfDebug.h
@@ -49,7 +49,7 @@ class DwarfUnit;
class MachineModuleInfo;
//===----------------------------------------------------------------------===//
-/// \brief This class is used to record source line correspondence.
+/// This class is used to record source line correspondence.
class SrcLineInfo {
unsigned Line; // Source line number.
unsigned Column; // Source column.
@@ -161,7 +161,7 @@ public:
return dwarf::DW_TAG_variable;
}
- /// \brief Return true if DbgVariable is artificial.
+ /// Return true if DbgVariable is artificial.
bool isArtificial() const {
if (Var->isArtificial())
return true;
@@ -190,149 +190,152 @@ public:
const DIType *getType() const;
private:
- /// resolve - Look in the DwarfDebug map for the MDNode that
+ /// Look in the DwarfDebug map for the MDNode that
/// corresponds to the reference.
template <typename T> T *resolve(TypedDINodeRef<T> Ref) const;
};
-/// \brief Helper used to pair up a symbol and its DWARF compile unit.
+/// Helper used to pair up a symbol and its DWARF compile unit.
struct SymbolCU {
SymbolCU(DwarfCompileUnit *CU, const MCSymbol *Sym) : Sym(Sym), CU(CU) {}
const MCSymbol *Sym;
DwarfCompileUnit *CU;
};
-/// \brief Collects and handles dwarf debug information.
+/// Collects and handles dwarf debug information.
class DwarfDebug : public AsmPrinterHandler {
- // Target of Dwarf emission.
+ /// Target of Dwarf emission.
AsmPrinter *Asm;
- // Collected machine module information.
+ /// Collected machine module information.
MachineModuleInfo *MMI;
- // All DIEValues are allocated through this allocator.
+ /// All DIEValues are allocated through this allocator.
BumpPtrAllocator DIEValueAllocator;
- // Maps MDNode with its corresponding DwarfCompileUnit.
+ /// Maps MDNode with its corresponding DwarfCompileUnit.
MapVector<const MDNode *, DwarfCompileUnit *> CUMap;
- // Maps subprogram MDNode with its corresponding DwarfCompileUnit.
+ /// Maps subprogram MDNode with its corresponding DwarfCompileUnit.
MapVector<const MDNode *, DwarfCompileUnit *> SPMap;
- // Maps a CU DIE with its corresponding DwarfCompileUnit.
+ /// Maps a CU DIE with its corresponding DwarfCompileUnit.
DenseMap<const DIE *, DwarfCompileUnit *> CUDieMap;
- // List of all labels used in aranges generation.
+ /// List of all labels used in aranges generation.
std::vector<SymbolCU> ArangeLabels;
- // Size of each symbol emitted (for those symbols that have a specific size).
+ /// Size of each symbol emitted (for those symbols that have a specific size).
DenseMap<const MCSymbol *, uint64_t> SymSize;
LexicalScopes LScopes;
- // Collection of abstract variables.
+ /// Collection of abstract variables.
DenseMap<const MDNode *, std::unique_ptr<DbgVariable>> AbstractVariables;
SmallVector<std::unique_ptr<DbgVariable>, 64> ConcreteVariables;
- // Collection of DebugLocEntry. Stored in a linked list so that DIELocLists
- // can refer to them in spite of insertions into this list.
+ /// Collection of DebugLocEntry. Stored in a linked list so that DIELocLists
+ /// can refer to them in spite of insertions into this list.
DebugLocStream DebugLocs;
- // This is a collection of subprogram MDNodes that are processed to
- // create DIEs.
+ /// This is a collection of subprogram MDNodes that are processed to
+ /// create DIEs.
SmallPtrSet<const MDNode *, 16> ProcessedSPNodes;
- // Maps instruction with label emitted before instruction.
+ /// Maps instruction with label emitted before instruction.
DenseMap<const MachineInstr *, MCSymbol *> LabelsBeforeInsn;
- // Maps instruction with label emitted after instruction.
+ /// Maps instruction with label emitted after instruction.
DenseMap<const MachineInstr *, MCSymbol *> LabelsAfterInsn;
- // History of DBG_VALUE and clobber instructions for each user variable.
- // Variables are listed in order of appearance.
+ /// History of DBG_VALUE and clobber instructions for each user
+ /// variable. Variables are listed in order of appearance.
DbgValueHistoryMap DbgValues;
- // Previous instruction's location information. This is used to determine
- // label location to indicate scope boundries in dwarf debug info.
+ /// Previous instruction's location information. This is used to
+ /// determine label location to indicate scope boundaries in dwarf
+ /// debug info.
DebugLoc PrevInstLoc;
MCSymbol *PrevLabel;
- // This location indicates end of function prologue and beginning of function
- // body.
+ /// This location indicates end of function prologue and beginning of
+ /// function body.
DebugLoc PrologEndLoc;
- // If nonnull, stores the current machine function we're processing.
+ /// If nonnull, stores the current machine function we're processing.
const MachineFunction *CurFn;
- // If nonnull, stores the current machine instruction we're processing.
+ /// If nonnull, stores the current machine instruction we're processing.
const MachineInstr *CurMI;
- // If nonnull, stores the CU in which the previous subprogram was contained.
+ /// If nonnull, stores the CU in which the previous subprogram was contained.
const DwarfCompileUnit *PrevCU;
- // As an optimization, there is no need to emit an entry in the directory
- // table for the same directory as DW_AT_comp_dir.
+ /// As an optimization, there is no need to emit an entry in the directory
+ /// table for the same directory as DW_AT_comp_dir.
StringRef CompilationDir;
- // Holder for the file specific debug information.
+ /// Holder for the file specific debug information.
DwarfFile InfoHolder;
- // Holders for the various debug information flags that we might need to
- // have exposed. See accessor functions below for description.
+ /// Holders for the various debug information flags that we might need to
+ /// have exposed. See accessor functions below for description.
- // Holder for imported entities.
+ /// Holder for imported entities.
typedef SmallVector<std::pair<const MDNode *, const MDNode *>, 32>
ImportedEntityMap;
ImportedEntityMap ScopesWithImportedEntities;
- // Map from MDNodes for user-defined types to the type units that describe
- // them.
+ /// Map from MDNodes for user-defined types to the type units that
+ /// describe them.
DenseMap<const MDNode *, const DwarfTypeUnit *> DwarfTypeUnits;
SmallVector<
std::pair<std::unique_ptr<DwarfTypeUnit>, const DICompositeType *>, 1>
TypeUnitsUnderConstruction;
- // Whether to emit the pubnames/pubtypes sections.
+ /// Whether to emit the pubnames/pubtypes sections.
bool HasDwarfPubSections;
- // Whether or not to use AT_ranges for compilation units.
+ /// Whether or not to use AT_ranges for compilation units.
bool HasCURanges;
- // Whether we emitted a function into a section other than the default
- // text.
+ /// Whether we emitted a function into a section other than the
+ /// default text.
bool UsedNonDefaultText;
- // Whether to use the GNU TLS opcode (instead of the standard opcode).
+ /// Whether to use the GNU TLS opcode (instead of the standard opcode).
bool UseGNUTLSOpcode;
- // Version of dwarf we're emitting.
+ /// Version of dwarf we're emitting.
unsigned DwarfVersion;
- // Maps from a type identifier to the actual MDNode.
+ /// Maps from a type identifier to the actual MDNode.
DITypeIdentifierMap TypeIdentifierMap;
- // DWARF5 Experimental Options
+ /// DWARF5 Experimental Options
+ /// @{
bool HasDwarfAccelTables;
bool HasSplitDwarf;
- // Separated Dwarf Variables
- // In general these will all be for bits that are left in the
- // original object file, rather than things that are meant
- // to be in the .dwo sections.
+ /// Separated Dwarf Variables
+ /// In general these will all be for bits that are left in the
+ /// original object file, rather than things that are meant
+ /// to be in the .dwo sections.
- // Holder for the skeleton information.
+ /// Holder for the skeleton information.
DwarfFile SkeletonHolder;
- /// Store file names for type units under fission in a line table header that
- /// will be emitted into debug_line.dwo.
- // FIXME: replace this with a map from comp_dir to table so that we can emit
- // multiple tables during LTO each of which uses directory 0, referencing the
- // comp_dir of all the type units that use it.
+ /// Store file names for type units under fission in a line table
+ /// header that will be emitted into debug_line.dwo.
+ // FIXME: replace this with a map from comp_dir to table so that we
+ // can emit multiple tables during LTO each of which uses directory
+ // 0, referencing the comp_dir of all the type units that use it.
MCDwarfDwoLineTable SplitTypeUnitFileTable;
-
- // True iff there are multiple CUs in this module.
+ /// @}
+
+ /// True iff there are multiple CUs in this module.
bool SingleCU;
bool IsDarwin;
bool IsPS4;
@@ -354,7 +357,7 @@ class DwarfDebug : public AsmPrinterHandler {
typedef DbgValueHistoryMap::InlinedVariable InlinedVariable;
- /// \brief Find abstract variable associated with Var.
+ /// Find abstract variable associated with Var.
DbgVariable *getExistingAbstractVariable(InlinedVariable IV,
const DILocalVariable *&Cleansed);
DbgVariable *getExistingAbstractVariable(InlinedVariable IV);
@@ -366,56 +369,56 @@ class DwarfDebug : public AsmPrinterHandler {
DbgVariable *createConcreteVariable(LexicalScope &Scope, InlinedVariable IV);
- /// \brief Construct a DIE for this abstract scope.
+ /// Construct a DIE for this abstract scope.
void constructAbstractSubprogramScopeDIE(LexicalScope *Scope);
- /// \brief Compute the size and offset of a DIE given an incoming Offset.
+ /// Compute the size and offset of a DIE given an incoming Offset.
unsigned computeSizeAndOffset(DIE *Die, unsigned Offset);
- /// \brief Compute the size and offset of all the DIEs.
+ /// Compute the size and offset of all the DIEs.
void computeSizeAndOffsets();
- /// \brief Collect info for variables that were optimized out.
+ /// Collect info for variables that were optimized out.
void collectDeadVariables();
void finishVariableDefinitions();
void finishSubprogramDefinitions();
- /// \brief Finish off debug information after all functions have been
+ /// Finish off debug information after all functions have been
/// processed.
void finalizeModuleInfo();
- /// \brief Emit the debug info section.
+ /// Emit the debug info section.
void emitDebugInfo();
- /// \brief Emit the abbreviation section.
+ /// Emit the abbreviation section.
void emitAbbreviations();
- /// \brief Emit a specified accelerator table.
+ /// Emit a specified accelerator table.
void emitAccel(DwarfAccelTable &Accel, MCSection *Section,
StringRef TableName);
- /// \brief Emit visible names into a hashed accelerator table section.
+ /// Emit visible names into a hashed accelerator table section.
void emitAccelNames();
- /// \brief Emit objective C classes and categories into a hashed
+ /// Emit objective C classes and categories into a hashed
/// accelerator table section.
void emitAccelObjC();
- /// \brief Emit namespace dies into a hashed accelerator table.
+ /// Emit namespace dies into a hashed accelerator table.
void emitAccelNamespaces();
- /// \brief Emit type dies into a hashed accelerator table.
+ /// Emit type dies into a hashed accelerator table.
void emitAccelTypes();
- /// \brief Emit visible names into a debug pubnames section.
+ /// Emit visible names into a debug pubnames section.
/// \param GnuStyle determines whether or not we want to emit
/// additional information into the table ala newer gcc for gdb
/// index.
void emitDebugPubNames(bool GnuStyle = false);
- /// \brief Emit visible types into a debug pubtypes section.
+ /// Emit visible types into a debug pubtypes section.
/// \param GnuStyle determines whether or not we want to emit
/// additional information into the table ala newer gcc for gdb
/// index.
@@ -425,91 +428,91 @@ class DwarfDebug : public AsmPrinterHandler {
bool GnuStyle, MCSection *PSec, StringRef Name,
const StringMap<const DIE *> &(DwarfCompileUnit::*Accessor)() const);
- /// \brief Emit visible names into a debug str section.
+ /// Emit visible names into a debug str section.
void emitDebugStr();
- /// \brief Emit visible names into a debug loc section.
+ /// Emit visible names into a debug loc section.
void emitDebugLoc();
- /// \brief Emit visible names into a debug loc dwo section.
+ /// Emit visible names into a debug loc dwo section.
void emitDebugLocDWO();
- /// \brief Emit visible names into a debug aranges section.
+ /// Emit visible names into a debug aranges section.
void emitDebugARanges();
- /// \brief Emit visible names into a debug ranges section.
+ /// Emit visible names into a debug ranges section.
void emitDebugRanges();
- /// \brief Emit inline info using custom format.
+ /// Emit inline info using custom format.
void emitDebugInlineInfo();
/// DWARF 5 Experimental Split Dwarf Emitters
- /// \brief Initialize common features of skeleton units.
+ /// Initialize common features of skeleton units.
void initSkeletonUnit(const DwarfUnit &U, DIE &Die,
std::unique_ptr<DwarfUnit> NewU);
- /// \brief Construct the split debug info compile unit for the debug info
+ /// Construct the split debug info compile unit for the debug info
/// section.
DwarfCompileUnit &constructSkeletonCU(const DwarfCompileUnit &CU);
- /// \brief Construct the split debug info compile unit for the debug info
+ /// Construct the split debug info compile unit for the debug info
/// section.
DwarfTypeUnit &constructSkeletonTU(DwarfTypeUnit &TU);
- /// \brief Emit the debug info dwo section.
+ /// Emit the debug info dwo section.
void emitDebugInfoDWO();
- /// \brief Emit the debug abbrev dwo section.
+ /// Emit the debug abbrev dwo section.
void emitDebugAbbrevDWO();
- /// \brief Emit the debug line dwo section.
+ /// Emit the debug line dwo section.
void emitDebugLineDWO();
- /// \brief Emit the debug str dwo section.
+ /// Emit the debug str dwo section.
void emitDebugStrDWO();
/// Flags to let the linker know we have emitted new style pubnames. Only
/// emit it here if we don't have a skeleton CU for split dwarf.
void addGnuPubAttributes(DwarfUnit &U, DIE &D) const;
- /// \brief Create new DwarfCompileUnit for the given metadata node with tag
+ /// Create new DwarfCompileUnit for the given metadata node with tag
/// DW_TAG_compile_unit.
DwarfCompileUnit &constructDwarfCompileUnit(const DICompileUnit *DIUnit);
- /// \brief Construct imported_module or imported_declaration DIE.
+ /// Construct imported_module or imported_declaration DIE.
void constructAndAddImportedEntityDIE(DwarfCompileUnit &TheCU,
const DIImportedEntity *N);
- /// \brief Register a source line with debug info. Returns the unique
+ /// Register a source line with debug info. Returns the unique
/// label that was emitted and which provides correspondence to the
/// source line list.
void recordSourceLine(unsigned Line, unsigned Col, const MDNode *Scope,
unsigned Flags);
- /// \brief Indentify instructions that are marking the beginning of or
+ /// Identify instructions that are marking the beginning of or
/// ending of a scope.
void identifyScopeMarkers();
- /// \brief Populate LexicalScope entries with variables' info.
+ /// Populate LexicalScope entries with variables' info.
void collectVariableInfo(DwarfCompileUnit &TheCU, const DISubprogram *SP,
DenseSet<InlinedVariable> &ProcessedVars);
- /// \brief Build the location list for all DBG_VALUEs in the
+ /// Build the location list for all DBG_VALUEs in the
/// function that describe the same variable.
void buildLocationList(SmallVectorImpl<DebugLocEntry> &DebugLoc,
const DbgValueHistoryMap::InstrRanges &Ranges);
- /// \brief Collect variable information from the side table maintained
+ /// Collect variable information from the side table maintained
/// by MMI.
void collectVariableInfoFromMMITable(DenseSet<InlinedVariable> &P);
- /// \brief Ensure that a label will be emitted before MI.
+ /// Ensure that a label will be emitted before MI.
void requestLabelBeforeInsn(const MachineInstr *MI) {
LabelsBeforeInsn.insert(std::make_pair(MI, nullptr));
}
- /// \brief Ensure that a label will be emitted after MI.
+ /// Ensure that a label will be emitted after MI.
void requestLabelAfterInsn(const MachineInstr *MI) {
LabelsAfterInsn.insert(std::make_pair(MI, nullptr));
}
@@ -522,50 +525,50 @@ public:
~DwarfDebug() override;
- /// \brief Emit all Dwarf sections that should come prior to the
+ /// Emit all Dwarf sections that should come prior to the
/// content.
void beginModule();
- /// \brief Emit all Dwarf sections that should come after the content.
+ /// Emit all Dwarf sections that should come after the content.
void endModule() override;
- /// \brief Gather pre-function debug information.
+ /// Gather pre-function debug information.
void beginFunction(const MachineFunction *MF) override;
- /// \brief Gather and emit post-function debug information.
+ /// Gather and emit post-function debug information.
void endFunction(const MachineFunction *MF) override;
- /// \brief Process beginning of an instruction.
+ /// Process beginning of an instruction.
void beginInstruction(const MachineInstr *MI) override;
- /// \brief Process end of an instruction.
+ /// Process end of an instruction.
void endInstruction() override;
- /// \brief Add a DIE to the set of types that we're going to pull into
+ /// Add a DIE to the set of types that we're going to pull into
/// type units.
void addDwarfTypeUnitType(DwarfCompileUnit &CU, StringRef Identifier,
DIE &Die, const DICompositeType *CTy);
- /// \brief Add a label so that arange data can be generated for it.
+ /// Add a label so that arange data can be generated for it.
void addArangeLabel(SymbolCU SCU) { ArangeLabels.push_back(SCU); }
- /// \brief For symbols that have a size designated (e.g. common symbols),
+ /// For symbols that have a size designated (e.g. common symbols),
/// this tracks that size.
void setSymbolSize(const MCSymbol *Sym, uint64_t Size) override {
SymSize[Sym] = Size;
}
- /// \brief Returns whether to use DW_OP_GNU_push_tls_address, instead of the
+ /// Returns whether to use DW_OP_GNU_push_tls_address, instead of the
/// standard DW_OP_form_tls_address opcode
bool useGNUTLSOpcode() const { return UseGNUTLSOpcode; }
// Experimental DWARF5 features.
- /// \brief Returns whether or not to emit tables that dwarf consumers can
+ /// Returns whether or not to emit tables that dwarf consumers can
/// use to accelerate lookup.
bool useDwarfAccelTables() const { return HasDwarfAccelTables; }
- /// \brief Returns whether or not to change the current debug info for the
+ /// Returns whether or not to change the current debug info for the
/// split dwarf proposal support.
bool useSplitDwarf() const { return HasSplitDwarf; }
@@ -579,7 +582,7 @@ public:
/// Returns the entries for the .debug_loc section.
const DebugLocStream &getDebugLocs() const { return DebugLocs; }
- /// \brief Emit an entry for the debug loc section. This can be used to
+ /// Emit an entry for the debug loc section. This can be used to
/// handle an entry that's going to be emitted into the debug loc section.
void emitDebugLocEntry(ByteStreamer &Streamer,
const DebugLocStream::Entry &Entry);
@@ -592,7 +595,7 @@ public:
return Ref.resolve(TypeIdentifierMap);
}
- /// \brief Return the TypeIdentifierMap.
+ /// Return the TypeIdentifierMap.
const DITypeIdentifierMap &getTypeIdentifierMap() const {
return TypeIdentifierMap;
}
@@ -627,14 +630,14 @@ public:
less_first()));
}
- /// \brief A helper function to check whether the DIE for a given Scope is
+ /// A helper function to check whether the DIE for a given Scope is
/// going to be null.
bool isLexicalScopeDIENull(LexicalScope *Scope);
- /// \brief Return Label preceding the instruction.
+ /// Return Label preceding the instruction.
MCSymbol *getLabelBeforeInsn(const MachineInstr *MI);
- /// \brief Return Label immediately following the instruction.
+ /// Return Label immediately following the instruction.
MCSymbol *getLabelAfterInsn(const MachineInstr *MI);
// FIXME: Sink these functions down into DwarfFile/Dwarf*Unit.
diff --git a/lib/CodeGen/AsmPrinter/DwarfUnit.h b/lib/CodeGen/AsmPrinter/DwarfUnit.h
index 4000ae48a856..44d9d2245dda 100644
--- a/lib/CodeGen/AsmPrinter/DwarfUnit.h
+++ b/lib/CodeGen/AsmPrinter/DwarfUnit.h
@@ -113,7 +113,7 @@ protected:
DwarfUnit(unsigned UID, dwarf::Tag, const DICompileUnit *CU, AsmPrinter *A,
DwarfDebug *DW, DwarfFile *DWU);
- /// \brief Add a string attribute data and value.
+ /// Add a string attribute data and value.
///
/// This is guaranteed to be in the local string pool instead of indirected.
void addLocalString(DIE &Die, dwarf::Attribute Attribute, StringRef Str);
@@ -142,10 +142,10 @@ public:
unsigned getDebugInfoOffset() const { return DebugInfoOffset; }
void setDebugInfoOffset(unsigned DbgInfoOff) { DebugInfoOffset = DbgInfoOff; }
- /// \brief Return true if this compile unit has something to write out.
+ /// Return true if this compile unit has something to write out.
bool hasContent() const { return UnitDie.hasChildren(); }
- /// \brief Get string containing language specific context for a global name.
+ /// Get string containing language specific context for a global name.
///
/// Walks the metadata parent chain in a language specific manner (using the
/// compile unit language) and returns it as a string. This is done at the
@@ -162,42 +162,42 @@ public:
virtual void addGlobalType(const DIType *Ty, const DIE &Die,
const DIScope *Context) {}
- /// \brief Add a new name to the namespace accelerator table.
+ /// Add a new name to the namespace accelerator table.
void addAccelNamespace(StringRef Name, const DIE &Die);
- /// \brief Returns the DIE map slot for the specified debug variable.
+ /// Returns the DIE map slot for the specified debug variable.
///
/// We delegate the request to DwarfDebug when the MDNode can be part of the
/// type system, since DIEs for the type system can be shared across CUs and
/// the mappings are kept in DwarfDebug.
DIE *getDIE(const DINode *D) const;
- /// \brief Returns a fresh newly allocated DIELoc.
+ /// Returns a fresh newly allocated DIELoc.
DIELoc *getDIELoc() { return new (DIEValueAllocator) DIELoc; }
- /// \brief Insert DIE into the map.
+ /// Insert DIE into the map.
///
/// We delegate the request to DwarfDebug when the MDNode can be part of the
/// type system, since DIEs for the type system can be shared across CUs and
/// the mappings are kept in DwarfDebug.
void insertDIE(const DINode *Desc, DIE *D);
- /// \brief Add a flag that is true to the DIE.
+ /// Add a flag that is true to the DIE.
void addFlag(DIE &Die, dwarf::Attribute Attribute);
- /// \brief Add an unsigned integer attribute data and value.
+ /// Add an unsigned integer attribute data and value.
void addUInt(DIE &Die, dwarf::Attribute Attribute, Optional<dwarf::Form> Form,
uint64_t Integer);
void addUInt(DIE &Block, dwarf::Form Form, uint64_t Integer);
- /// \brief Add an signed integer attribute data and value.
+ /// Add a signed integer attribute data and value.
void addSInt(DIE &Die, dwarf::Attribute Attribute, Optional<dwarf::Form> Form,
int64_t Integer);
void addSInt(DIELoc &Die, Optional<dwarf::Form> Form, int64_t Integer);
- /// \brief Add a string attribute data and value.
+ /// Add a string attribute data and value.
///
/// We always emit a reference to the string pool instead of immediate
/// strings so that DIEs have more predictable sizes. In the case of split
@@ -205,38 +205,38 @@ public:
/// into the string table.
void addString(DIE &Die, dwarf::Attribute Attribute, StringRef Str);
- /// \brief Add a Dwarf label attribute data and value.
+ /// Add a Dwarf label attribute data and value.
DIE::value_iterator addLabel(DIE &Die, dwarf::Attribute Attribute,
dwarf::Form Form, const MCSymbol *Label);
void addLabel(DIELoc &Die, dwarf::Form Form, const MCSymbol *Label);
- /// \brief Add an offset into a section attribute data and value.
+ /// Add an offset into a section attribute data and value.
void addSectionOffset(DIE &Die, dwarf::Attribute Attribute, uint64_t Integer);
- /// \brief Add a dwarf op address data and value using the form given and an
+ /// Add a dwarf op address data and value using the form given and an
/// op of either DW_FORM_addr or DW_FORM_GNU_addr_index.
void addOpAddress(DIELoc &Die, const MCSymbol *Label);
- /// \brief Add a label delta attribute data and value.
+ /// Add a label delta attribute data and value.
void addLabelDelta(DIE &Die, dwarf::Attribute Attribute, const MCSymbol *Hi,
const MCSymbol *Lo);
- /// \brief Add a DIE attribute data and value.
+ /// Add a DIE attribute data and value.
void addDIEEntry(DIE &Die, dwarf::Attribute Attribute, DIE &Entry);
- /// \brief Add a DIE attribute data and value.
+ /// Add a DIE attribute data and value.
void addDIEEntry(DIE &Die, dwarf::Attribute Attribute, DIEEntry Entry);
void addDIETypeSignature(DIE &Die, const DwarfTypeUnit &Type);
- /// \brief Add block data.
+ /// Add block data.
void addBlock(DIE &Die, dwarf::Attribute Attribute, DIELoc *Block);
- /// \brief Add block data.
+ /// Add block data.
void addBlock(DIE &Die, dwarf::Attribute Attribute, DIEBlock *Block);
- /// \brief Add location information to specified debug information entry.
+ /// Add location information to specified debug information entry.
void addSourceLine(DIE &Die, unsigned Line, StringRef File,
StringRef Directory);
void addSourceLine(DIE &Die, const DILocalVariable *V);
@@ -246,30 +246,30 @@ public:
void addSourceLine(DIE &Die, const DINamespace *NS);
void addSourceLine(DIE &Die, const DIObjCProperty *Ty);
- /// \brief Add constant value entry in variable DIE.
+ /// Add constant value entry in variable DIE.
void addConstantValue(DIE &Die, const MachineOperand &MO, const DIType *Ty);
void addConstantValue(DIE &Die, const ConstantInt *CI, const DIType *Ty);
void addConstantValue(DIE &Die, const APInt &Val, const DIType *Ty);
void addConstantValue(DIE &Die, const APInt &Val, bool Unsigned);
void addConstantValue(DIE &Die, bool Unsigned, uint64_t Val);
- /// \brief Add constant value entry in variable DIE.
+ /// Add constant value entry in variable DIE.
void addConstantFPValue(DIE &Die, const MachineOperand &MO);
void addConstantFPValue(DIE &Die, const ConstantFP *CFP);
- /// \brief Add a linkage name, if it isn't empty.
+ /// Add a linkage name, if it isn't empty.
void addLinkageName(DIE &Die, StringRef LinkageName);
- /// \brief Add template parameters in buffer.
+ /// Add template parameters in buffer.
void addTemplateParams(DIE &Buffer, DINodeArray TParams);
- /// \brief Add register operand.
+ /// Add register operand.
/// \returns false if the register does not exist, e.g., because it was never
/// materialized.
bool addRegisterOpPiece(DIELoc &TheDie, unsigned Reg,
unsigned SizeInBits = 0, unsigned OffsetInBits = 0);
- /// \brief Add register offset.
+ /// Add register offset.
/// \returns false if the register does not exist, e.g., because it was never
/// materialized.
bool addRegisterOffset(DIELoc &TheDie, unsigned Reg, int64_t Offset);
@@ -283,7 +283,7 @@ public:
dwarf::Attribute Attribute,
const MachineLocation &Location);
- /// \brief Add a new type attribute to the specified entity.
+ /// Add a new type attribute to the specified entity.
///
/// This takes an attribute parameter because DW_AT_friend attributes are
/// also type references.
@@ -297,19 +297,19 @@ public:
void applySubprogramAttributes(const DISubprogram *SP, DIE &SPDie,
bool Minimal = false);
- /// \brief Find existing DIE or create new DIE for the given type.
+ /// Find existing DIE or create new DIE for the given type.
DIE *getOrCreateTypeDIE(const MDNode *N);
- /// \brief Get context owner's DIE.
+ /// Get context owner's DIE.
DIE *createTypeDIE(const DICompositeType *Ty);
- /// \brief Get context owner's DIE.
+ /// Get context owner's DIE.
DIE *getOrCreateContextDIE(const DIScope *Context);
- /// \brief Construct DIEs for types that contain vtables.
+ /// Construct DIEs for types that contain vtables.
void constructContainingTypeDIEs();
- /// \brief Construct function argument DIEs.
+ /// Construct function argument DIEs.
void constructSubprogramArguments(DIE &Buffer, DITypeRefArray Args);
/// Create a DIE with the given Tag, add the DIE to its parent, and
@@ -332,14 +332,14 @@ public:
void constructTypeDIE(DIE &Buffer, const DICompositeType *CTy);
protected:
- /// \brief Create new static data member DIE.
+ /// Create new static data member DIE.
DIE *getOrCreateStaticMemberDIE(const DIDerivedType *DT);
/// Look up the source ID with the given directory and source file names. If
/// none currently exists, create a new ID and insert it in the line table.
virtual unsigned getOrCreateSourceID(StringRef File, StringRef Directory) = 0;
- /// \brief Look in the DwarfDebug map for the MDNode that corresponds to the
+ /// Look in the DwarfDebug map for the MDNode that corresponds to the
/// reference.
template <typename T> T *resolve(TypedDINodeRef<T> Ref) const {
return DD->resolve(Ref);
@@ -358,15 +358,15 @@ private:
void constructTemplateValueParameterDIE(DIE &Buffer,
const DITemplateValueParameter *TVP);
- /// \brief Return the default lower bound for an array.
+ /// Return the default lower bound for an array.
///
/// If the DWARF version doesn't handle the language, return -1.
int64_t getDefaultLowerBound() const;
- /// \brief Get an anonymous type for index type.
+ /// Get an anonymous type for index type.
DIE *getIndexTyDie();
- /// \brief Set D as anonymous type for index which can be reused later.
+ /// Set D as anonymous type for index which can be reused later.
void setIndexTyDie(DIE *D) { IndexTyDie = D; }
/// If this is a named finished type then include it in the list of types for
diff --git a/lib/CodeGen/AsmPrinter/EHStreamer.cpp b/lib/CodeGen/AsmPrinter/EHStreamer.cpp
index 1be3fd74d602..49ef8d3ddc8f 100644
--- a/lib/CodeGen/AsmPrinter/EHStreamer.cpp
+++ b/lib/CodeGen/AsmPrinter/EHStreamer.cpp
@@ -309,7 +309,7 @@ computeCallSiteTable(SmallVectorImpl<CallSiteEntry> &CallSites,
// If some instruction between the previous try-range and the end of the
// function may throw, create a call-site entry with no landing pad for the
// region following the try-range.
- if (SawPotentiallyThrowing && !IsSJLJ) {
+ if (SawPotentiallyThrowing && !IsSJLJ && LastLabel != nullptr) {
CallSiteEntry Site = { LastLabel, nullptr, nullptr, 0 };
CallSites.push_back(Site);
}
diff --git a/lib/CodeGen/AsmPrinter/WinCodeViewLineTables.cpp b/lib/CodeGen/AsmPrinter/WinCodeViewLineTables.cpp
index 535b1f605853..6610ac78f8c4 100644
--- a/lib/CodeGen/AsmPrinter/WinCodeViewLineTables.cpp
+++ b/lib/CodeGen/AsmPrinter/WinCodeViewLineTables.cpp
@@ -97,7 +97,7 @@ void WinCodeViewLineTables::maybeRecordLocation(DebugLoc DL,
MCSymbol *MCL = Asm->MMI->getContext().createTempSymbol();
Asm->OutStreamer->EmitLabel(MCL);
CurFn->Instrs.push_back(MCL);
- InstrInfo[MCL] = InstrInfoTy(Filename, DL.getLine());
+ InstrInfo[MCL] = InstrInfoTy(Filename, DL.getLine(), DL.getCol());
}
WinCodeViewLineTables::WinCodeViewLineTables(AsmPrinter *AP)
@@ -264,22 +264,38 @@ void WinCodeViewLineTables::emitDebugInfoForFunction(const Function *GV) {
// Identify the function this subsection is for.
Asm->OutStreamer->EmitCOFFSecRel32(Fn);
Asm->OutStreamer->EmitCOFFSectionIndex(Fn);
- // Insert padding after a 16-bit section index.
- Asm->EmitInt16(0);
+ // Insert flags after a 16-bit section index.
+ Asm->EmitInt16(COFF::DEBUG_LINE_TABLES_HAVE_COLUMN_RECORDS);
// Length of the function's code, in bytes.
EmitLabelDiff(*Asm->OutStreamer, Fn, FI.End);
// PC-to-linenumber lookup table:
MCSymbol *FileSegmentEnd = nullptr;
+
+ // The start of the last segment:
+ size_t LastSegmentStart = 0;
+
+ auto FinishPreviousChunk = [&] {
+ if (!FileSegmentEnd)
+ return;
+ for (size_t ColSegI = LastSegmentStart,
+ ColSegEnd = ColSegI + FilenameSegmentLengths[LastSegmentStart];
+ ColSegI != ColSegEnd; ++ColSegI) {
+ unsigned ColumnNumber = InstrInfo[FI.Instrs[ColSegI]].ColumnNumber;
+ Asm->EmitInt16(ColumnNumber); // Start column
+ Asm->EmitInt16(ColumnNumber); // End column
+ }
+ Asm->OutStreamer->EmitLabel(FileSegmentEnd);
+ };
+
for (size_t J = 0, F = FI.Instrs.size(); J != F; ++J) {
MCSymbol *Instr = FI.Instrs[J];
assert(InstrInfo.count(Instr));
if (FilenameSegmentLengths.count(J)) {
// We came to a beginning of a new filename segment.
- if (FileSegmentEnd)
- Asm->OutStreamer->EmitLabel(FileSegmentEnd);
+ FinishPreviousChunk();
StringRef CurFilename = InstrInfo[FI.Instrs[J]].Filename;
assert(FileNameRegistry.Infos.count(CurFilename));
size_t IndexInStringTable =
@@ -300,6 +316,7 @@ void WinCodeViewLineTables::emitDebugInfoForFunction(const Function *GV) {
// records.
FileSegmentEnd = Asm->MMI->getContext().createTempSymbol();
EmitLabelDiff(*Asm->OutStreamer, FileSegmentBegin, FileSegmentEnd);
+ LastSegmentStart = J;
}
// The first PC with the given linenumber and the linenumber itself.
@@ -307,8 +324,7 @@ void WinCodeViewLineTables::emitDebugInfoForFunction(const Function *GV) {
Asm->EmitInt32(InstrInfo[Instr].LineNumber);
}
- if (FileSegmentEnd)
- Asm->OutStreamer->EmitLabel(FileSegmentEnd);
+ FinishPreviousChunk();
Asm->OutStreamer->EmitLabel(LineTableEnd);
}
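
The CodeView hunks above switch the debug line tables to the column-bearing format: the padding word after the 16-bit section index now carries COFF::DEBUG_LINE_TABLES_HAVE_COLUMN_RECORDS, and each filename segment is closed by FinishPreviousChunk, which appends a start/end column pair for every instruction in the segment before emitting the segment-end label. Below is a standalone sketch of the per-segment column stream; Entry and the helper are illustrative stand-ins, not the CodeView emitter itself:

    // Standalone sketch of the segment layout produced above: each filename
    // segment's line records are followed by one (start, end) column pair per
    // instruction, and only then is the segment closed.
    #include <cstdint>
    #include <vector>

    struct Entry { uint32_t Line; uint16_t Column; };

    // Returns the flattened 16-bit column stream for one segment.
    std::vector<uint16_t> emitColumnsForSegment(const std::vector<Entry> &Seg) {
      std::vector<uint16_t> Out;
      for (const Entry &E : Seg) {
        Out.push_back(E.Column); // start column
        Out.push_back(E.Column); // end column (the patch emits the same value twice)
      }
      return Out;
    }

    int main() {
      std::vector<Entry> Segment = {{10, 3}, {11, 7}};
      auto Cols = emitColumnsForSegment(Segment);
      return Cols.size() == 4 ? 0 : 1;
    }
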
diff --git a/lib/CodeGen/AsmPrinter/WinCodeViewLineTables.h b/lib/CodeGen/AsmPrinter/WinCodeViewLineTables.h
index a5b399f73707..43d1a432712e 100644
--- a/lib/CodeGen/AsmPrinter/WinCodeViewLineTables.h
+++ b/lib/CodeGen/AsmPrinter/WinCodeViewLineTables.h
@@ -52,11 +52,13 @@ class LLVM_LIBRARY_VISIBILITY WinCodeViewLineTables : public AsmPrinterHandler {
struct InstrInfoTy {
StringRef Filename;
unsigned LineNumber;
+ unsigned ColumnNumber;
- InstrInfoTy() : LineNumber(0) {}
+ InstrInfoTy() : LineNumber(0), ColumnNumber(0) {}
- InstrInfoTy(StringRef Filename, unsigned LineNumber)
- : Filename(Filename), LineNumber(LineNumber) {}
+ InstrInfoTy(StringRef Filename, unsigned LineNumber, unsigned ColumnNumber)
+ : Filename(Filename), LineNumber(LineNumber),
+ ColumnNumber(ColumnNumber) {}
};
DenseMap<MCSymbol *, InstrInfoTy> InstrInfo;
diff --git a/lib/CodeGen/AsmPrinter/WinException.cpp b/lib/CodeGen/AsmPrinter/WinException.cpp
index 79830bc3443b..71c77815e281 100644
--- a/lib/CodeGen/AsmPrinter/WinException.cpp
+++ b/lib/CodeGen/AsmPrinter/WinException.cpp
@@ -70,19 +70,27 @@ void WinException::beginFunction(const MachineFunction *MF) {
const TargetLoweringObjectFile &TLOF = Asm->getObjFileLowering();
unsigned PerEncoding = TLOF.getPersonalityEncoding();
- const Function *Per = MMI->getPersonality();
+ const Function *Per = nullptr;
+ if (F->hasPersonalityFn())
+ Per = dyn_cast<Function>(F->getPersonalityFn()->stripPointerCasts());
- shouldEmitPersonality = hasLandingPads &&
- PerEncoding != dwarf::DW_EH_PE_omit && Per;
+ bool forceEmitPersonality =
+ F->hasPersonalityFn() && !isNoOpWithoutInvoke(classifyEHPersonality(Per)) &&
+ F->needsUnwindTableEntry();
+
+ shouldEmitPersonality = forceEmitPersonality || (hasLandingPads &&
+ PerEncoding != dwarf::DW_EH_PE_omit && Per);
unsigned LSDAEncoding = TLOF.getLSDAEncoding();
shouldEmitLSDA = shouldEmitPersonality &&
LSDAEncoding != dwarf::DW_EH_PE_omit;
- // If we're not using CFI, we don't want the CFI or the personality. Emit the
- // LSDA if this is the parent function.
+ // If we're not using CFI, we don't want the CFI or the personality. If
+ // WinEHPrepare outlined something, we should emit the LSDA.
if (!Asm->MAI->usesWindowsCFI()) {
- shouldEmitLSDA = (hasLandingPads && F == ParentF);
+ bool HasOutlinedChildren =
+ F->hasFnAttribute("wineh-parent") && F == ParentF;
+ shouldEmitLSDA = HasOutlinedChildren;
shouldEmitPersonality = false;
return;
}
@@ -121,7 +129,10 @@ void WinException::endFunction(const MachineFunction *MF) {
if (!shouldEmitPersonality && !shouldEmitMoves && !shouldEmitLSDA)
return;
- EHPersonality Per = MMI->getPersonalityType();
+ const Function *F = MF->getFunction();
+ EHPersonality Per = EHPersonality::Unknown;
+ if (F->hasPersonalityFn())
+ Per = classifyEHPersonality(F->getPersonalityFn());
// Get rid of any dead landing pads if we're not using a Windows EH scheme. In
// Windows EH schemes, the landing pad is not actually reachable. It only
@@ -350,6 +361,7 @@ void WinException::emitCXXFrameHandler3Table(const MachineFunction *MF) {
// EHFlags & 1 -> Synchronous exceptions only, no async exceptions.
// EHFlags & 2 -> ???
// EHFlags & 4 -> The function is noexcept(true), unwinding can't continue.
+ OS.EmitValueToAlignment(4);
OS.EmitLabel(FuncInfoXData);
OS.EmitIntValue(0x19930522, 4); // MagicNumber
OS.EmitIntValue(FuncInfo.UnwindMap.size(), 4); // MaxState
@@ -555,7 +567,7 @@ void WinException::emitEHRegistrationOffsetLabel(const WinEHFuncInfo &FuncInfo,
// we've code generated the parent, we can emit the label assignment that
// those helpers use to get the offset of the registration node.
assert(FuncInfo.EHRegNodeEscapeIndex != INT_MAX &&
- "no EH reg node frameescape index");
+ "no EH reg node localescape index");
MCSymbol *ParentFrameOffset =
Asm->OutContext.getOrCreateParentFrameOffsetSymbol(FLinkageName);
MCSymbol *RegistrationOffsetSym = Asm->OutContext.getOrCreateFrameAllocSymbol(
@@ -578,9 +590,11 @@ void WinException::emitExceptHandlerTable(const MachineFunction *MF) {
// Emit the __ehtable label that we use for llvm.x86.seh.lsda.
MCSymbol *LSDALabel = Asm->OutContext.getOrCreateLSDASymbol(FLinkageName);
+ OS.EmitValueToAlignment(4);
OS.EmitLabel(LSDALabel);
- const Function *Per = MMI->getPersonality();
+ const Function *Per =
+ dyn_cast<Function>(F->getPersonalityFn()->stripPointerCasts());
StringRef PerName = Per->getName();
int BaseState = -1;
if (PerName == "_except_handler4") {
diff --git a/lib/CodeGen/BasicTargetTransformInfo.cpp b/lib/CodeGen/BasicTargetTransformInfo.cpp
index 82f5c482408a..db00910cd018 100644
--- a/lib/CodeGen/BasicTargetTransformInfo.cpp
+++ b/lib/CodeGen/BasicTargetTransformInfo.cpp
@@ -34,4 +34,5 @@ cl::opt<unsigned>
cl::Hidden);
BasicTTIImpl::BasicTTIImpl(const TargetMachine *TM, Function &F)
- : BaseT(TM), ST(TM->getSubtargetImpl(F)), TLI(ST->getTargetLowering()) {}
+ : BaseT(TM, F.getParent()->getDataLayout()), ST(TM->getSubtargetImpl(F)),
+ TLI(ST->getTargetLowering()) {}
diff --git a/lib/CodeGen/CodeGenPrepare.cpp b/lib/CodeGen/CodeGenPrepare.cpp
index 70de4e7ebd11..6ab6acc03722 100644
--- a/lib/CodeGen/CodeGenPrepare.cpp
+++ b/lib/CodeGen/CodeGenPrepare.cpp
@@ -147,10 +147,13 @@ class TypePromotionTransaction;
/// OptSize - True if optimizing for size.
bool OptSize;
+ /// DataLayout for the Function being processed.
+ const DataLayout *DL;
+
public:
static char ID; // Pass identification, replacement for typeid
explicit CodeGenPrepare(const TargetMachine *TM = nullptr)
- : FunctionPass(ID), TM(TM), TLI(nullptr), TTI(nullptr) {
+ : FunctionPass(ID), TM(TM), TLI(nullptr), TTI(nullptr), DL(nullptr) {
initializeCodeGenPreparePass(*PassRegistry::getPassRegistry());
}
bool runOnFunction(Function &F) override;
@@ -203,6 +206,8 @@ bool CodeGenPrepare::runOnFunction(Function &F) {
if (skipOptnoneFunction(F))
return false;
+ DL = &F.getParent()->getDataLayout();
+
bool EverMadeChange = false;
// Clear per function information.
InsertedInsts.clear();
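
From here on, CodeGenPrepare caches the module's DataLayout once per function and threads it explicitly into its helpers (OptimizeNoopCopyExpression, SinkShiftAndTruncate, OptimizeExtractBits, the addressing-mode matcher, and so on) instead of reaching through TLI->getDataLayout(). Below is a standalone sketch of that dependency-threading pattern; Layout and Pass are illustrative stand-ins, not the LLVM classes:

    // Minimal illustration of the refactor above: layout queries move from an
    // implicit, lowering-owned pointer to an explicit parameter the pass caches
    // once per function and passes down to helpers.
    #include <cstdint>

    struct Layout { uint64_t PointerBits; };

    // Helpers now receive the layout explicitly instead of fetching it themselves.
    bool intIsPointerSized(const Layout &DL, uint64_t IntBits) {
      return IntBits == DL.PointerBits;
    }

    struct Pass {
      const Layout *DL = nullptr;                    // cached in runOnFunction()
      void runOnFunction(const Layout &ModuleLayout) { DL = &ModuleLayout; }
      bool optimizeIntToPtr(uint64_t IntBits) const {
        return intIsPointerSized(*DL, IntBits);      // helpers get *DL threaded in
      }
    };

    int main() {
      Layout L{64};
      Pass P;
      P.runOnFunction(L);
      return P.optimizeIntToPtr(64) ? 0 : 1;
    }
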
@@ -753,10 +758,11 @@ static bool SinkCast(CastInst *CI) {
///
/// Return true if any changes are made.
///
-static bool OptimizeNoopCopyExpression(CastInst *CI, const TargetLowering &TLI){
+static bool OptimizeNoopCopyExpression(CastInst *CI, const TargetLowering &TLI,
+ const DataLayout &DL) {
// If this is a noop copy,
- EVT SrcVT = TLI.getValueType(CI->getOperand(0)->getType());
- EVT DstVT = TLI.getValueType(CI->getType());
+ EVT SrcVT = TLI.getValueType(DL, CI->getOperand(0)->getType());
+ EVT DstVT = TLI.getValueType(DL, CI->getType());
// This is an fp<->int conversion?
if (SrcVT.isInteger() != DstVT.isInteger())
@@ -921,7 +927,7 @@ static bool isExtractBitsCandidateUse(Instruction *User) {
static bool
SinkShiftAndTruncate(BinaryOperator *ShiftI, Instruction *User, ConstantInt *CI,
DenseMap<BasicBlock *, BinaryOperator *> &InsertedShifts,
- const TargetLowering &TLI) {
+ const TargetLowering &TLI, const DataLayout &DL) {
BasicBlock *UserBB = User->getParent();
DenseMap<BasicBlock *, CastInst *> InsertedTruncs;
TruncInst *TruncI = dyn_cast<TruncInst>(User);
@@ -947,7 +953,7 @@ SinkShiftAndTruncate(BinaryOperator *ShiftI, Instruction *User, ConstantInt *CI,
// approximation; some nodes' legality is determined by the
// operand or other means. There's no good way to find out though.
if (TLI.isOperationLegalOrCustom(
- ISDOpcode, TLI.getValueType(TruncUser->getType(), true)))
+ ISDOpcode, TLI.getValueType(DL, TruncUser->getType(), true)))
continue;
// Don't bother for PHI nodes.
@@ -1005,13 +1011,14 @@ SinkShiftAndTruncate(BinaryOperator *ShiftI, Instruction *User, ConstantInt *CI,
/// instruction.
/// Return true if any changes are made.
static bool OptimizeExtractBits(BinaryOperator *ShiftI, ConstantInt *CI,
- const TargetLowering &TLI) {
+ const TargetLowering &TLI,
+ const DataLayout &DL) {
BasicBlock *DefBB = ShiftI->getParent();
/// Only insert instructions in each block once.
DenseMap<BasicBlock *, BinaryOperator *> InsertedShifts;
- bool shiftIsLegal = TLI.isTypeLegal(TLI.getValueType(ShiftI->getType()));
+ bool shiftIsLegal = TLI.isTypeLegal(TLI.getValueType(DL, ShiftI->getType()));
bool MadeChange = false;
for (Value::user_iterator UI = ShiftI->user_begin(), E = ShiftI->user_end();
@@ -1048,9 +1055,10 @@ static bool OptimizeExtractBits(BinaryOperator *ShiftI, ConstantInt *CI,
if (isa<TruncInst>(User) && shiftIsLegal
// If the type of the truncate is legal, no truncate will be
// introduced in other basic blocks.
- && (!TLI.isTypeLegal(TLI.getValueType(User->getType()))))
+ &&
+ (!TLI.isTypeLegal(TLI.getValueType(DL, User->getType()))))
MadeChange =
- SinkShiftAndTruncate(ShiftI, User, CI, InsertedShifts, TLI);
+ SinkShiftAndTruncate(ShiftI, User, CI, InsertedShifts, TLI, DL);
continue;
}
@@ -1307,12 +1315,10 @@ bool CodeGenPrepare::OptimizeCallInst(CallInst *CI, bool& ModifiedDT) {
return true;
}
- const DataLayout *TD = TLI ? TLI->getDataLayout() : nullptr;
-
// Align the pointer arguments to this call if the target thinks it's a good
// idea
unsigned MinSize, PrefAlign;
- if (TLI && TD && TLI->shouldAlignPointerArgs(CI, MinSize, PrefAlign)) {
+ if (TLI && TLI->shouldAlignPointerArgs(CI, MinSize, PrefAlign)) {
for (auto &Arg : CI->arg_operands()) {
// We want to align both objects whose address is used directly and
// objects whose address is used in casts and GEPs, though it only makes
@@ -1320,36 +1326,34 @@ bool CodeGenPrepare::OptimizeCallInst(CallInst *CI, bool& ModifiedDT) {
// if size - offset meets the size threshold.
if (!Arg->getType()->isPointerTy())
continue;
- APInt Offset(TD->getPointerSizeInBits(
- cast<PointerType>(Arg->getType())->getAddressSpace()), 0);
- Value *Val = Arg->stripAndAccumulateInBoundsConstantOffsets(*TD, Offset);
+ APInt Offset(DL->getPointerSizeInBits(
+ cast<PointerType>(Arg->getType())->getAddressSpace()),
+ 0);
+ Value *Val = Arg->stripAndAccumulateInBoundsConstantOffsets(*DL, Offset);
uint64_t Offset2 = Offset.getLimitedValue();
if ((Offset2 & (PrefAlign-1)) != 0)
continue;
AllocaInst *AI;
- if ((AI = dyn_cast<AllocaInst>(Val)) &&
- AI->getAlignment() < PrefAlign &&
- TD->getTypeAllocSize(AI->getAllocatedType()) >= MinSize + Offset2)
+ if ((AI = dyn_cast<AllocaInst>(Val)) && AI->getAlignment() < PrefAlign &&
+ DL->getTypeAllocSize(AI->getAllocatedType()) >= MinSize + Offset2)
AI->setAlignment(PrefAlign);
// Global variables can only be aligned if they are defined in this
// object (i.e. they are uniquely initialized in this object), and
// over-aligning global variables that have an explicit section is
// forbidden.
GlobalVariable *GV;
- if ((GV = dyn_cast<GlobalVariable>(Val)) &&
- GV->hasUniqueInitializer() &&
- !GV->hasSection() &&
- GV->getAlignment() < PrefAlign &&
- TD->getTypeAllocSize(
- GV->getType()->getElementType()) >= MinSize + Offset2)
+ if ((GV = dyn_cast<GlobalVariable>(Val)) && GV->hasUniqueInitializer() &&
+ !GV->hasSection() && GV->getAlignment() < PrefAlign &&
+ DL->getTypeAllocSize(GV->getType()->getElementType()) >=
+ MinSize + Offset2)
GV->setAlignment(PrefAlign);
}
// If this is a memcpy (or similar) then we may be able to improve the
// alignment
if (MemIntrinsic *MI = dyn_cast<MemIntrinsic>(CI)) {
- unsigned Align = getKnownAlignment(MI->getDest(), *TD);
+ unsigned Align = getKnownAlignment(MI->getDest(), *DL);
if (MemTransferInst *MTI = dyn_cast<MemTransferInst>(MI))
- Align = std::min(Align, getKnownAlignment(MTI->getSource(), *TD));
+ Align = std::min(Align, getKnownAlignment(MTI->getSource(), *DL));
if (Align > MI->getAlignment())
MI->setAlignment(ConstantInt::get(MI->getAlignmentType(), Align));
}
@@ -2099,6 +2103,7 @@ class AddressingModeMatcher {
SmallVectorImpl<Instruction*> &AddrModeInsts;
const TargetMachine &TM;
const TargetLowering &TLI;
+ const DataLayout &DL;
/// AccessTy/MemoryInst - This is the type for the access (e.g. double) and
/// the memory instruction that we're computing this address for.
@@ -2131,8 +2136,9 @@ class AddressingModeMatcher {
: AddrModeInsts(AMI), TM(TM),
TLI(*TM.getSubtargetImpl(*MI->getParent()->getParent())
->getTargetLowering()),
- AccessTy(AT), AddrSpace(AS), MemoryInst(MI), AddrMode(AM),
- InsertedInsts(InsertedInsts), PromotedInsts(PromotedInsts), TPT(TPT) {
+ DL(MI->getModule()->getDataLayout()), AccessTy(AT), AddrSpace(AS),
+ MemoryInst(MI), AddrMode(AM), InsertedInsts(InsertedInsts),
+ PromotedInsts(PromotedInsts), TPT(TPT) {
IgnoreProfitability = false;
}
public:
@@ -2199,7 +2205,7 @@ bool AddressingModeMatcher::MatchScaledValue(Value *ScaleReg, int64_t Scale,
TestAddrMode.ScaledReg = ScaleReg;
// If the new address isn't legal, bail out.
- if (!TLI.isLegalAddressingMode(TestAddrMode, AccessTy, AddrSpace))
+ if (!TLI.isLegalAddressingMode(DL, TestAddrMode, AccessTy, AddrSpace))
return false;
// It was legal, so commit it.
@@ -2216,7 +2222,7 @@ bool AddressingModeMatcher::MatchScaledValue(Value *ScaleReg, int64_t Scale,
// If this addressing mode is legal, commit it and remember that we folded
// this instruction.
- if (TLI.isLegalAddressingMode(TestAddrMode, AccessTy, AddrSpace)) {
+ if (TLI.isLegalAddressingMode(DL, TestAddrMode, AccessTy, AddrSpace)) {
AddrModeInsts.push_back(cast<Instruction>(ScaleReg));
AddrMode = TestAddrMode;
return true;
@@ -2262,7 +2268,8 @@ static bool MightBeFoldableInst(Instruction *I) {
/// \note \p Val is assumed to be the product of some type promotion.
/// Therefore if \p Val has an undefined state in \p TLI, this is assumed
/// to be legal, as the non-promoted value would have had the same state.
-static bool isPromotedInstructionLegal(const TargetLowering &TLI, Value *Val) {
+static bool isPromotedInstructionLegal(const TargetLowering &TLI,
+ const DataLayout &DL, Value *Val) {
Instruction *PromotedInst = dyn_cast<Instruction>(Val);
if (!PromotedInst)
return false;
@@ -2272,7 +2279,7 @@ static bool isPromotedInstructionLegal(const TargetLowering &TLI, Value *Val) {
return true;
// Otherwise, check if the promoted instruction is legal or not.
return TLI.isOperationLegalOrCustom(
- ISDOpcode, TLI.getValueType(PromotedInst->getType()));
+ ISDOpcode, TLI.getValueType(DL, PromotedInst->getType()));
}
/// \brief Helper class to perform type promotion.
@@ -2646,7 +2653,7 @@ bool AddressingModeMatcher::IsPromotionProfitable(
// The promotion is neutral but it may help folding the sign extension in
// loads for instance.
// Check that we did not create an illegal instruction.
- return isPromotedInstructionLegal(TLI, PromotedOperand);
+ return isPromotedInstructionLegal(TLI, DL, PromotedOperand);
}
/// MatchOperationAddr - Given an instruction or constant expr, see if we can
@@ -2674,12 +2681,14 @@ bool AddressingModeMatcher::MatchOperationAddr(User *AddrInst, unsigned Opcode,
case Instruction::PtrToInt:
// PtrToInt is always a noop, as we know that the int type is pointer sized.
return MatchAddr(AddrInst->getOperand(0), Depth);
- case Instruction::IntToPtr:
+ case Instruction::IntToPtr: {
+ auto AS = AddrInst->getType()->getPointerAddressSpace();
+ auto PtrTy = MVT::getIntegerVT(DL.getPointerSizeInBits(AS));
// This inttoptr is a no-op if the integer type is pointer sized.
- if (TLI.getValueType(AddrInst->getOperand(0)->getType()) ==
- TLI.getPointerTy(AddrInst->getType()->getPointerAddressSpace()))
+ if (TLI.getValueType(DL, AddrInst->getOperand(0)->getType()) == PtrTy)
return MatchAddr(AddrInst->getOperand(0), Depth);
return false;
+ }
case Instruction::BitCast:
// BitCast is always a noop, and we can handle it as long as it is
// int->int or pointer->pointer (we don't want int<->fp or something).
@@ -2752,16 +2761,15 @@ bool AddressingModeMatcher::MatchOperationAddr(User *AddrInst, unsigned Opcode,
unsigned VariableScale = 0;
int64_t ConstantOffset = 0;
- const DataLayout *TD = TLI.getDataLayout();
gep_type_iterator GTI = gep_type_begin(AddrInst);
for (unsigned i = 1, e = AddrInst->getNumOperands(); i != e; ++i, ++GTI) {
if (StructType *STy = dyn_cast<StructType>(*GTI)) {
- const StructLayout *SL = TD->getStructLayout(STy);
+ const StructLayout *SL = DL.getStructLayout(STy);
unsigned Idx =
cast<ConstantInt>(AddrInst->getOperand(i))->getZExtValue();
ConstantOffset += SL->getElementOffset(Idx);
} else {
- uint64_t TypeSize = TD->getTypeAllocSize(GTI.getIndexedType());
+ uint64_t TypeSize = DL.getTypeAllocSize(GTI.getIndexedType());
if (ConstantInt *CI = dyn_cast<ConstantInt>(AddrInst->getOperand(i))) {
ConstantOffset += CI->getSExtValue()*TypeSize;
} else if (TypeSize) { // Scales of zero don't do anything.
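
The addressing-mode hunk above keeps the existing GEP walk but sources all size queries from the threaded DataLayout: struct indices add the field offset from the struct layout, while other constant indices add the index times the allocation size of the indexed type. Below is a standalone model of that accumulation; GEPStep and StructInfo are illustrative stand-ins for the gep_type_iterator and StructLayout queries, not LLVM types:

    // Standalone model of the constant-offset accumulation in the hunk above.
    #include <cstdint>
    #include <vector>

    struct StructInfo { std::vector<uint64_t> FieldOffsets; };

    struct GEPStep {
      bool IsStructField;          // true: the index selects a struct field
      uint64_t Index;              // constant index value
      uint64_t ElementSize;        // alloc size of the indexed type (non-struct case)
      const StructInfo *Struct;    // layout of the struct (struct case)
    };

    int64_t accumulateConstantOffset(const std::vector<GEPStep> &Steps) {
      int64_t Offset = 0;
      for (const GEPStep &S : Steps)
        Offset += S.IsStructField ? int64_t(S.Struct->FieldOffsets[S.Index])
                                  : int64_t(S.Index * S.ElementSize);
      return Offset;
    }

    int main() {
      StructInfo SI{{0, 8, 16}};
      // Index 2 into an 8-byte element array, then field #1 of SI: 2*8 + 8 = 24.
      std::vector<GEPStep> Steps = {{false, 2, 8, nullptr}, {true, 1, 0, &SI}};
      return accumulateConstantOffset(Steps) == 24 ? 0 : 1;
    }
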
@@ -2781,7 +2789,7 @@ bool AddressingModeMatcher::MatchOperationAddr(User *AddrInst, unsigned Opcode,
if (VariableOperand == -1) {
AddrMode.BaseOffs += ConstantOffset;
if (ConstantOffset == 0 ||
- TLI.isLegalAddressingMode(AddrMode, AccessTy, AddrSpace)) {
+ TLI.isLegalAddressingMode(DL, AddrMode, AccessTy, AddrSpace)) {
// Check to see if we can fold the base pointer in too.
if (MatchAddr(AddrInst->getOperand(0), Depth+1))
return true;
@@ -2904,14 +2912,14 @@ bool AddressingModeMatcher::MatchAddr(Value *Addr, unsigned Depth) {
if (ConstantInt *CI = dyn_cast<ConstantInt>(Addr)) {
// Fold in immediates if legal for the target.
AddrMode.BaseOffs += CI->getSExtValue();
- if (TLI.isLegalAddressingMode(AddrMode, AccessTy, AddrSpace))
+ if (TLI.isLegalAddressingMode(DL, AddrMode, AccessTy, AddrSpace))
return true;
AddrMode.BaseOffs -= CI->getSExtValue();
} else if (GlobalValue *GV = dyn_cast<GlobalValue>(Addr)) {
// If this is a global variable, try to fold it into the addressing mode.
if (!AddrMode.BaseGV) {
AddrMode.BaseGV = GV;
- if (TLI.isLegalAddressingMode(AddrMode, AccessTy, AddrSpace))
+ if (TLI.isLegalAddressingMode(DL, AddrMode, AccessTy, AddrSpace))
return true;
AddrMode.BaseGV = nullptr;
}
@@ -2955,7 +2963,7 @@ bool AddressingModeMatcher::MatchAddr(Value *Addr, unsigned Depth) {
AddrMode.HasBaseReg = true;
AddrMode.BaseReg = Addr;
// Still check for legality in case the target supports [imm] but not [i+r].
- if (TLI.isLegalAddressingMode(AddrMode, AccessTy, AddrSpace))
+ if (TLI.isLegalAddressingMode(DL, AddrMode, AccessTy, AddrSpace))
return true;
AddrMode.HasBaseReg = false;
AddrMode.BaseReg = nullptr;
@@ -2965,7 +2973,7 @@ bool AddressingModeMatcher::MatchAddr(Value *Addr, unsigned Depth) {
if (AddrMode.Scale == 0) {
AddrMode.Scale = 1;
AddrMode.ScaledReg = Addr;
- if (TLI.isLegalAddressingMode(AddrMode, AccessTy, AddrSpace))
+ if (TLI.isLegalAddressingMode(DL, AddrMode, AccessTy, AddrSpace))
return true;
AddrMode.Scale = 0;
AddrMode.ScaledReg = nullptr;
@@ -2984,7 +2992,8 @@ static bool IsOperandAMemoryOperand(CallInst *CI, InlineAsm *IA, Value *OpVal,
const TargetLowering *TLI = TM.getSubtargetImpl(*F)->getTargetLowering();
const TargetRegisterInfo *TRI = TM.getSubtargetImpl(*F)->getRegisterInfo();
TargetLowering::AsmOperandInfoVector TargetConstraints =
- TLI->ParseConstraints(TRI, ImmutableCallSite(CI));
+ TLI->ParseConstraints(F->getParent()->getDataLayout(), TRI,
+ ImmutableCallSite(CI));
for (unsigned i = 0, e = TargetConstraints.size(); i != e; ++i) {
TargetLowering::AsmOperandInfo &OpInfo = TargetConstraints[i];
@@ -3324,7 +3333,7 @@ bool CodeGenPrepare::OptimizeMemoryInst(Instruction *MemoryInst, Value *Addr,
// prevents new inttoptr/ptrtoint pairs from degrading AA capabilities.
DEBUG(dbgs() << "CGP: SINKING nonlocal addrmode: " << AddrMode << " for "
<< *MemoryInst << "\n");
- Type *IntPtrTy = TLI->getDataLayout()->getIntPtrType(Addr->getType());
+ Type *IntPtrTy = DL->getIntPtrType(Addr->getType());
Value *ResultPtr = nullptr, *ResultIndex = nullptr;
// First, find the pointer.
@@ -3443,7 +3452,7 @@ bool CodeGenPrepare::OptimizeMemoryInst(Instruction *MemoryInst, Value *Addr,
} else {
DEBUG(dbgs() << "CGP: SINKING nonlocal addrmode: " << AddrMode << " for "
<< *MemoryInst << "\n");
- Type *IntPtrTy = TLI->getDataLayout()->getIntPtrType(Addr->getType());
+ Type *IntPtrTy = DL->getIntPtrType(Addr->getType());
Value *Result = nullptr;
// Start with the base register. Do this first so that subsequent address
@@ -3545,8 +3554,8 @@ bool CodeGenPrepare::OptimizeInlineAsmInst(CallInst *CS) {
const TargetRegisterInfo *TRI =
TM->getSubtargetImpl(*CS->getParent()->getParent())->getRegisterInfo();
- TargetLowering::AsmOperandInfoVector
- TargetConstraints = TLI->ParseConstraints(TRI, CS);
+ TargetLowering::AsmOperandInfoVector TargetConstraints =
+ TLI->ParseConstraints(*DL, TRI, CS);
unsigned ArgNo = 0;
for (unsigned i = 0, e = TargetConstraints.size(); i != e; ++i) {
TargetLowering::AsmOperandInfo &OpInfo = TargetConstraints[i];
@@ -3680,7 +3689,7 @@ bool CodeGenPrepare::ExtLdPromotion(TypePromotionTransaction &TPT,
TotalCreatedInstsCost -= ExtCost;
if (!StressExtLdPromotion &&
(TotalCreatedInstsCost > 1 ||
- !isPromotedInstructionLegal(*TLI, PromotedVal))) {
+ !isPromotedInstructionLegal(*TLI, *DL, PromotedVal))) {
// The promotion is not profitable, rollback to the previous state.
TPT.rollback(LastKnownGood);
continue;
@@ -3735,8 +3744,8 @@ bool CodeGenPrepare::MoveExtToFormExtLoad(Instruction *&I) {
if (!HasPromoted && LI->getParent() == I->getParent())
return false;
- EVT VT = TLI->getValueType(I->getType());
- EVT LoadVT = TLI->getValueType(LI->getType());
+ EVT VT = TLI->getValueType(*DL, I->getType());
+ EVT LoadVT = TLI->getValueType(*DL, LI->getType());
// If the load has other users and the truncate is not free, this probably
// isn't worthwhile.
@@ -4013,6 +4022,9 @@ namespace {
/// Assuming both extractelement and store can be combine, we get rid of the
/// transition.
class VectorPromoteHelper {
+ /// DataLayout associated with the current module.
+ const DataLayout &DL;
+
/// Used to perform some checks on the legality of vector operations.
const TargetLowering &TLI;
@@ -4086,7 +4098,8 @@ class VectorPromoteHelper {
unsigned Align = ST->getAlignment();
// Check if this store is supported.
if (!TLI.allowsMisalignedMemoryAccesses(
- TLI.getValueType(ST->getValueOperand()->getType()), AS, Align)) {
+ TLI.getValueType(DL, ST->getValueOperand()->getType()), AS,
+ Align)) {
// If this is not supported, there is no way we can combine
// the extract with the store.
return false;
@@ -4181,9 +4194,10 @@ class VectorPromoteHelper {
}
public:
- VectorPromoteHelper(const TargetLowering &TLI, const TargetTransformInfo &TTI,
- Instruction *Transition, unsigned CombineCost)
- : TLI(TLI), TTI(TTI), Transition(Transition),
+ VectorPromoteHelper(const DataLayout &DL, const TargetLowering &TLI,
+ const TargetTransformInfo &TTI, Instruction *Transition,
+ unsigned CombineCost)
+ : DL(DL), TLI(TLI), TTI(TTI), Transition(Transition),
StoreExtractCombineCost(CombineCost), CombineInst(nullptr) {
assert(Transition && "Do not know how to promote null");
}
@@ -4219,7 +4233,7 @@ public:
return false;
return StressStoreExtract ||
TLI.isOperationLegalOrCustom(
- ISDOpcode, TLI.getValueType(getTransitionType(), true));
+ ISDOpcode, TLI.getValueType(DL, getTransitionType(), true));
}
/// \brief Check whether or not \p Use can be combined
@@ -4323,7 +4337,7 @@ bool CodeGenPrepare::OptimizeExtractElementInst(Instruction *Inst) {
// we do not do that for now.
BasicBlock *Parent = Inst->getParent();
DEBUG(dbgs() << "Found an interesting transition: " << *Inst << '\n');
- VectorPromoteHelper VPH(*TLI, *TTI, Inst, CombineCost);
+ VectorPromoteHelper VPH(*DL, *TLI, *TTI, Inst, CombineCost);
// If the transition has more than one use, assume this is not going to be
// beneficial.
while (Inst->hasOneUse()) {
@@ -4368,8 +4382,7 @@ bool CodeGenPrepare::OptimizeInst(Instruction *I, bool& ModifiedDT) {
// It is possible for very late stage optimizations (such as SimplifyCFG)
// to introduce PHI nodes too late to be cleaned up. If we detect such a
// trivial PHI, go ahead and zap it here.
- const DataLayout &DL = I->getModule()->getDataLayout();
- if (Value *V = SimplifyInstruction(P, DL, TLInfo, nullptr)) {
+ if (Value *V = SimplifyInstruction(P, *DL, TLInfo, nullptr)) {
P->replaceAllUsesWith(V);
P->eraseFromParent();
++NumPHIsElim;
@@ -4388,15 +4401,16 @@ bool CodeGenPrepare::OptimizeInst(Instruction *I, bool& ModifiedDT) {
if (isa<Constant>(CI->getOperand(0)))
return false;
- if (TLI && OptimizeNoopCopyExpression(CI, *TLI))
+ if (TLI && OptimizeNoopCopyExpression(CI, *TLI, *DL))
return true;
if (isa<ZExtInst>(I) || isa<SExtInst>(I)) {
/// Sink a zext or sext into its user blocks if the target type doesn't
/// fit in one register
- if (TLI && TLI->getTypeAction(CI->getContext(),
- TLI->getValueType(CI->getType())) ==
- TargetLowering::TypeExpandInteger) {
+ if (TLI &&
+ TLI->getTypeAction(CI->getContext(),
+ TLI->getValueType(*DL, CI->getType())) ==
+ TargetLowering::TypeExpandInteger) {
return SinkCast(CI);
} else {
bool MadeChange = MoveExtToFormExtLoad(I);
@@ -4433,7 +4447,7 @@ bool CodeGenPrepare::OptimizeInst(Instruction *I, bool& ModifiedDT) {
BinOp->getOpcode() == Instruction::LShr)) {
ConstantInt *CI = dyn_cast<ConstantInt>(BinOp->getOperand(1));
if (TLI && CI && TLI->hasExtractBitsInsn())
- return OptimizeExtractBits(BinOp, CI, *TLI);
+ return OptimizeExtractBits(BinOp, CI, *TLI, *DL);
return false;
}
diff --git a/lib/CodeGen/DeadMachineInstructionElim.cpp b/lib/CodeGen/DeadMachineInstructionElim.cpp
index 963d573ea7f0..941129b5cc95 100644
--- a/lib/CodeGen/DeadMachineInstructionElim.cpp
+++ b/lib/CodeGen/DeadMachineInstructionElim.cpp
@@ -60,7 +60,7 @@ bool DeadMachineInstructionElim::isDead(const MachineInstr *MI) const {
return false;
// Don't delete frame allocation labels.
- if (MI->getOpcode() == TargetOpcode::FRAME_ALLOC)
+ if (MI->getOpcode() == TargetOpcode::LOCAL_ESCAPE)
return false;
// Don't delete instructions with side effects.
diff --git a/lib/CodeGen/ExecutionDepsFix.cpp b/lib/CodeGen/ExecutionDepsFix.cpp
index 5b09cf1a0fd7..201f9c150083 100644
--- a/lib/CodeGen/ExecutionDepsFix.cpp
+++ b/lib/CodeGen/ExecutionDepsFix.cpp
@@ -733,12 +733,14 @@ bool ExeDepsFix::runOnMachineFunction(MachineFunction &mf) {
// If no relevant registers are used in the function, we can skip it
// completely.
bool anyregs = false;
+ const MachineRegisterInfo &MRI = mf.getRegInfo();
for (TargetRegisterClass::const_iterator I = RC->begin(), E = RC->end();
- I != E; ++I)
- if (MF->getRegInfo().isPhysRegUsed(*I)) {
- anyregs = true;
- break;
- }
+ I != E && !anyregs; ++I)
+ for (MCRegAliasIterator AI(*I, TRI, true); AI.isValid(); ++AI)
+ if (!MRI.reg_nodbg_empty(*AI)) {
+ anyregs = true;
+ break;
+ }
if (!anyregs) return false;
// Initialize the AliasMap on the first use.
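The ExecutionDepsFix hunk replaces the removed isPhysRegUsed query with a scan of the def/use lists for the register and all of its aliases. A sketch of that test in isolation (the helper name and its placement are assumptions; the calls themselves appear in the hunk above):

#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/Target/TargetRegisterInfo.h"

// A physical register counts as "used" if it, or any register aliasing it,
// has at least one non-debug def or use recorded in MachineRegisterInfo.
static bool anyAliasUsed(unsigned Reg, const llvm::TargetRegisterInfo *TRI,
                         const llvm::MachineRegisterInfo &MRI) {
  for (llvm::MCRegAliasIterator AI(Reg, TRI, /*IncludeSelf=*/true);
       AI.isValid(); ++AI)
    if (!MRI.reg_nodbg_empty(*AI))
      return true;
  return false;
}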
diff --git a/lib/CodeGen/GlobalMerge.cpp b/lib/CodeGen/GlobalMerge.cpp
index 37b3bf17ed1f..6f9e8394081e 100644
--- a/lib/CodeGen/GlobalMerge.cpp
+++ b/lib/CodeGen/GlobalMerge.cpp
@@ -117,7 +117,6 @@ STATISTIC(NumMerged, "Number of globals merged");
namespace {
class GlobalMerge : public FunctionPass {
const TargetMachine *TM;
- const DataLayout *DL;
// FIXME: Infer the maximum possible offset depending on the actual users
// (these max offsets are different for the users inside Thumb or ARM
// functions), see the code that passes in the offset in the ARM backend
@@ -160,8 +159,8 @@ namespace {
explicit GlobalMerge(const TargetMachine *TM = nullptr,
unsigned MaximalOffset = 0,
bool OnlyOptimizeForSize = false)
- : FunctionPass(ID), TM(TM), DL(TM->getDataLayout()),
- MaxOffset(MaximalOffset), OnlyOptimizeForSize(OnlyOptimizeForSize) {
+ : FunctionPass(ID), TM(TM), MaxOffset(MaximalOffset),
+ OnlyOptimizeForSize(OnlyOptimizeForSize) {
initializeGlobalMergePass(*PassRegistry::getPassRegistry());
}
@@ -188,14 +187,16 @@ INITIALIZE_PASS_END(GlobalMerge, "global-merge", "Merge global variables",
bool GlobalMerge::doMerge(SmallVectorImpl<GlobalVariable*> &Globals,
Module &M, bool isConst, unsigned AddrSpace) const {
+ auto &DL = M.getDataLayout();
// FIXME: Find better heuristics
- std::stable_sort(Globals.begin(), Globals.end(),
- [this](const GlobalVariable *GV1, const GlobalVariable *GV2) {
- Type *Ty1 = cast<PointerType>(GV1->getType())->getElementType();
- Type *Ty2 = cast<PointerType>(GV2->getType())->getElementType();
+ std::stable_sort(
+ Globals.begin(), Globals.end(),
+ [&DL](const GlobalVariable *GV1, const GlobalVariable *GV2) {
+ Type *Ty1 = cast<PointerType>(GV1->getType())->getElementType();
+ Type *Ty2 = cast<PointerType>(GV2->getType())->getElementType();
- return (DL->getTypeAllocSize(Ty1) < DL->getTypeAllocSize(Ty2));
- });
+ return (DL.getTypeAllocSize(Ty1) < DL.getTypeAllocSize(Ty2));
+ });
// If we want to just blindly group all globals together, do so.
if (!GlobalMergeGroupByUse) {
@@ -410,6 +411,7 @@ bool GlobalMerge::doMerge(SmallVectorImpl<GlobalVariable *> &Globals,
unsigned AddrSpace) const {
Type *Int32Ty = Type::getInt32Ty(M.getContext());
+ auto &DL = M.getDataLayout();
assert(Globals.size() > 1);
@@ -427,7 +429,7 @@ bool GlobalMerge::doMerge(SmallVectorImpl<GlobalVariable *> &Globals,
GlobalVariable *TheFirstExternal = 0;
for (j = i; j != -1; j = GlobalSet.find_next(j)) {
Type *Ty = Globals[j]->getType()->getElementType();
- MergedSize += DL->getTypeAllocSize(Ty);
+ MergedSize += DL.getTypeAllocSize(Ty);
if (MergedSize > MaxOffset) {
break;
}
@@ -526,6 +528,7 @@ bool GlobalMerge::doInitialization(Module &M) {
if (!EnableGlobalMerge)
return false;
+ auto &DL = M.getDataLayout();
DenseMap<unsigned, SmallVector<GlobalVariable*, 16> > Globals, ConstGlobals,
BSSGlobals;
bool Changed = false;
@@ -548,9 +551,9 @@ bool GlobalMerge::doInitialization(Module &M) {
unsigned AddressSpace = PT->getAddressSpace();
// Ignore fancy-aligned globals for now.
- unsigned Alignment = DL->getPreferredAlignment(I);
+ unsigned Alignment = DL.getPreferredAlignment(I);
Type *Ty = I->getType()->getElementType();
- if (Alignment > DL->getABITypeAlignment(Ty))
+ if (Alignment > DL.getABITypeAlignment(Ty))
continue;
// Ignore all 'special' globals.
@@ -562,7 +565,7 @@ bool GlobalMerge::doInitialization(Module &M) {
if (isMustKeepGlobalVariable(I))
continue;
- if (DL->getTypeAllocSize(Ty) < MaxOffset) {
+ if (DL.getTypeAllocSize(Ty) < MaxOffset) {
if (TargetLoweringObjectFile::getKindForGlobal(I, *TM).isBSSLocal())
BSSGlobals[AddressSpace].push_back(I);
else if (I->isConstant())
diff --git a/lib/CodeGen/ImplicitNullChecks.cpp b/lib/CodeGen/ImplicitNullChecks.cpp
index a02cd67ac649..93e04876a8f3 100644
--- a/lib/CodeGen/ImplicitNullChecks.cpp
+++ b/lib/CodeGen/ImplicitNullChecks.cpp
@@ -25,9 +25,12 @@
//
//===----------------------------------------------------------------------===//
+#include "llvm/ADT/DenseSet.h"
#include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/Statistic.h"
#include "llvm/CodeGen/Passes.h"
#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineMemOperand.h"
#include "llvm/CodeGen/MachineOperand.h"
#include "llvm/CodeGen/MachineFunctionPass.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
@@ -47,6 +50,11 @@ static cl::opt<unsigned> PageSize("imp-null-check-page-size",
"bytes"),
cl::init(4096));
+#define DEBUG_TYPE "implicit-null-checks"
+
+STATISTIC(NumImplicitNullChecks,
+ "Number of explicit null checks made implicit");
+
namespace {
class ImplicitNullChecks : public MachineFunctionPass {
@@ -171,6 +179,9 @@ bool ImplicitNullChecks::analyzeBlockForNullChecks(
// callq throw_NullPointerException
//
// LblNotNull:
+ // Inst0
+ // Inst1
+ // ...
// Def = Load (%RAX + <offset>)
// ...
//
@@ -181,6 +192,8 @@ bool ImplicitNullChecks::analyzeBlockForNullChecks(
// jmp LblNotNull ;; explicit or fallthrough
//
// LblNotNull:
+ // Inst0
+ // Inst1
// ...
//
// LblNull:
@@ -188,15 +201,75 @@ bool ImplicitNullChecks::analyzeBlockForNullChecks(
//
unsigned PointerReg = MBP.LHS.getReg();
- MachineInstr *MemOp = &*NotNullSucc->begin();
- unsigned BaseReg, Offset;
- if (TII->getMemOpBaseRegImmOfs(MemOp, BaseReg, Offset, TRI))
- if (MemOp->mayLoad() && !MemOp->isPredicable() && BaseReg == PointerReg &&
- Offset < PageSize && MemOp->getDesc().getNumDefs() == 1) {
- NullCheckList.emplace_back(MemOp, MBP.ConditionDef, &MBB, NotNullSucc,
- NullSucc);
- return true;
+
+ // As we scan NotNullSucc for a suitable load instruction, we keep track of
+ // the registers defined and used by the instructions we scan past. This bit
+ // of information lets us decide if it is legal to hoist the load instruction
+ // we find (if we do find such an instruction) to before NotNullSucc.
+ DenseSet<unsigned> RegDefs, RegUses;
+
+ // Returns true if it is safe to reorder MI to before NotNullSucc.
+ auto IsSafeToHoist = [&](MachineInstr *MI) {
+ // Right now we don't want to worry about LLVM's memory model. This can be
+ // made more precise later.
+ for (auto *MMO : MI->memoperands())
+ if (!MMO->isUnordered())
+ return false;
+
+ for (auto &MO : MI->operands()) {
+ if (MO.isReg() && MO.getReg()) {
+ for (unsigned Reg : RegDefs)
+ if (TRI->regsOverlap(Reg, MO.getReg()))
+ return false; // We found a write-after-write or read-after-write
+
+ if (MO.isDef())
+ for (unsigned Reg : RegUses)
+ if (TRI->regsOverlap(Reg, MO.getReg()))
+ return false; // We found a write-after-read
+ }
+ }
+
+ return true;
+ };
+
+ for (auto MII = NotNullSucc->begin(), MIE = NotNullSucc->end(); MII != MIE;
+ ++MII) {
+ MachineInstr *MI = &*MII;
+ unsigned BaseReg, Offset;
+ if (TII->getMemOpBaseRegImmOfs(MI, BaseReg, Offset, TRI))
+ if (MI->mayLoad() && !MI->isPredicable() && BaseReg == PointerReg &&
+ Offset < PageSize && MI->getDesc().getNumDefs() == 1 &&
+ IsSafeToHoist(MI)) {
+ NullCheckList.emplace_back(MI, MBP.ConditionDef, &MBB, NotNullSucc,
+ NullSucc);
+ return true;
+ }
+
+ // MI did not match our criteria for conversion to a trapping load. Check
+ // if we can continue looking.
+
+ if (MI->mayStore() || MI->hasUnmodeledSideEffects())
+ return false;
+
+ for (auto *MMO : MI->memoperands())
+ // Right now we don't want to worry about LLVM's memory model.
+ if (!MMO->isUnordered())
+ return false;
+
+ // It _may_ be okay to reorder a later load instruction across MI. Make a
+ // note of its operands so that we can make the legality check if we find a
+ // suitable load instruction:
+
+ for (auto &MO : MI->operands()) {
+ if (!MO.isReg() || !MO.getReg())
+ continue;
+
+ if (MO.isDef())
+ RegDefs.insert(MO.getReg());
+ else
+ RegUses.insert(MO.getReg());
}
+ }
return false;
}
@@ -247,7 +320,7 @@ void ImplicitNullChecks::rewriteNullChecks(
// touch the successors list for any basic block since we haven't changed
// control flow, we've just made it implicit.
insertFaultingLoad(NC.MemOperation, NC.CheckBlock, HandlerLabel);
- NC.MemOperation->removeFromParent();
+ NC.MemOperation->eraseFromParent();
NC.CheckOperation->eraseFromParent();
// Insert an *unconditional* branch to not-null successor.
@@ -257,6 +330,8 @@ void ImplicitNullChecks::rewriteNullChecks(
// Emit the HandlerLabel as an EH_LABEL.
BuildMI(*NC.NullSucc, NC.NullSucc->begin(), DL,
TII->get(TargetOpcode::EH_LABEL)).addSym(HandlerLabel);
+
+ NumImplicitNullChecks++;
}
}
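The ImplicitNullChecks change scans past instructions in NotNullSucc, recording their defs and uses, and only hoists a later load if doing so creates no register hazard. A simplified, self-contained model of that legality check (plain containers, exact register matching instead of regsOverlap, and no memory-ordering concerns):

#include <set>
#include <vector>

struct OperandModel {
  unsigned Reg;
  bool IsDef;
};

// Returns true when the candidate load's operands conflict with nothing we
// scanned past: no read or write of a defined register (RAW/WAW) and no write
// of a used register (WAR).
bool isSafeToHoistModel(const std::vector<OperandModel> &LoadOps,
                        const std::set<unsigned> &RegDefs,
                        const std::set<unsigned> &RegUses) {
  for (const OperandModel &MO : LoadOps) {
    if (RegDefs.count(MO.Reg))
      return false; // write-after-write or read-after-write
    if (MO.IsDef && RegUses.count(MO.Reg))
      return false; // write-after-read
  }
  return true;
}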
diff --git a/lib/CodeGen/LLVMTargetMachine.cpp b/lib/CodeGen/LLVMTargetMachine.cpp
index b486bdc91453..37299eb664cf 100644
--- a/lib/CodeGen/LLVMTargetMachine.cpp
+++ b/lib/CodeGen/LLVMTargetMachine.cpp
@@ -90,8 +90,8 @@ TargetIRAnalysis LLVMTargetMachine::getTargetIRAnalysis() {
/// addPassesToX helper drives creation and initialization of TargetPassConfig.
static MCContext *
addPassesToGenerateCode(LLVMTargetMachine *TM, PassManagerBase &PM,
- bool DisableVerify, AnalysisID StartAfter,
- AnalysisID StopAfter,
+ bool DisableVerify, AnalysisID StartBefore,
+ AnalysisID StartAfter, AnalysisID StopAfter,
MachineFunctionInitializer *MFInitializer = nullptr) {
// Add internal analysis passes from the target machine.
@@ -100,7 +100,7 @@ addPassesToGenerateCode(LLVMTargetMachine *TM, PassManagerBase &PM,
// Targets may override createPassConfig to provide a target-specific
// subclass.
TargetPassConfig *PassConfig = TM->createPassConfig(PM);
- PassConfig->setStartStopPasses(StartAfter, StopAfter);
+ PassConfig->setStartStopPasses(StartBefore, StartAfter, StopAfter);
// Set PassConfig options provided by TargetMachine.
PassConfig->setDisableVerify(DisableVerify);
@@ -143,11 +143,12 @@ addPassesToGenerateCode(LLVMTargetMachine *TM, PassManagerBase &PM,
bool LLVMTargetMachine::addPassesToEmitFile(
PassManagerBase &PM, raw_pwrite_stream &Out, CodeGenFileType FileType,
- bool DisableVerify, AnalysisID StartAfter, AnalysisID StopAfter,
- MachineFunctionInitializer *MFInitializer) {
+ bool DisableVerify, AnalysisID StartBefore, AnalysisID StartAfter,
+ AnalysisID StopAfter, MachineFunctionInitializer *MFInitializer) {
// Add common CodeGen passes.
- MCContext *Context = addPassesToGenerateCode(
- this, PM, DisableVerify, StartAfter, StopAfter, MFInitializer);
+ MCContext *Context =
+ addPassesToGenerateCode(this, PM, DisableVerify, StartBefore, StartAfter,
+ StopAfter, MFInitializer);
if (!Context)
return true;
@@ -231,7 +232,8 @@ bool LLVMTargetMachine::addPassesToEmitMC(PassManagerBase &PM, MCContext *&Ctx,
raw_pwrite_stream &Out,
bool DisableVerify) {
// Add common CodeGen passes.
- Ctx = addPassesToGenerateCode(this, PM, DisableVerify, nullptr, nullptr);
+ Ctx = addPassesToGenerateCode(this, PM, DisableVerify, nullptr, nullptr,
+ nullptr);
if (!Ctx)
return true;
diff --git a/lib/CodeGen/LiveRegMatrix.cpp b/lib/CodeGen/LiveRegMatrix.cpp
index 154ce6fc122b..000151acd735 100644
--- a/lib/CodeGen/LiveRegMatrix.cpp
+++ b/lib/CodeGen/LiveRegMatrix.cpp
@@ -15,12 +15,12 @@
#include "RegisterCoalescer.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/CodeGen/LiveIntervalAnalysis.h"
-#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/CodeGen/VirtRegMap.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/Format.h"
#include "llvm/Support/raw_ostream.h"
#include "llvm/Target/TargetRegisterInfo.h"
+#include "llvm/Target/TargetSubtargetInfo.h"
using namespace llvm;
@@ -49,7 +49,6 @@ void LiveRegMatrix::getAnalysisUsage(AnalysisUsage &AU) const {
bool LiveRegMatrix::runOnMachineFunction(MachineFunction &MF) {
TRI = MF.getSubtarget().getRegisterInfo();
- MRI = &MF.getRegInfo();
LIS = &getAnalysis<LiveIntervals>();
VRM = &getAnalysis<VirtRegMap>();
@@ -101,7 +100,6 @@ void LiveRegMatrix::assign(LiveInterval &VirtReg, unsigned PhysReg) {
<< " to " << PrintReg(PhysReg, TRI) << ':');
assert(!VRM->hasPhys(VirtReg.reg) && "Duplicate VirtReg assignment");
VRM->assignVirt2Phys(VirtReg.reg, PhysReg);
- MRI->setPhysRegUsed(PhysReg);
foreachUnit(TRI, VirtReg, PhysReg, [&](unsigned Unit,
const LiveRange &Range) {
@@ -131,6 +129,14 @@ void LiveRegMatrix::unassign(LiveInterval &VirtReg) {
DEBUG(dbgs() << '\n');
}
+bool LiveRegMatrix::isPhysRegUsed(unsigned PhysReg) const {
+ for (MCRegUnitIterator Unit(PhysReg, TRI); Unit.isValid(); ++Unit) {
+ if (!Matrix[*Unit].empty())
+ return true;
+ }
+ return false;
+}
+
bool LiveRegMatrix::checkRegMaskInterference(LiveInterval &VirtReg,
unsigned PhysReg) {
// Check if the cached information is valid.
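The new LiveRegMatrix::isPhysRegUsed answers the query from the interference matrix itself: a physical register is in use exactly when one of its register units carries an assigned live range. A stand-alone model with assumed container shapes:

#include <map>
#include <vector>

// One entry per register unit; non-empty means some virtual register is
// currently assigned to that unit.
using LiveUnionModel = std::vector<unsigned>;

bool isPhysRegUsedModel(const std::vector<unsigned> &UnitsOfPhysReg,
                        const std::map<unsigned, LiveUnionModel> &Matrix) {
  for (unsigned Unit : UnitsOfPhysReg) {
    auto It = Matrix.find(Unit);
    if (It != Matrix.end() && !It->second.empty())
      return true;
  }
  return false;
}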
diff --git a/lib/CodeGen/MIRParser/MILexer.cpp b/lib/CodeGen/MIRParser/MILexer.cpp
index e9b3916a11fa..482c33ae2235 100644
--- a/lib/CodeGen/MIRParser/MILexer.cpp
+++ b/lib/CodeGen/MIRParser/MILexer.cpp
@@ -12,6 +12,7 @@
//===----------------------------------------------------------------------===//
#include "MILexer.h"
+#include "llvm/ADT/StringSwitch.h"
#include "llvm/ADT/Twine.h"
#include <cctype>
@@ -64,6 +65,17 @@ static bool isIdentifierChar(char C) {
return isalpha(C) || isdigit(C) || C == '_' || C == '-' || C == '.';
}
+static MIToken::TokenKind getIdentifierKind(StringRef Identifier) {
+ return StringSwitch<MIToken::TokenKind>(Identifier)
+ .Case("_", MIToken::underscore)
+ .Case("implicit", MIToken::kw_implicit)
+ .Case("implicit-def", MIToken::kw_implicit_define)
+ .Case("dead", MIToken::kw_dead)
+ .Case("killed", MIToken::kw_killed)
+ .Case("undef", MIToken::kw_undef)
+ .Default(MIToken::Identifier);
+}
+
static Cursor maybeLexIdentifier(Cursor C, MIToken &Token) {
if (!isalpha(C.peek()) && C.peek() != '_')
return None;
@@ -71,8 +83,7 @@ static Cursor maybeLexIdentifier(Cursor C, MIToken &Token) {
while (isIdentifierChar(C.peek()))
C.advance();
auto Identifier = Range.upto(C);
- Token = MIToken(Identifier == "_" ? MIToken::underscore : MIToken::Identifier,
- Identifier);
+ Token = MIToken(getIdentifierKind(Identifier), Identifier);
return C;
}
@@ -104,9 +115,22 @@ static Cursor maybeLexMachineBasicBlock(
return C;
}
+static Cursor lexVirtualRegister(Cursor C, MIToken &Token) {
+ auto Range = C;
+ C.advance(); // Skip '%'
+ auto NumberRange = C;
+ while (isdigit(C.peek()))
+ C.advance();
+ Token = MIToken(MIToken::VirtualRegister, Range.upto(C),
+ APSInt(NumberRange.upto(C)));
+ return C;
+}
+
static Cursor maybeLexRegister(Cursor C, MIToken &Token) {
if (C.peek() != '%')
return None;
+ if (isdigit(C.peek(1)))
+ return lexVirtualRegister(C, Token);
auto Range = C;
C.advance(); // Skip '%'
while (isIdentifierChar(C.peek()))
@@ -155,6 +179,8 @@ static MIToken::TokenKind symbolToken(char C) {
return MIToken::comma;
case '=':
return MIToken::equal;
+ case ':':
+ return MIToken::colon;
default:
return MIToken::Error;
}
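getIdentifierKind above classifies an identifier with StringSwitch, so each new keyword costs one extra Case line. An illustrative fragment using the same idiom (the enum and function names here are placeholders, not the MIToken kinds):

#include "llvm/ADT/StringRef.h"
#include "llvm/ADT/StringSwitch.h"

enum class TokenKindSketch { Underscore, Implicit, Identifier };

TokenKindSketch classify(llvm::StringRef Word) {
  return llvm::StringSwitch<TokenKindSketch>(Word)
      .Case("_", TokenKindSketch::Underscore)
      .Case("implicit", TokenKindSketch::Implicit)
      .Default(TokenKindSketch::Identifier);
}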
diff --git a/lib/CodeGen/MIRParser/MILexer.h b/lib/CodeGen/MIRParser/MILexer.h
index c28935f38909..55460b56e7d6 100644
--- a/lib/CodeGen/MIRParser/MILexer.h
+++ b/lib/CodeGen/MIRParser/MILexer.h
@@ -35,6 +35,14 @@ struct MIToken {
comma,
equal,
underscore,
+ colon,
+
+ // Keywords
+ kw_implicit,
+ kw_implicit_define,
+ kw_dead,
+ kw_killed,
+ kw_undef,
// Identifier tokens
Identifier,
@@ -44,7 +52,8 @@ struct MIToken {
GlobalValue,
// Other tokens
- IntegerLiteral
+ IntegerLiteral,
+ VirtualRegister
};
private:
@@ -66,7 +75,13 @@ public:
bool isError() const { return Kind == Error; }
bool isRegister() const {
- return Kind == NamedRegister || Kind == underscore;
+ return Kind == NamedRegister || Kind == underscore ||
+ Kind == VirtualRegister;
+ }
+
+ bool isRegisterFlag() const {
+ return Kind == kw_implicit || Kind == kw_implicit_define ||
+ Kind == kw_dead || Kind == kw_killed || Kind == kw_undef;
}
bool is(TokenKind K) const { return Kind == K; }
@@ -81,7 +96,7 @@ public:
bool hasIntegerValue() const {
return Kind == IntegerLiteral || Kind == MachineBasicBlock ||
- Kind == GlobalValue;
+ Kind == GlobalValue || Kind == VirtualRegister;
}
};
diff --git a/lib/CodeGen/MIRParser/MIParser.cpp b/lib/CodeGen/MIRParser/MIParser.cpp
index b618e53b8e43..c00011288a60 100644
--- a/lib/CodeGen/MIRParser/MIParser.cpp
+++ b/lib/CodeGen/MIRParser/MIParser.cpp
@@ -18,6 +18,7 @@
#include "llvm/CodeGen/MachineBasicBlock.h"
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineInstr.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/IR/Module.h"
#include "llvm/Support/raw_ostream.h"
#include "llvm/Support/SourceMgr.h"
@@ -28,14 +29,25 @@ using namespace llvm;
namespace {
+/// A wrapper struct around the 'MachineOperand' struct that includes a source
+/// range.
+struct MachineOperandWithLocation {
+ MachineOperand Operand;
+ StringRef::iterator Begin;
+ StringRef::iterator End;
+
+ MachineOperandWithLocation(const MachineOperand &Operand,
+ StringRef::iterator Begin, StringRef::iterator End)
+ : Operand(Operand), Begin(Begin), End(End) {}
+};
+
class MIParser {
SourceMgr &SM;
MachineFunction &MF;
SMDiagnostic &Error;
StringRef Source, CurrentSource;
MIToken Token;
- /// Maps from basic block numbers to MBBs.
- const DenseMap<unsigned, MachineBasicBlock *> &MBBSlots;
+ const PerFunctionMIParsingState &PFS;
/// Maps from indices to unnamed global values and metadata nodes.
const SlotMapping &IRSlots;
/// Maps from instruction names to op codes.
@@ -44,11 +56,12 @@ class MIParser {
StringMap<unsigned> Names2Regs;
/// Maps from register mask names to register masks.
StringMap<const uint32_t *> Names2RegMasks;
+ /// Maps from subregister names to subregister indices.
+ StringMap<unsigned> Names2SubRegIndices;
public:
MIParser(SourceMgr &SM, MachineFunction &MF, SMDiagnostic &Error,
- StringRef Source,
- const DenseMap<unsigned, MachineBasicBlock *> &MBBSlots,
+ StringRef Source, const PerFunctionMIParsingState &PFS,
const SlotMapping &IRSlots);
void lex();
@@ -65,8 +78,11 @@ public:
bool parse(MachineInstr *&MI);
bool parseMBB(MachineBasicBlock *&MBB);
+ bool parseNamedRegister(unsigned &Reg);
bool parseRegister(unsigned &Reg);
+ bool parseRegisterFlag(unsigned &Flags);
+ bool parseSubRegisterIndex(unsigned &SubReg);
bool parseRegisterOperand(MachineOperand &Dest, bool IsDef = false);
bool parseImmediateOperand(MachineOperand &Dest);
bool parseMBBReference(MachineBasicBlock *&MBB);
@@ -88,6 +104,9 @@ private:
bool parseInstruction(unsigned &OpCode);
+ bool verifyImplicitOperands(ArrayRef<MachineOperandWithLocation> Operands,
+ const MCInstrDesc &MCID);
+
void initNames2Regs();
/// Try to convert a register name to a register number. Return true if the
@@ -100,17 +119,22 @@ private:
///
/// Return null if the identifier isn't a register mask.
const uint32_t *getRegMask(StringRef Identifier);
+
+ void initNames2SubRegIndices();
+
+ /// Check if the given identifier is a name of a subregister index.
+ ///
+ /// Return 0 if the name isn't a subregister index.
+ unsigned getSubRegIndex(StringRef Name);
};
} // end anonymous namespace
MIParser::MIParser(SourceMgr &SM, MachineFunction &MF, SMDiagnostic &Error,
- StringRef Source,
- const DenseMap<unsigned, MachineBasicBlock *> &MBBSlots,
+ StringRef Source, const PerFunctionMIParsingState &PFS,
const SlotMapping &IRSlots)
: SM(SM), MF(MF), Error(Error), Source(Source), CurrentSource(Source),
- Token(MIToken::Error, StringRef()), MBBSlots(MBBSlots), IRSlots(IRSlots) {
-}
+ Token(MIToken::Error, StringRef()), PFS(PFS), IRSlots(IRSlots) {}
void MIParser::lex() {
CurrentSource = lexMIToken(
@@ -121,8 +145,6 @@ void MIParser::lex() {
bool MIParser::error(const Twine &Msg) { return error(Token.location(), Msg); }
bool MIParser::error(StringRef::iterator Loc, const Twine &Msg) {
- // TODO: Get the proper location in the MIR file, not just a location inside
- // the string.
assert(Loc >= Source.data() && Loc <= (Source.data() + Source.size()));
Error = SMDiagnostic(
SM, SMLoc(),
@@ -137,11 +159,12 @@ bool MIParser::parse(MachineInstr *&MI) {
// Parse any register operands before '='
// TODO: Allow parsing of multiple operands before '='
MachineOperand MO = MachineOperand::CreateImm(0);
- SmallVector<MachineOperand, 8> Operands;
- if (Token.isRegister()) {
+ SmallVector<MachineOperandWithLocation, 8> Operands;
+ if (Token.isRegister() || Token.isRegisterFlag()) {
+ auto Loc = Token.location();
if (parseRegisterOperand(MO, /*IsDef=*/true))
return true;
- Operands.push_back(MO);
+ Operands.push_back(MachineOperandWithLocation(MO, Loc, Token.location()));
if (Token.isNot(MIToken::equal))
return error("expected '='");
lex();
@@ -155,9 +178,10 @@ bool MIParser::parse(MachineInstr *&MI) {
// Parse the remaining machine operands.
while (Token.isNot(MIToken::Eof)) {
+ auto Loc = Token.location();
if (parseMachineOperand(MO))
return true;
- Operands.push_back(MO);
+ Operands.push_back(MachineOperandWithLocation(MO, Loc, Token.location()));
if (Token.is(MIToken::Eof))
break;
if (Token.isNot(MIToken::comma))
@@ -166,25 +190,16 @@ bool MIParser::parse(MachineInstr *&MI) {
}
const auto &MCID = MF.getSubtarget().getInstrInfo()->get(OpCode);
-
- // Verify machine operands.
if (!MCID.isVariadic()) {
- for (size_t I = 0, E = Operands.size(); I < E; ++I) {
- if (I < MCID.getNumOperands())
- continue;
- // Mark this register as implicit to prevent an assertion when it's added
- // to an instruction. This is a temporary workaround until the implicit
- // register flag can be parsed.
- if (Operands[I].isReg())
- Operands[I].setImplicit();
- }
+ // FIXME: Move the implicit operand verification to the machine verifier.
+ if (verifyImplicitOperands(Operands, MCID))
+ return true;
}
- // TODO: Determine the implicit behaviour when implicit register flags are
- // parsed.
+ // TODO: Check for extraneous machine operands.
MI = MF.CreateMachineInstr(MCID, DebugLoc(), /*NoImplicit=*/true);
for (const auto &Operand : Operands)
- MI->addOperand(MF, Operand);
+ MI->addOperand(MF, Operand.Operand);
return false;
}
@@ -201,6 +216,80 @@ bool MIParser::parseMBB(MachineBasicBlock *&MBB) {
return false;
}
+bool MIParser::parseNamedRegister(unsigned &Reg) {
+ lex();
+ if (Token.isNot(MIToken::NamedRegister))
+ return error("expected a named register");
+ if (parseRegister(Reg))
+ return true;
+ lex();
+ if (Token.isNot(MIToken::Eof))
+ return error("expected end of string after the register reference");
+ return false;
+}
+
+static const char *printImplicitRegisterFlag(const MachineOperand &MO) {
+ assert(MO.isImplicit());
+ return MO.isDef() ? "implicit-def" : "implicit";
+}
+
+static std::string getRegisterName(const TargetRegisterInfo *TRI,
+ unsigned Reg) {
+ assert(TargetRegisterInfo::isPhysicalRegister(Reg) && "expected phys reg");
+ return StringRef(TRI->getName(Reg)).lower();
+}
+
+bool MIParser::verifyImplicitOperands(
+ ArrayRef<MachineOperandWithLocation> Operands, const MCInstrDesc &MCID) {
+ if (MCID.isCall())
+ // We can't verify call instructions as they can contain arbitrary implicit
+ // register and register mask operands.
+ return false;
+
+ // Gather all the expected implicit operands.
+ SmallVector<MachineOperand, 4> ImplicitOperands;
+ if (MCID.ImplicitDefs)
+ for (const uint16_t *ImpDefs = MCID.getImplicitDefs(); *ImpDefs; ++ImpDefs)
+ ImplicitOperands.push_back(
+ MachineOperand::CreateReg(*ImpDefs, true, true));
+ if (MCID.ImplicitUses)
+ for (const uint16_t *ImpUses = MCID.getImplicitUses(); *ImpUses; ++ImpUses)
+ ImplicitOperands.push_back(
+ MachineOperand::CreateReg(*ImpUses, false, true));
+
+ const auto *TRI = MF.getSubtarget().getRegisterInfo();
+ assert(TRI && "Expected target register info");
+ size_t I = ImplicitOperands.size(), J = Operands.size();
+ while (I) {
+ --I;
+ if (J) {
+ --J;
+ const auto &ImplicitOperand = ImplicitOperands[I];
+ const auto &Operand = Operands[J].Operand;
+ if (ImplicitOperand.isIdenticalTo(Operand))
+ continue;
+ if (Operand.isReg() && Operand.isImplicit()) {
+ return error(Operands[J].Begin,
+ Twine("expected an implicit register operand '") +
+ printImplicitRegisterFlag(ImplicitOperand) + " %" +
+ getRegisterName(TRI, ImplicitOperand.getReg()) + "'");
+ }
+ }
+ // TODO: Fix source location when Operands[J].End is right before '=', i.e.:
+ // instead of reporting an error at this location:
+ // %eax = MOV32r0
+ // ^
+ // report the error at the following location:
+ // %eax = MOV32r0
+ // ^
+ return error(J < Operands.size() ? Operands[J].End : Token.location(),
+ Twine("missing implicit register operand '") +
+ printImplicitRegisterFlag(ImplicitOperands[I]) + " %" +
+ getRegisterName(TRI, ImplicitOperands[I].getReg()) + "'");
+ }
+ return false;
+}
+
bool MIParser::parseInstruction(unsigned &OpCode) {
if (Token.isNot(MIToken::Identifier))
return error("expected a machine instruction");
@@ -222,6 +311,17 @@ bool MIParser::parseRegister(unsigned &Reg) {
return error(Twine("unknown register name '") + Name + "'");
break;
}
+ case MIToken::VirtualRegister: {
+ unsigned ID;
+ if (getUnsigned(ID))
+ return true;
+ const auto RegInfo = PFS.VirtualRegisterSlots.find(ID);
+ if (RegInfo == PFS.VirtualRegisterSlots.end())
+ return error(Twine("use of undefined virtual register '%") + Twine(ID) +
+ "'");
+ Reg = RegInfo->second;
+ break;
+ }
// TODO: Parse other register kinds.
default:
llvm_unreachable("The current token should be a register");
@@ -229,14 +329,66 @@ bool MIParser::parseRegister(unsigned &Reg) {
return false;
}
+bool MIParser::parseRegisterFlag(unsigned &Flags) {
+ switch (Token.kind()) {
+ case MIToken::kw_implicit:
+ Flags |= RegState::Implicit;
+ break;
+ case MIToken::kw_implicit_define:
+ Flags |= RegState::ImplicitDefine;
+ break;
+ case MIToken::kw_dead:
+ Flags |= RegState::Dead;
+ break;
+ case MIToken::kw_killed:
+ Flags |= RegState::Kill;
+ break;
+ case MIToken::kw_undef:
+ Flags |= RegState::Undef;
+ break;
+ // TODO: report an error when we specify the same flag more than once.
+ // TODO: parse the other register flags.
+ default:
+ llvm_unreachable("The current token should be a register flag");
+ }
+ lex();
+ return false;
+}
+
+bool MIParser::parseSubRegisterIndex(unsigned &SubReg) {
+ assert(Token.is(MIToken::colon));
+ lex();
+ if (Token.isNot(MIToken::Identifier))
+ return error("expected a subregister index after ':'");
+ auto Name = Token.stringValue();
+ SubReg = getSubRegIndex(Name);
+ if (!SubReg)
+ return error(Twine("use of unknown subregister index '") + Name + "'");
+ lex();
+ return false;
+}
+
bool MIParser::parseRegisterOperand(MachineOperand &Dest, bool IsDef) {
unsigned Reg;
- // TODO: Parse register flags.
+ unsigned Flags = IsDef ? RegState::Define : 0;
+ while (Token.isRegisterFlag()) {
+ if (parseRegisterFlag(Flags))
+ return true;
+ }
+ if (!Token.isRegister())
+ return error("expected a register after register flags");
if (parseRegister(Reg))
return true;
lex();
- // TODO: Parse subregister.
- Dest = MachineOperand::CreateReg(Reg, IsDef);
+ unsigned SubReg = 0;
+ if (Token.is(MIToken::colon)) {
+ if (parseSubRegisterIndex(SubReg))
+ return true;
+ }
+ Dest = MachineOperand::CreateReg(
+ Reg, Flags & RegState::Define, Flags & RegState::Implicit,
+ Flags & RegState::Kill, Flags & RegState::Dead, Flags & RegState::Undef,
+ /*isEarlyClobber=*/false, SubReg);
return false;
}
@@ -266,8 +418,8 @@ bool MIParser::parseMBBReference(MachineBasicBlock *&MBB) {
unsigned Number;
if (getUnsigned(Number))
return true;
- auto MBBInfo = MBBSlots.find(Number);
- if (MBBInfo == MBBSlots.end())
+ auto MBBInfo = PFS.MBBSlots.find(Number);
+ if (MBBInfo == PFS.MBBSlots.end())
return error(Twine("use of undefined machine basic block #") +
Twine(Number));
MBB = MBBInfo->second;
@@ -318,8 +470,14 @@ bool MIParser::parseGlobalAddressOperand(MachineOperand &Dest) {
bool MIParser::parseMachineOperand(MachineOperand &Dest) {
switch (Token.kind()) {
+ case MIToken::kw_implicit:
+ case MIToken::kw_implicit_define:
+ case MIToken::kw_dead:
+ case MIToken::kw_killed:
+ case MIToken::kw_undef:
case MIToken::underscore:
case MIToken::NamedRegister:
+ case MIToken::VirtualRegister:
return parseRegisterOperand(Dest);
case MIToken::IntegerLiteral:
return parseImmediateOperand(Dest);
@@ -408,16 +566,41 @@ const uint32_t *MIParser::getRegMask(StringRef Identifier) {
return RegMaskInfo->getValue();
}
-bool llvm::parseMachineInstr(
- MachineInstr *&MI, SourceMgr &SM, MachineFunction &MF, StringRef Src,
- const DenseMap<unsigned, MachineBasicBlock *> &MBBSlots,
- const SlotMapping &IRSlots, SMDiagnostic &Error) {
- return MIParser(SM, MF, Error, Src, MBBSlots, IRSlots).parse(MI);
+void MIParser::initNames2SubRegIndices() {
+ if (!Names2SubRegIndices.empty())
+ return;
+ const TargetRegisterInfo *TRI = MF.getSubtarget().getRegisterInfo();
+ for (unsigned I = 1, E = TRI->getNumSubRegIndices(); I < E; ++I)
+ Names2SubRegIndices.insert(
+ std::make_pair(StringRef(TRI->getSubRegIndexName(I)).lower(), I));
+}
+
+unsigned MIParser::getSubRegIndex(StringRef Name) {
+ initNames2SubRegIndices();
+ auto SubRegInfo = Names2SubRegIndices.find(Name);
+ if (SubRegInfo == Names2SubRegIndices.end())
+ return 0;
+ return SubRegInfo->getValue();
+}
+
+bool llvm::parseMachineInstr(MachineInstr *&MI, SourceMgr &SM,
+ MachineFunction &MF, StringRef Src,
+ const PerFunctionMIParsingState &PFS,
+ const SlotMapping &IRSlots, SMDiagnostic &Error) {
+ return MIParser(SM, MF, Error, Src, PFS, IRSlots).parse(MI);
+}
+
+bool llvm::parseMBBReference(MachineBasicBlock *&MBB, SourceMgr &SM,
+ MachineFunction &MF, StringRef Src,
+ const PerFunctionMIParsingState &PFS,
+ const SlotMapping &IRSlots, SMDiagnostic &Error) {
+ return MIParser(SM, MF, Error, Src, PFS, IRSlots).parseMBB(MBB);
}
-bool llvm::parseMBBReference(
- MachineBasicBlock *&MBB, SourceMgr &SM, MachineFunction &MF, StringRef Src,
- const DenseMap<unsigned, MachineBasicBlock *> &MBBSlots,
- const SlotMapping &IRSlots, SMDiagnostic &Error) {
- return MIParser(SM, MF, Error, Src, MBBSlots, IRSlots).parseMBB(MBB);
+bool llvm::parseNamedRegisterReference(unsigned &Reg, SourceMgr &SM,
+ MachineFunction &MF, StringRef Src,
+ const PerFunctionMIParsingState &PFS,
+ const SlotMapping &IRSlots,
+ SMDiagnostic &Error) {
+ return MIParser(SM, MF, Error, Src, PFS, IRSlots).parseNamedRegister(Reg);
}
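verifyImplicitOperands walks the expected implicit operands and the parsed operands from the back, so explicit operands at the front of the instruction are ignored and every expected implicit operand must be matched in order. A simplified stand-alone model of that walk (assumed struct, no source locations, no special-casing of calls):

#include <string>
#include <vector>

struct RegOperandModel {
  std::string Reg;
  bool IsDef;
  bool IsImplicit;
};

bool operator==(const RegOperandModel &A, const RegOperandModel &B) {
  return A.Reg == B.Reg && A.IsDef == B.IsDef && A.IsImplicit == B.IsImplicit;
}

// Returns true on error, following the parser's convention.
bool verifyImplicitOperandsModel(const std::vector<RegOperandModel> &Parsed,
                                 const std::vector<RegOperandModel> &Expected) {
  size_t I = Expected.size(), J = Parsed.size();
  while (I) {
    --I;
    if (J) {
      --J;
      if (Expected[I] == Parsed[J])
        continue;
    }
    return true; // missing or mismatched implicit operand
  }
  return false;
}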
diff --git a/lib/CodeGen/MIRParser/MIParser.h b/lib/CodeGen/MIRParser/MIParser.h
index 4d6d4e700217..fca4c4e6f885 100644
--- a/lib/CodeGen/MIRParser/MIParser.h
+++ b/lib/CodeGen/MIRParser/MIParser.h
@@ -26,16 +26,26 @@ struct SlotMapping;
class SMDiagnostic;
class SourceMgr;
+struct PerFunctionMIParsingState {
+ DenseMap<unsigned, MachineBasicBlock *> MBBSlots;
+ DenseMap<unsigned, unsigned> VirtualRegisterSlots;
+};
+
bool parseMachineInstr(MachineInstr *&MI, SourceMgr &SM, MachineFunction &MF,
- StringRef Src,
- const DenseMap<unsigned, MachineBasicBlock *> &MBBSlots,
+ StringRef Src, const PerFunctionMIParsingState &PFS,
const SlotMapping &IRSlots, SMDiagnostic &Error);
bool parseMBBReference(MachineBasicBlock *&MBB, SourceMgr &SM,
MachineFunction &MF, StringRef Src,
- const DenseMap<unsigned, MachineBasicBlock *> &MBBSlots,
+ const PerFunctionMIParsingState &PFS,
const SlotMapping &IRSlots, SMDiagnostic &Error);
+bool parseNamedRegisterReference(unsigned &Reg, SourceMgr &SM,
+ MachineFunction &MF, StringRef Src,
+ const PerFunctionMIParsingState &PFS,
+ const SlotMapping &IRSlots,
+ SMDiagnostic &Error);
+
} // end namespace llvm
#endif
diff --git a/lib/CodeGen/MIRParser/MIRParser.cpp b/lib/CodeGen/MIRParser/MIRParser.cpp
index 397458300782..16b0e1655891 100644
--- a/lib/CodeGen/MIRParser/MIRParser.cpp
+++ b/lib/CodeGen/MIRParser/MIRParser.cpp
@@ -21,6 +21,7 @@
#include "llvm/AsmParser/Parser.h"
#include "llvm/AsmParser/SlotMapping.h"
#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineFrameInfo.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/CodeGen/MIRYamlMapping.h"
#include "llvm/IR/BasicBlock.h"
@@ -48,6 +49,8 @@ class MIRParserImpl {
LLVMContext &Context;
StringMap<std::unique_ptr<yaml::MachineFunction>> Functions;
SlotMapping IRSlots;
+ /// Maps from register class names to register classes.
+ StringMap<const TargetRegisterClass *> Names2RegClasses;
public:
MIRParserImpl(std::unique_ptr<MemoryBuffer> Contents, StringRef Filename,
@@ -60,6 +63,11 @@ public:
/// Always returns true.
bool error(const Twine &Message);
+ /// Report an error with the given message at the given location.
+ ///
+ /// Always returns true.
+ bool error(SMLoc Loc, const Twine &Message);
+
/// Report a given error with the location translated from the location in an
/// embedded string literal to a location in the MIR file.
///
@@ -90,13 +98,18 @@ public:
/// Initialize the machine basic block using its YAML representation.
///
/// Return true if an error occurred.
- bool initializeMachineBasicBlock(
- MachineFunction &MF, MachineBasicBlock &MBB,
- const yaml::MachineBasicBlock &YamlMBB,
- const DenseMap<unsigned, MachineBasicBlock *> &MBBSlots);
+ bool initializeMachineBasicBlock(MachineFunction &MF, MachineBasicBlock &MBB,
+ const yaml::MachineBasicBlock &YamlMBB,
+ const PerFunctionMIParsingState &PFS);
+
+ bool
+ initializeRegisterInfo(const MachineFunction &MF,
+ MachineRegisterInfo &RegInfo,
+ const yaml::MachineFunction &YamlMF,
+ DenseMap<unsigned, unsigned> &VirtualRegisterSlots);
- bool initializeRegisterInfo(MachineRegisterInfo &RegInfo,
- const yaml::MachineFunction &YamlMF);
+ bool initializeFrameInfo(MachineFrameInfo &MFI,
+ const yaml::MachineFunction &YamlMF);
private:
/// Return a MIR diagnostic converted from an MI string diagnostic.
@@ -109,6 +122,14 @@ private:
/// Create an empty function with the given name.
void createDummyFunction(StringRef Name, Module &M);
+
+ void initNames2RegClasses(const MachineFunction &MF);
+
+ /// Check if the given identifier is a name of a register class.
+ ///
+ /// Return null if the name isn't a register class.
+ const TargetRegisterClass *getRegClass(const MachineFunction &MF,
+ StringRef Name);
};
} // end namespace llvm
@@ -125,6 +146,12 @@ bool MIRParserImpl::error(const Twine &Message) {
return true;
}
+bool MIRParserImpl::error(SMLoc Loc, const Twine &Message) {
+ Context.diagnose(DiagnosticInfoMIRParser(
+ DS_Error, SM.GetMessage(Loc, SourceMgr::DK_Error, Message)));
+ return true;
+}
+
bool MIRParserImpl::error(const SMDiagnostic &Error, SMRange SourceRange) {
assert(Error.getKind() == SourceMgr::DK_Error && "Expected an error");
reportDiagnostic(diagFromMIStringDiag(Error, SourceRange));
@@ -233,34 +260,44 @@ bool MIRParserImpl::initializeMachineFunction(MachineFunction &MF) {
MF.setAlignment(YamlMF.Alignment);
MF.setExposesReturnsTwice(YamlMF.ExposesReturnsTwice);
MF.setHasInlineAsm(YamlMF.HasInlineAsm);
- if (initializeRegisterInfo(MF.getRegInfo(), YamlMF))
+ PerFunctionMIParsingState PFS;
+ if (initializeRegisterInfo(MF, MF.getRegInfo(), YamlMF,
+ PFS.VirtualRegisterSlots))
+ return true;
+ if (initializeFrameInfo(*MF.getFrameInfo(), YamlMF))
return true;
const auto &F = *MF.getFunction();
- DenseMap<unsigned, MachineBasicBlock *> MBBSlots;
for (const auto &YamlMBB : YamlMF.BasicBlocks) {
const BasicBlock *BB = nullptr;
- if (!YamlMBB.Name.empty()) {
+ const yaml::StringValue &Name = YamlMBB.Name;
+ if (!Name.Value.empty()) {
BB = dyn_cast_or_null<BasicBlock>(
- F.getValueSymbolTable().lookup(YamlMBB.Name));
+ F.getValueSymbolTable().lookup(Name.Value));
if (!BB)
- return error(Twine("basic block '") + YamlMBB.Name +
- "' is not defined in the function '" + MF.getName() + "'");
+ return error(Name.SourceRange.Start,
+ Twine("basic block '") + Name.Value +
+ "' is not defined in the function '" + MF.getName() +
+ "'");
}
auto *MBB = MF.CreateMachineBasicBlock(BB);
MF.insert(MF.end(), MBB);
- bool WasInserted = MBBSlots.insert(std::make_pair(YamlMBB.ID, MBB)).second;
+ bool WasInserted =
+ PFS.MBBSlots.insert(std::make_pair(YamlMBB.ID, MBB)).second;
if (!WasInserted)
return error(Twine("redefinition of machine basic block with id #") +
Twine(YamlMBB.ID));
}
+ if (YamlMF.BasicBlocks.empty())
+ return error(Twine("machine function '") + Twine(MF.getName()) +
+ "' requires at least one machine basic block in its body");
// Initialize the machine basic blocks after creating them all so that the
// machine instructions parser can resolve the MBB references.
unsigned I = 0;
for (const auto &YamlMBB : YamlMF.BasicBlocks) {
if (initializeMachineBasicBlock(MF, *MF.getBlockNumbered(I++), YamlMBB,
- MBBSlots))
+ PFS))
return true;
}
return false;
@@ -269,7 +306,7 @@ bool MIRParserImpl::initializeMachineFunction(MachineFunction &MF) {
bool MIRParserImpl::initializeMachineBasicBlock(
MachineFunction &MF, MachineBasicBlock &MBB,
const yaml::MachineBasicBlock &YamlMBB,
- const DenseMap<unsigned, MachineBasicBlock *> &MBBSlots) {
+ const PerFunctionMIParsingState &PFS) {
MBB.setAlignment(YamlMBB.Alignment);
if (YamlMBB.AddressTaken)
MBB.setHasAddressTaken();
@@ -278,16 +315,24 @@ bool MIRParserImpl::initializeMachineBasicBlock(
// Parse the successors.
for (const auto &MBBSource : YamlMBB.Successors) {
MachineBasicBlock *SuccMBB = nullptr;
- if (parseMBBReference(SuccMBB, SM, MF, MBBSource.Value, MBBSlots, IRSlots,
+ if (parseMBBReference(SuccMBB, SM, MF, MBBSource.Value, PFS, IRSlots,
Error))
return error(Error, MBBSource.SourceRange);
// TODO: Report an error when adding the same successor more than once.
MBB.addSuccessor(SuccMBB);
}
+ // Parse the liveins.
+ for (const auto &LiveInSource : YamlMBB.LiveIns) {
+ unsigned Reg = 0;
+ if (parseNamedRegisterReference(Reg, SM, MF, LiveInSource.Value, PFS,
+ IRSlots, Error))
+ return error(Error, LiveInSource.SourceRange);
+ MBB.addLiveIn(Reg);
+ }
// Parse the instructions.
for (const auto &MISource : YamlMBB.Instructions) {
MachineInstr *MI = nullptr;
- if (parseMachineInstr(MI, SM, MF, MISource.Value, MBBSlots, IRSlots, Error))
+ if (parseMachineInstr(MI, SM, MF, MISource.Value, PFS, IRSlots, Error))
return error(Error, MISource.SourceRange);
MBB.insert(MBB.end(), MI);
}
@@ -295,7 +340,9 @@ bool MIRParserImpl::initializeMachineBasicBlock(
}
bool MIRParserImpl::initializeRegisterInfo(
- MachineRegisterInfo &RegInfo, const yaml::MachineFunction &YamlMF) {
+ const MachineFunction &MF, MachineRegisterInfo &RegInfo,
+ const yaml::MachineFunction &YamlMF,
+ DenseMap<unsigned, unsigned> &VirtualRegisterSlots) {
assert(RegInfo.isSSA());
if (!YamlMF.IsSSA)
RegInfo.leaveSSA();
@@ -303,6 +350,67 @@ bool MIRParserImpl::initializeRegisterInfo(
if (!YamlMF.TracksRegLiveness)
RegInfo.invalidateLiveness();
RegInfo.enableSubRegLiveness(YamlMF.TracksSubRegLiveness);
+
+ // Parse the virtual register information.
+ for (const auto &VReg : YamlMF.VirtualRegisters) {
+ const auto *RC = getRegClass(MF, VReg.Class.Value);
+ if (!RC)
+ return error(VReg.Class.SourceRange.Start,
+ Twine("use of undefined register class '") +
+ VReg.Class.Value + "'");
+ unsigned Reg = RegInfo.createVirtualRegister(RC);
+ // TODO: Report an error when the same virtual register with the same ID is
+ // redefined.
+ VirtualRegisterSlots.insert(std::make_pair(VReg.ID, Reg));
+ }
+ return false;
+}
+
+bool MIRParserImpl::initializeFrameInfo(MachineFrameInfo &MFI,
+ const yaml::MachineFunction &YamlMF) {
+ const yaml::MachineFrameInfo &YamlMFI = YamlMF.FrameInfo;
+ MFI.setFrameAddressIsTaken(YamlMFI.IsFrameAddressTaken);
+ MFI.setReturnAddressIsTaken(YamlMFI.IsReturnAddressTaken);
+ MFI.setHasStackMap(YamlMFI.HasStackMap);
+ MFI.setHasPatchPoint(YamlMFI.HasPatchPoint);
+ MFI.setStackSize(YamlMFI.StackSize);
+ MFI.setOffsetAdjustment(YamlMFI.OffsetAdjustment);
+ if (YamlMFI.MaxAlignment)
+ MFI.ensureMaxAlignment(YamlMFI.MaxAlignment);
+ MFI.setAdjustsStack(YamlMFI.AdjustsStack);
+ MFI.setHasCalls(YamlMFI.HasCalls);
+ MFI.setMaxCallFrameSize(YamlMFI.MaxCallFrameSize);
+ MFI.setHasOpaqueSPAdjustment(YamlMFI.HasOpaqueSPAdjustment);
+ MFI.setHasVAStart(YamlMFI.HasVAStart);
+ MFI.setHasMustTailInVarArgFunc(YamlMFI.HasMustTailInVarArgFunc);
+
+ // Initialize the fixed frame objects.
+ for (const auto &Object : YamlMF.FixedStackObjects) {
+ int ObjectIdx;
+ if (Object.Type != yaml::FixedMachineStackObject::SpillSlot)
+ ObjectIdx = MFI.CreateFixedObject(Object.Size, Object.Offset,
+ Object.IsImmutable, Object.IsAliased);
+ else
+ ObjectIdx = MFI.CreateFixedSpillStackObject(Object.Size, Object.Offset);
+ MFI.setObjectAlignment(ObjectIdx, Object.Alignment);
+ // TODO: Store the mapping between fixed object IDs and object indices to
+ // parse fixed stack object references correctly.
+ }
+
+ // Initialize the ordinary frame objects.
+ for (const auto &Object : YamlMF.StackObjects) {
+ int ObjectIdx;
+ if (Object.Type == yaml::MachineStackObject::VariableSized)
+ ObjectIdx =
+ MFI.CreateVariableSizedObject(Object.Alignment, /*Alloca=*/nullptr);
+ else
+ ObjectIdx = MFI.CreateStackObject(
+ Object.Size, Object.Alignment,
+ Object.Type == yaml::MachineStackObject::SpillSlot);
+ MFI.setObjectOffset(ObjectIdx, Object.Offset);
+ // TODO: Store the mapping between object IDs and object indices to parse
+ // stack object references correctly.
+ }
return false;
}
@@ -353,6 +461,26 @@ SMDiagnostic MIRParserImpl::diagFromLLVMAssemblyDiag(const SMDiagnostic &Error,
Error.getFixIts());
}
+void MIRParserImpl::initNames2RegClasses(const MachineFunction &MF) {
+ if (!Names2RegClasses.empty())
+ return;
+ const TargetRegisterInfo *TRI = MF.getSubtarget().getRegisterInfo();
+ for (unsigned I = 0, E = TRI->getNumRegClasses(); I < E; ++I) {
+ const auto *RC = TRI->getRegClass(I);
+ Names2RegClasses.insert(
+ std::make_pair(StringRef(TRI->getRegClassName(RC)).lower(), RC));
+ }
+}
+
+const TargetRegisterClass *MIRParserImpl::getRegClass(const MachineFunction &MF,
+ StringRef Name) {
+ initNames2RegClasses(MF);
+ auto RegClassInfo = Names2RegClasses.find(Name);
+ if (RegClassInfo == Names2RegClasses.end())
+ return nullptr;
+ return RegClassInfo->getValue();
+}
+
MIRParser::MIRParser(std::unique_ptr<MIRParserImpl> Impl)
: Impl(std::move(Impl)) {}
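initializeMachineFunction creates every machine basic block and records its slot before any block body is parsed, so instruction operands can refer to blocks that appear later in the file. A minimal stand-alone model of that two-pass shape (assumed types; real parsing and error reporting omitted):

#include <map>
#include <vector>

struct BlockModel {
  unsigned ID;
  std::vector<unsigned> SuccessorIDs;
};

// Returns true on error, mirroring the parser's convention.
bool buildBlocksModel(const std::vector<BlockModel> &Parsed) {
  std::map<unsigned, const BlockModel *> Slots;
  for (const BlockModel &B : Parsed) // pass 1: register every block ID
    if (!Slots.insert({B.ID, &B}).second)
      return true; // duplicate block ID
  for (const BlockModel &B : Parsed) // pass 2: resolve forward references
    for (unsigned Succ : B.SuccessorIDs)
      if (!Slots.count(Succ))
        return true; // reference to an undefined block
  return false;
}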
diff --git a/lib/CodeGen/MIRPrinter.cpp b/lib/CodeGen/MIRPrinter.cpp
index 76cbe2994c95..d5cf9244199e 100644
--- a/lib/CodeGen/MIRPrinter.cpp
+++ b/lib/CodeGen/MIRPrinter.cpp
@@ -15,10 +15,12 @@
#include "MIRPrinter.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineFrameInfo.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/CodeGen/MIRYamlMapping.h"
#include "llvm/IR/BasicBlock.h"
#include "llvm/IR/Module.h"
+#include "llvm/IR/ModuleSlotTracker.h"
#include "llvm/Support/MemoryBuffer.h"
#include "llvm/Support/raw_ostream.h"
#include "llvm/Support/YAMLTraits.h"
@@ -40,9 +42,13 @@ public:
void print(const MachineFunction &MF);
- void convert(yaml::MachineFunction &MF, const MachineRegisterInfo &RegInfo);
- void convert(const Module &M, yaml::MachineBasicBlock &YamlMBB,
+ void convert(yaml::MachineFunction &MF, const MachineRegisterInfo &RegInfo,
+ const TargetRegisterInfo *TRI);
+ void convert(yaml::MachineFrameInfo &YamlMFI, const MachineFrameInfo &MFI);
+ void convert(ModuleSlotTracker &MST, yaml::MachineBasicBlock &YamlMBB,
const MachineBasicBlock &MBB);
+ void convertStackObjects(yaml::MachineFunction &MF,
+ const MachineFrameInfo &MFI);
private:
void initRegisterMaskIds(const MachineFunction &MF);
@@ -51,14 +57,14 @@ private:
/// This class prints out the machine instructions using the MIR serialization
/// format.
class MIPrinter {
- const Module &M;
raw_ostream &OS;
+ ModuleSlotTracker &MST;
const DenseMap<const uint32_t *, unsigned> &RegisterMaskIds;
public:
- MIPrinter(const Module &M, raw_ostream &OS,
+ MIPrinter(raw_ostream &OS, ModuleSlotTracker &MST,
const DenseMap<const uint32_t *, unsigned> &RegisterMaskIds)
- : M(M), OS(OS), RegisterMaskIds(RegisterMaskIds) {}
+ : OS(OS), MST(MST), RegisterMaskIds(RegisterMaskIds) {}
void print(const MachineInstr &MI);
void printMBBReference(const MachineBasicBlock &MBB);
@@ -84,6 +90,19 @@ template <> struct BlockScalarTraits<Module> {
} // end namespace yaml
} // end namespace llvm
+static void printReg(unsigned Reg, raw_ostream &OS,
+ const TargetRegisterInfo *TRI) {
+ // TODO: Print Stack Slots.
+ if (!Reg)
+ OS << '_';
+ else if (TargetRegisterInfo::isVirtualRegister(Reg))
+ OS << '%' << TargetRegisterInfo::virtReg2Index(Reg);
+ else if (Reg < TRI->getNumRegs())
+ OS << '%' << StringRef(TRI->getName(Reg)).lower();
+ else
+ llvm_unreachable("Can't print this kind of register yet");
+}
+
void MIRPrinter::print(const MachineFunction &MF) {
initRegisterMaskIds(MF);
@@ -92,10 +111,12 @@ void MIRPrinter::print(const MachineFunction &MF) {
YamlMF.Alignment = MF.getAlignment();
YamlMF.ExposesReturnsTwice = MF.exposesReturnsTwice();
YamlMF.HasInlineAsm = MF.hasInlineAsm();
- convert(YamlMF, MF.getRegInfo());
+ convert(YamlMF, MF.getRegInfo(), MF.getSubtarget().getRegisterInfo());
+ convert(YamlMF.FrameInfo, *MF.getFrameInfo());
+ convertStackObjects(YamlMF, *MF.getFrameInfo());
int I = 0;
- const auto &M = *MF.getFunction()->getParent();
+ ModuleSlotTracker MST(MF.getFunction()->getParent());
for (const auto &MBB : MF) {
// TODO: Allow printing of non sequentially numbered MBBs.
// This is currently needed as the basic block references get their index
@@ -105,7 +126,7 @@ void MIRPrinter::print(const MachineFunction &MF) {
"Can't print MBBs that aren't sequentially numbered");
(void)I;
yaml::MachineBasicBlock YamlMBB;
- convert(M, YamlMBB, MBB);
+ convert(MST, YamlMBB, MBB);
YamlMF.BasicBlocks.push_back(YamlMBB);
}
yaml::Output Out(OS);
@@ -113,37 +134,120 @@ void MIRPrinter::print(const MachineFunction &MF) {
}
void MIRPrinter::convert(yaml::MachineFunction &MF,
- const MachineRegisterInfo &RegInfo) {
+ const MachineRegisterInfo &RegInfo,
+ const TargetRegisterInfo *TRI) {
MF.IsSSA = RegInfo.isSSA();
MF.TracksRegLiveness = RegInfo.tracksLiveness();
MF.TracksSubRegLiveness = RegInfo.subRegLivenessEnabled();
+
+ // Print the virtual register definitions.
+ for (unsigned I = 0, E = RegInfo.getNumVirtRegs(); I < E; ++I) {
+ unsigned Reg = TargetRegisterInfo::index2VirtReg(I);
+ yaml::VirtualRegisterDefinition VReg;
+ VReg.ID = I;
+ VReg.Class =
+ StringRef(TRI->getRegClassName(RegInfo.getRegClass(Reg))).lower();
+ MF.VirtualRegisters.push_back(VReg);
+ }
+}
+
+void MIRPrinter::convert(yaml::MachineFrameInfo &YamlMFI,
+ const MachineFrameInfo &MFI) {
+ YamlMFI.IsFrameAddressTaken = MFI.isFrameAddressTaken();
+ YamlMFI.IsReturnAddressTaken = MFI.isReturnAddressTaken();
+ YamlMFI.HasStackMap = MFI.hasStackMap();
+ YamlMFI.HasPatchPoint = MFI.hasPatchPoint();
+ YamlMFI.StackSize = MFI.getStackSize();
+ YamlMFI.OffsetAdjustment = MFI.getOffsetAdjustment();
+ YamlMFI.MaxAlignment = MFI.getMaxAlignment();
+ YamlMFI.AdjustsStack = MFI.adjustsStack();
+ YamlMFI.HasCalls = MFI.hasCalls();
+ YamlMFI.MaxCallFrameSize = MFI.getMaxCallFrameSize();
+ YamlMFI.HasOpaqueSPAdjustment = MFI.hasOpaqueSPAdjustment();
+ YamlMFI.HasVAStart = MFI.hasVAStart();
+ YamlMFI.HasMustTailInVarArgFunc = MFI.hasMustTailInVarArgFunc();
}
-void MIRPrinter::convert(const Module &M, yaml::MachineBasicBlock &YamlMBB,
+void MIRPrinter::convertStackObjects(yaml::MachineFunction &MF,
+ const MachineFrameInfo &MFI) {
+ // Process fixed stack objects.
+ unsigned ID = 0;
+ for (int I = MFI.getObjectIndexBegin(); I < 0; ++I) {
+ if (MFI.isDeadObjectIndex(I))
+ continue;
+
+ yaml::FixedMachineStackObject YamlObject;
+ YamlObject.ID = ID++;
+ YamlObject.Type = MFI.isSpillSlotObjectIndex(I)
+ ? yaml::FixedMachineStackObject::SpillSlot
+ : yaml::FixedMachineStackObject::DefaultType;
+ YamlObject.Offset = MFI.getObjectOffset(I);
+ YamlObject.Size = MFI.getObjectSize(I);
+ YamlObject.Alignment = MFI.getObjectAlignment(I);
+ YamlObject.IsImmutable = MFI.isImmutableObjectIndex(I);
+ YamlObject.IsAliased = MFI.isAliasedObjectIndex(I);
+ MF.FixedStackObjects.push_back(YamlObject);
+ // TODO: Store the mapping between fixed object IDs and object indices to
+ // print the fixed stack object references correctly.
+ }
+
+ // Process ordinary stack objects.
+ ID = 0;
+ for (int I = 0, E = MFI.getObjectIndexEnd(); I < E; ++I) {
+ if (MFI.isDeadObjectIndex(I))
+ continue;
+
+ yaml::MachineStackObject YamlObject;
+ YamlObject.ID = ID++;
+ YamlObject.Type = MFI.isSpillSlotObjectIndex(I)
+ ? yaml::MachineStackObject::SpillSlot
+ : MFI.isVariableSizedObjectIndex(I)
+ ? yaml::MachineStackObject::VariableSized
+ : yaml::MachineStackObject::DefaultType;
+ YamlObject.Offset = MFI.getObjectOffset(I);
+ YamlObject.Size = MFI.getObjectSize(I);
+ YamlObject.Alignment = MFI.getObjectAlignment(I);
+
+ MF.StackObjects.push_back(YamlObject);
+ // TODO: Store the mapping between object IDs and object indices to print
+ // the stack object references correctly.
+ }
+}
+
+void MIRPrinter::convert(ModuleSlotTracker &MST,
+ yaml::MachineBasicBlock &YamlMBB,
const MachineBasicBlock &MBB) {
assert(MBB.getNumber() >= 0 && "Invalid MBB number");
YamlMBB.ID = (unsigned)MBB.getNumber();
// TODO: Serialize unnamed BB references.
if (const auto *BB = MBB.getBasicBlock())
- YamlMBB.Name = BB->hasName() ? BB->getName() : "<unnamed bb>";
+ YamlMBB.Name.Value = BB->hasName() ? BB->getName() : "<unnamed bb>";
else
- YamlMBB.Name = "";
+ YamlMBB.Name.Value = "";
YamlMBB.Alignment = MBB.getAlignment();
YamlMBB.AddressTaken = MBB.hasAddressTaken();
YamlMBB.IsLandingPad = MBB.isLandingPad();
for (const auto *SuccMBB : MBB.successors()) {
std::string Str;
raw_string_ostream StrOS(Str);
- MIPrinter(M, StrOS, RegisterMaskIds).printMBBReference(*SuccMBB);
+ MIPrinter(StrOS, MST, RegisterMaskIds).printMBBReference(*SuccMBB);
YamlMBB.Successors.push_back(StrOS.str());
}
-
+ // Print the live in registers.
+ const auto *TRI = MBB.getParent()->getSubtarget().getRegisterInfo();
+ assert(TRI && "Expected target register info");
+ for (auto I = MBB.livein_begin(), E = MBB.livein_end(); I != E; ++I) {
+ std::string Str;
+ raw_string_ostream StrOS(Str);
+ printReg(*I, StrOS, TRI);
+ YamlMBB.LiveIns.push_back(StrOS.str());
+ }
// Print the machine instructions.
YamlMBB.Instructions.reserve(MBB.size());
std::string Str;
for (const auto &MI : MBB) {
raw_string_ostream StrOS(Str);
- MIPrinter(M, StrOS, RegisterMaskIds).print(MI);
+ MIPrinter(StrOS, MST, RegisterMaskIds).print(MI);
YamlMBB.Instructions.push_back(StrOS.str());
Str.clear();
}
@@ -188,18 +292,6 @@ void MIPrinter::print(const MachineInstr &MI) {
}
}
-static void printReg(unsigned Reg, raw_ostream &OS,
- const TargetRegisterInfo *TRI) {
- // TODO: Print Stack Slots.
- // TODO: Print virtual registers.
- if (!Reg)
- OS << '_';
- else if (Reg < TRI->getNumRegs())
- OS << '%' << StringRef(TRI->getName(Reg)).lower();
- else
- llvm_unreachable("Can't print this kind of register yet");
-}
-
void MIPrinter::printMBBReference(const MachineBasicBlock &MBB) {
OS << "%bb." << MBB.getNumber();
if (const auto *BB = MBB.getBasicBlock()) {
@@ -211,9 +303,19 @@ void MIPrinter::printMBBReference(const MachineBasicBlock &MBB) {
void MIPrinter::print(const MachineOperand &Op, const TargetRegisterInfo *TRI) {
switch (Op.getType()) {
case MachineOperand::MO_Register:
- // TODO: Print register flags.
+ // TODO: Print the other register flags.
+ if (Op.isImplicit())
+ OS << (Op.isDef() ? "implicit-def " : "implicit ");
+ if (Op.isDead())
+ OS << "dead ";
+ if (Op.isKill())
+ OS << "killed ";
+ if (Op.isUndef())
+ OS << "undef ";
printReg(Op.getReg(), OS, TRI);
- // TODO: Print sub register.
+ // Print the sub register.
+ if (Op.getSubReg() != 0)
+ OS << ':' << TRI->getSubRegIndexName(Op.getSubReg());
break;
case MachineOperand::MO_Immediate:
OS << Op.getImm();
@@ -222,10 +324,7 @@ void MIPrinter::print(const MachineOperand &Op, const TargetRegisterInfo *TRI) {
printMBBReference(*Op.getMBB());
break;
case MachineOperand::MO_GlobalAddress:
- // FIXME: Make this faster - print as operand will create a slot tracker to
- // print unnamed values for the whole module every time it's called, which
- // is inefficient.
- Op.getGlobal()->printAsOperand(OS, /*PrintType=*/false, &M);
+ Op.getGlobal()->printAsOperand(OS, /*PrintType=*/false, MST);
// TODO: Print offset and target flags.
break;
case MachineOperand::MO_RegisterMask: {
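The MO_Register case now prints flag keywords before the register and the subregister index after it, matching what the new parser accepts. A stand-alone sketch of that output order (assumed struct; the real printer works on MachineOperand):

#include <ostream>
#include <string>

struct RegOperandPrintModel {
  bool IsImplicit, IsDef, IsDead, IsKill, IsUndef, IsVirtual;
  unsigned VirtIndex;   // used when IsVirtual
  std::string Name;     // lower-case physical register name otherwise
  std::string SubReg;   // empty when there is no subregister index
};

void printRegOperandModel(std::ostream &OS, const RegOperandPrintModel &Op) {
  if (Op.IsImplicit)
    OS << (Op.IsDef ? "implicit-def " : "implicit ");
  if (Op.IsDead)
    OS << "dead ";
  if (Op.IsKill)
    OS << "killed ";
  if (Op.IsUndef)
    OS << "undef ";
  if (Op.IsVirtual)
    OS << '%' << Op.VirtIndex; // e.g. %0
  else
    OS << '%' << Op.Name;      // e.g. %eax
  if (!Op.SubReg.empty())
    OS << ':' << Op.SubReg;    // e.g. :sub_32bit
}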
diff --git a/lib/CodeGen/MachineDominators.cpp b/lib/CodeGen/MachineDominators.cpp
index 467a2e4eb428..3f04bb0b532b 100644
--- a/lib/CodeGen/MachineDominators.cpp
+++ b/lib/CodeGen/MachineDominators.cpp
@@ -19,8 +19,8 @@
using namespace llvm;
namespace llvm {
-TEMPLATE_INSTANTIATION(class DomTreeNodeBase<MachineBasicBlock>);
-TEMPLATE_INSTANTIATION(class DominatorTreeBase<MachineBasicBlock>);
+template class DomTreeNodeBase<MachineBasicBlock>;
+template class DominatorTreeBase<MachineBasicBlock>;
}
char MachineDominatorTree::ID = 0;
diff --git a/lib/CodeGen/MachineFunction.cpp b/lib/CodeGen/MachineFunction.cpp
index 800d1b5bd57d..9856e70edaef 100644
--- a/lib/CodeGen/MachineFunction.cpp
+++ b/lib/CodeGen/MachineFunction.cpp
@@ -29,6 +29,7 @@
#include "llvm/IR/DataLayout.h"
#include "llvm/IR/DebugInfo.h"
#include "llvm/IR/Function.h"
+#include "llvm/IR/Module.h"
#include "llvm/IR/ModuleSlotTracker.h"
#include "llvm/MC/MCAsmInfo.h"
#include "llvm/MC/MCContext.h"
@@ -74,7 +75,7 @@ MachineFunction::MachineFunction(const Function *F, const TargetMachine &TM,
if (Fn->hasFnAttribute(Attribute::StackAlignment))
FrameInfo->ensureMaxAlignment(Fn->getFnStackAlignment());
- ConstantPool = new (Allocator) MachineConstantPool(TM);
+ ConstantPool = new (Allocator) MachineConstantPool(getDataLayout());
Alignment = STI->getTargetLowering()->getMinFunctionAlignment();
// FIXME: Shouldn't use pref alignment if explicit alignment is set on Fn.
@@ -118,6 +119,10 @@ MachineFunction::~MachineFunction() {
}
}
+const DataLayout &MachineFunction::getDataLayout() const {
+ return Fn->getParent()->getDataLayout();
+}
+
/// Get the JumpTableInfo for this function.
/// If it does not already exist, allocate one.
MachineJumpTableInfo *MachineFunction::
@@ -458,12 +463,12 @@ unsigned MachineFunction::addLiveIn(unsigned PReg,
/// normal 'L' label is returned.
MCSymbol *MachineFunction::getJTISymbol(unsigned JTI, MCContext &Ctx,
bool isLinkerPrivate) const {
- const DataLayout *DL = getTarget().getDataLayout();
+ const DataLayout &DL = getDataLayout();
assert(JumpTableInfo && "No jump tables");
assert(JTI < JumpTableInfo->getJumpTables().size() && "Invalid JTI!");
- const char *Prefix = isLinkerPrivate ? DL->getLinkerPrivateGlobalPrefix() :
- DL->getPrivateGlobalPrefix();
+ const char *Prefix = isLinkerPrivate ? DL.getLinkerPrivateGlobalPrefix()
+ : DL.getPrivateGlobalPrefix();
SmallString<60> Name;
raw_svector_ostream(Name)
<< Prefix << "JTI" << getFunctionNumber() << '_' << JTI;
@@ -472,9 +477,9 @@ MCSymbol *MachineFunction::getJTISymbol(unsigned JTI, MCContext &Ctx,
/// Return a function-local symbol to represent the PIC base.
MCSymbol *MachineFunction::getPICBaseSymbol() const {
- const DataLayout *DL = getTarget().getDataLayout();
- return Ctx.getOrCreateSymbol(Twine(DL->getPrivateGlobalPrefix())+
- Twine(getFunctionNumber())+"$pb");
+ const DataLayout &DL = getDataLayout();
+ return Ctx.getOrCreateSymbol(Twine(DL.getPrivateGlobalPrefix()) +
+ Twine(getFunctionNumber()) + "$pb");
}
//===----------------------------------------------------------------------===//
@@ -790,10 +795,6 @@ void MachineJumpTableInfo::dump() const { print(dbgs()); }
void MachineConstantPoolValue::anchor() { }
-const DataLayout *MachineConstantPool::getDataLayout() const {
- return TM.getDataLayout();
-}
-
Type *MachineConstantPoolEntry::getType() const {
if (isMachineConstantPoolEntry())
return Val.MachineCPVal->getType();
@@ -851,7 +852,7 @@ MachineConstantPool::~MachineConstantPool() {
/// Test whether the given two constants can be allocated the same constant pool
/// entry.
static bool CanShareConstantPoolEntry(const Constant *A, const Constant *B,
- const DataLayout *TD) {
+ const DataLayout &DL) {
// Handle the trivial case quickly.
if (A == B) return true;
@@ -865,8 +866,8 @@ static bool CanShareConstantPoolEntry(const Constant *A, const Constant *B,
return false;
// For now, only support constants with the same size.
- uint64_t StoreSize = TD->getTypeStoreSize(A->getType());
- if (StoreSize != TD->getTypeStoreSize(B->getType()) || StoreSize > 128)
+ uint64_t StoreSize = DL.getTypeStoreSize(A->getType());
+ if (StoreSize != DL.getTypeStoreSize(B->getType()) || StoreSize > 128)
return false;
Type *IntTy = IntegerType::get(A->getContext(), StoreSize*8);
@@ -877,16 +878,16 @@ static bool CanShareConstantPoolEntry(const Constant *A, const Constant *B,
// DataLayout.
if (isa<PointerType>(A->getType()))
A = ConstantFoldInstOperands(Instruction::PtrToInt, IntTy,
- const_cast<Constant *>(A), *TD);
+ const_cast<Constant *>(A), DL);
else if (A->getType() != IntTy)
A = ConstantFoldInstOperands(Instruction::BitCast, IntTy,
- const_cast<Constant *>(A), *TD);
+ const_cast<Constant *>(A), DL);
if (isa<PointerType>(B->getType()))
B = ConstantFoldInstOperands(Instruction::PtrToInt, IntTy,
- const_cast<Constant *>(B), *TD);
+ const_cast<Constant *>(B), DL);
else if (B->getType() != IntTy)
B = ConstantFoldInstOperands(Instruction::BitCast, IntTy,
- const_cast<Constant *>(B), *TD);
+ const_cast<Constant *>(B), DL);
return A == B;
}
@@ -903,8 +904,7 @@ unsigned MachineConstantPool::getConstantPoolIndex(const Constant *C,
// FIXME, this could be made much more efficient for large constant pools.
for (unsigned i = 0, e = Constants.size(); i != e; ++i)
if (!Constants[i].isMachineConstantPoolEntry() &&
- CanShareConstantPoolEntry(Constants[i].Val.ConstVal, C,
- getDataLayout())) {
+ CanShareConstantPoolEntry(Constants[i].Val.ConstVal, C, DL)) {
if ((unsigned)Constants[i].getAlignment() < Alignment)
Constants[i].Alignment = Alignment;
return i;
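
Not part of the patch, but for context: with MachineConstantPool holding a DataLayout and MachineFunction::getDataLayout() forwarding to the parent module, call sites stop going through the TargetMachine. A minimal sketch of the migration, assuming MF is a MachineFunction and Ty a Type*:

// Before: DataLayout reached through the TargetMachine, returned as a pointer.
const DataLayout *OldDL = MF.getTarget().getDataLayout();
uint64_t OldSize = OldDL->getTypeStoreSize(Ty);
// After: DataLayout taken from the IR module via the MachineFunction, by
// reference, so there is no pointer to null-check.
const DataLayout &DL = MF.getDataLayout();
uint64_t Size = DL.getTypeStoreSize(Ty);
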
diff --git a/lib/CodeGen/MachineModuleInfo.cpp b/lib/CodeGen/MachineModuleInfo.cpp
index 42d0603ab96b..6a206249d834 100644
--- a/lib/CodeGen/MachineModuleInfo.cpp
+++ b/lib/CodeGen/MachineModuleInfo.cpp
@@ -320,7 +320,10 @@ void MachineModuleInfo::addPersonality(MachineBasicBlock *LandingPad,
const Function *Personality) {
LandingPadInfo &LP = getOrCreateLandingPadInfo(LandingPad);
LP.Personality = Personality;
+ addPersonality(Personality);
+}
+void MachineModuleInfo::addPersonality(const Function *Personality) {
for (unsigned i = 0; i < Personalities.size(); ++i)
if (Personalities[i] == Personality)
return;
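
The new addPersonality(const Function *) overload records a personality function without requiring a landing pad, and the landing-pad form above now delegates to it. A short usage sketch, where PersonalityFn and LPadMBB are assumed placeholders:

MachineModuleInfo &MMI = MF.getMMI();
// Register a personality directly; duplicates are filtered out internally.
MMI.addPersonality(PersonalityFn);
// The existing form keeps working and now forwards to the overload above.
MMI.addPersonality(LPadMBB, PersonalityFn);
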
diff --git a/lib/CodeGen/MachineRegisterInfo.cpp b/lib/CodeGen/MachineRegisterInfo.cpp
index 278a8f24d63e..5984af87a184 100644
--- a/lib/CodeGen/MachineRegisterInfo.cpp
+++ b/lib/CodeGen/MachineRegisterInfo.cpp
@@ -13,6 +13,7 @@
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/IR/Function.h"
#include "llvm/Support/raw_os_ostream.h"
#include "llvm/Target/TargetInstrInfo.h"
#include "llvm/Target/TargetMachine.h"
@@ -28,7 +29,6 @@ MachineRegisterInfo::MachineRegisterInfo(const MachineFunction *MF)
TracksSubRegLiveness(false) {
VRegInfo.reserve(256);
RegAllocHints.reserve(256);
- UsedRegUnits.resize(getTargetRegisterInfo()->getNumRegUnits());
UsedPhysRegMask.resize(getTargetRegisterInfo()->getNumRegs());
// Create the physreg use/def lists.
@@ -441,3 +441,49 @@ void MachineRegisterInfo::markUsesInDebugValueAsUndef(unsigned Reg) const {
UseMI->getOperand(0).setReg(0U);
}
}
+
+static const Function *getCalledFunction(const MachineInstr &MI) {
+ for (const MachineOperand &MO : MI.operands()) {
+ if (!MO.isGlobal())
+ continue;
+ const Function *Func = dyn_cast<Function>(MO.getGlobal());
+ if (Func != nullptr)
+ return Func;
+ }
+ return nullptr;
+}
+
+static bool isNoReturnDef(const MachineOperand &MO) {
+ // Anything which is not a noreturn function is a real def.
+ const MachineInstr &MI = *MO.getParent();
+ if (!MI.isCall())
+ return false;
+ const MachineBasicBlock &MBB = *MI.getParent();
+ if (!MBB.succ_empty())
+ return false;
+ const MachineFunction &MF = *MBB.getParent();
+ // We need to keep correct unwind information even if the function will
+ // not return, since the runtime may need it.
+ if (MF.getFunction()->hasFnAttribute(Attribute::UWTable))
+ return false;
+ const Function *Called = getCalledFunction(MI);
+ if (Called == nullptr || !Called->hasFnAttribute(Attribute::NoReturn)
+ || !Called->hasFnAttribute(Attribute::NoUnwind))
+ return false;
+
+ return true;
+}
+
+bool MachineRegisterInfo::isPhysRegModified(unsigned PhysReg) const {
+ if (UsedPhysRegMask.test(PhysReg))
+ return true;
+ const TargetRegisterInfo *TRI = getTargetRegisterInfo();
+ for (MCRegAliasIterator AI(PhysReg, TRI, true); AI.isValid(); ++AI) {
+ for (const MachineOperand &MO : make_range(def_begin(*AI), def_end())) {
+ if (isNoReturnDef(MO))
+ continue;
+ return true;
+ }
+ }
+ return false;
+}
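
isPhysRegModified scans the def lists of the register and all of its aliases, skipping defs that only occur in block-terminating calls to noreturn+nounwind functions (unless the caller needs unwind tables). A minimal sketch of a consumer, with MyTarget::FP as a placeholder register not taken from this patch:

// Sketch: does this function write the frame pointer anywhere that matters?
static bool clobbersFramePointer(const MachineFunction &MF) {
  return MF.getRegInfo().isPhysRegModified(MyTarget::FP);
}
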
diff --git a/lib/CodeGen/MachineTraceMetrics.cpp b/lib/CodeGen/MachineTraceMetrics.cpp
index f9adba0b35c4..9404c687d410 100644
--- a/lib/CodeGen/MachineTraceMetrics.cpp
+++ b/lib/CodeGen/MachineTraceMetrics.cpp
@@ -509,18 +509,17 @@ MachineTraceMetrics::Ensemble::invalidate(const MachineBasicBlock *BadMBB) {
<< " height.\n");
// Find any MBB predecessors that have MBB as their preferred successor.
// They are the only ones that need to be invalidated.
- for (MachineBasicBlock::const_pred_iterator
- I = MBB->pred_begin(), E = MBB->pred_end(); I != E; ++I) {
- TraceBlockInfo &TBI = BlockInfo[(*I)->getNumber()];
+ for (const MachineBasicBlock *Pred : MBB->predecessors()) {
+ TraceBlockInfo &TBI = BlockInfo[Pred->getNumber()];
if (!TBI.hasValidHeight())
continue;
if (TBI.Succ == MBB) {
TBI.invalidateHeight();
- WorkList.push_back(*I);
+ WorkList.push_back(Pred);
continue;
}
// Verify that TBI.Succ is actually a *I successor.
- assert((!TBI.Succ || (*I)->isSuccessor(TBI.Succ)) && "CFG changed");
+ assert((!TBI.Succ || Pred->isSuccessor(TBI.Succ)) && "CFG changed");
}
} while (!WorkList.empty());
}
@@ -535,18 +534,17 @@ MachineTraceMetrics::Ensemble::invalidate(const MachineBasicBlock *BadMBB) {
<< " depth.\n");
// Find any MBB successors that have MBB as their preferred predecessor.
// They are the only ones that need to be invalidated.
- for (MachineBasicBlock::const_succ_iterator
- I = MBB->succ_begin(), E = MBB->succ_end(); I != E; ++I) {
- TraceBlockInfo &TBI = BlockInfo[(*I)->getNumber()];
+ for (const MachineBasicBlock *Succ : MBB->successors()) {
+ TraceBlockInfo &TBI = BlockInfo[Succ->getNumber()];
if (!TBI.hasValidDepth())
continue;
if (TBI.Pred == MBB) {
TBI.invalidateDepth();
- WorkList.push_back(*I);
+ WorkList.push_back(Succ);
continue;
}
// Verify that TBI.Pred is actually a *I predecessor.
- assert((!TBI.Pred || (*I)->isPredecessor(TBI.Pred)) && "CFG changed");
+ assert((!TBI.Pred || Succ->isPredecessor(TBI.Pred)) && "CFG changed");
}
} while (!WorkList.empty());
}
@@ -998,8 +996,7 @@ computeInstrHeights(const MachineBasicBlock *MBB) {
// MBB is the highest precomputed block in the trace.
if (MBB) {
TraceBlockInfo &TBI = BlockInfo[MBB->getNumber()];
- for (unsigned i = 0, e = TBI.LiveIns.size(); i != e; ++i) {
- LiveInReg LI = TBI.LiveIns[i];
+ for (LiveInReg &LI : TBI.LiveIns) {
if (TargetRegisterInfo::isVirtualRegister(LI.Reg)) {
// For virtual registers, the def latency is included.
unsigned &Height = Heights[MTM.MRI->getVRegDef(LI.Reg)];
@@ -1131,11 +1128,16 @@ computeInstrHeights(const MachineBasicBlock *MBB) {
MachineTraceMetrics::Trace
MachineTraceMetrics::Ensemble::getTrace(const MachineBasicBlock *MBB) {
- // FIXME: Check cache tags, recompute as needed.
- computeTrace(MBB);
- computeInstrDepths(MBB);
- computeInstrHeights(MBB);
- return Trace(*this, BlockInfo[MBB->getNumber()]);
+ TraceBlockInfo &TBI = BlockInfo[MBB->getNumber()];
+
+ if (!TBI.hasValidDepth() || !TBI.hasValidHeight())
+ computeTrace(MBB);
+ if (!TBI.HasValidInstrDepths)
+ computeInstrDepths(MBB);
+ if (!TBI.HasValidInstrHeights)
+ computeInstrHeights(MBB);
+
+ return Trace(*this, TBI);
}
unsigned
@@ -1204,8 +1206,7 @@ unsigned MachineTraceMetrics::Trace::getResourceLength(
unsigned ResourceIdx)
->unsigned {
unsigned Cycles = 0;
- for (unsigned I = 0; I != Instrs.size(); ++I) {
- const MCSchedClassDesc *SC = Instrs[I];
+ for (const MCSchedClassDesc *SC : Instrs) {
if (!SC->isValid())
continue;
for (TargetSchedModel::ProcResIter
@@ -1223,8 +1224,8 @@ unsigned MachineTraceMetrics::Trace::getResourceLength(
for (unsigned K = 0; K != PRDepths.size(); ++K) {
unsigned PRCycles = PRDepths[K] + PRHeights[K];
- for (unsigned I = 0; I != Extrablocks.size(); ++I)
- PRCycles += TE.MTM.getProcResourceCycles(Extrablocks[I]->getNumber())[K];
+ for (const MachineBasicBlock *MBB : Extrablocks)
+ PRCycles += TE.MTM.getProcResourceCycles(MBB->getNumber())[K];
PRCycles += extraCycles(ExtraInstrs, K);
PRCycles -= extraCycles(RemoveInstrs, K);
PRMax = std::max(PRMax, PRCycles);
@@ -1235,8 +1236,8 @@ unsigned MachineTraceMetrics::Trace::getResourceLength(
// Instrs: #instructions in current trace outside current block.
unsigned Instrs = TBI.InstrDepth + TBI.InstrHeight;
// Add instruction count from the extra blocks.
- for (unsigned i = 0, e = Extrablocks.size(); i != e; ++i)
- Instrs += TE.MTM.getResources(Extrablocks[i])->InstrCount;
+ for (const MachineBasicBlock *MBB : Extrablocks)
+ Instrs += TE.MTM.getResources(MBB)->InstrCount;
Instrs += ExtraInstrs.size();
Instrs -= RemoveInstrs.size();
if (unsigned IW = TE.MTM.SchedModel.getIssueWidth())
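
getTrace now checks the cached validity flags instead of recomputing unconditionally, so repeated queries over an unchanged CFG are cheap. A rough usage sketch, with Ensemble and MBB assumed to be in scope:

MachineTraceMetrics::Trace T1 = Ensemble->getTrace(MBB); // computes and caches
MachineTraceMetrics::Trace T2 = Ensemble->getTrace(MBB); // served from the cache
Ensemble->invalidate(MBB); // after editing MBB, recomputation happens as before
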
diff --git a/lib/CodeGen/Passes.cpp b/lib/CodeGen/Passes.cpp
index 210a7a1649cd..024d166a4987 100644
--- a/lib/CodeGen/Passes.cpp
+++ b/lib/CodeGen/Passes.cpp
@@ -214,10 +214,10 @@ TargetPassConfig::~TargetPassConfig() {
// Out of line constructor provides default values for pass options and
// registers all common codegen passes.
TargetPassConfig::TargetPassConfig(TargetMachine *tm, PassManagerBase &pm)
- : ImmutablePass(ID), PM(&pm), StartAfter(nullptr), StopAfter(nullptr),
- Started(true), Stopped(false), AddingMachinePasses(false), TM(tm),
- Impl(nullptr), Initialized(false), DisableVerify(false),
- EnableTailMerge(true), EnableShrinkWrap(false) {
+ : ImmutablePass(ID), PM(&pm), StartBefore(nullptr), StartAfter(nullptr),
+ StopAfter(nullptr), Started(true), Stopped(false),
+ AddingMachinePasses(false), TM(tm), Impl(nullptr), Initialized(false),
+ DisableVerify(false), EnableTailMerge(true), EnableShrinkWrap(false) {
Impl = new PassConfigImpl();
@@ -288,6 +288,8 @@ void TargetPassConfig::addPass(Pass *P, bool verifyAfter, bool printAfter) {
// and shouldn't reference it.
AnalysisID PassID = P->getPassID();
+ if (StartBefore == PassID)
+ Started = true;
if (Started && !Stopped) {
std::string Banner;
// Construct banner message before PM->add() as that may delete the pass.
@@ -422,7 +424,7 @@ void TargetPassConfig::addPassesToHandleExceptions() {
// removed from the parent invoke(s). This could happen when a landing
// pad is shared by multiple invokes and is also a target of a normal
// edge from elsewhere.
- addPass(createSjLjEHPreparePass(TM));
+ addPass(createSjLjEHPreparePass());
// FALLTHROUGH
case ExceptionHandling::DwarfCFI:
case ExceptionHandling::ARM:
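
TargetPassConfig now carries a StartBefore pass ID alongside StartAfter/StopAfter, so a pipeline can begin exactly at a given pass rather than only after one. A condensed sketch of the gating in addPass after this change (the trailing checks are assumed from the surrounding code):

AnalysisID PassID = P->getPassID();
if (StartBefore == PassID) // new: start running at this pass
  Started = true;
if (Started && !Stopped)
  PM->add(P); // otherwise the pass is skipped
if (StopAfter == PassID)
  Stopped = true;
if (StartAfter == PassID) // start running after this pass
  Started = true;
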
diff --git a/lib/CodeGen/PrologEpilogInserter.cpp b/lib/CodeGen/PrologEpilogInserter.cpp
index 76583f0de888..b2fdee6c8e4c 100644
--- a/lib/CodeGen/PrologEpilogInserter.cpp
+++ b/lib/CodeGen/PrologEpilogInserter.cpp
@@ -82,7 +82,8 @@ private:
void calculateSets(MachineFunction &Fn);
void calculateCallsInformation(MachineFunction &Fn);
- void calculateCalleeSavedRegisters(MachineFunction &Fn);
+ void assignCalleeSavedSpillSlots(MachineFunction &Fn,
+ const BitVector &SavedRegs);
void insertCSRSpillsAndRestores(MachineFunction &Fn);
void calculateFrameObjectOffsets(MachineFunction &Fn);
void replaceFrameIndices(MachineFunction &Fn);
@@ -92,7 +93,7 @@ private:
void insertPrologEpilogCode(MachineFunction &Fn);
// Convenience for recognizing return blocks.
- bool isReturnBlock(MachineBasicBlock *MBB);
+ bool isReturnBlock(const MachineBasicBlock *MBB) const;
};
} // namespace
@@ -127,7 +128,7 @@ void PEI::getAnalysisUsage(AnalysisUsage &AU) const {
MachineFunctionPass::getAnalysisUsage(AU);
}
-bool PEI::isReturnBlock(MachineBasicBlock* MBB) {
+bool PEI::isReturnBlock(const MachineBasicBlock* MBB) const {
return (MBB && !MBB->empty() && MBB->back().isReturn());
}
@@ -143,7 +144,12 @@ void PEI::calculateSets(MachineFunction &Fn) {
if (MFI->getSavePoint()) {
SaveBlock = MFI->getSavePoint();
assert(MFI->getRestorePoint() && "Both restore and save must be set");
- RestoreBlocks.push_back(MFI->getRestorePoint());
+ MachineBasicBlock *RestoreBlock = MFI->getRestorePoint();
+ // If RestoreBlock does not have any successor and is not a return block,
+ // then the end point is unreachable and we do not need to insert any
+ // epilogue.
+ if (!RestoreBlock->succ_empty() || isReturnBlock(RestoreBlock))
+ RestoreBlocks.push_back(RestoreBlock);
return;
}
@@ -178,13 +184,12 @@ bool PEI::runOnMachineFunction(MachineFunction &Fn) {
// instructions.
calculateCallsInformation(Fn);
- // Allow the target machine to make some adjustments to the function
- // e.g. UsedPhysRegs before calculateCalleeSavedRegisters.
- TFI->processFunctionBeforeCalleeSavedScan(Fn, RS);
+ // Determine which of the registers in the callee save list should be saved.
+ BitVector SavedRegs;
+ TFI->determineCalleeSaves(Fn, SavedRegs, RS);
- // Scan the function for modified callee saved registers and insert spill code
- // for any callee saved registers that are modified.
- calculateCalleeSavedRegisters(Fn);
+ // Assign frame slots to the callee-saved registers that need to be spilled.
+ assignCalleeSavedSpillSlots(Fn, SavedRegs);
// Determine placement of CSR spill/restore code:
// place all spills in the entry block, all restores in return blocks.
@@ -290,39 +295,27 @@ void PEI::calculateCallsInformation(MachineFunction &Fn) {
}
}
-
-/// calculateCalleeSavedRegisters - Scan the function for modified callee saved
-/// registers.
-void PEI::calculateCalleeSavedRegisters(MachineFunction &F) {
- const TargetRegisterInfo *RegInfo = F.getSubtarget().getRegisterInfo();
- const TargetFrameLowering *TFI = F.getSubtarget().getFrameLowering();
- MachineFrameInfo *MFI = F.getFrameInfo();
-
- // Get the callee saved register list...
- const MCPhysReg *CSRegs = RegInfo->getCalleeSavedRegs(&F);
-
+void PEI::assignCalleeSavedSpillSlots(MachineFunction &F,
+ const BitVector &SavedRegs) {
// These are used to keep track of the callee-save area. Initialize them.
MinCSFrameIndex = INT_MAX;
MaxCSFrameIndex = 0;
- // Early exit for targets which have no callee saved registers.
- if (!CSRegs || CSRegs[0] == 0)
+ if (SavedRegs.empty())
return;
- // In Naked functions we aren't going to save any registers.
- if (F.getFunction()->hasFnAttribute(Attribute::Naked))
- return;
+ const TargetRegisterInfo *RegInfo = F.getSubtarget().getRegisterInfo();
+ const MCPhysReg *CSRegs = RegInfo->getCalleeSavedRegs(&F);
std::vector<CalleeSavedInfo> CSI;
for (unsigned i = 0; CSRegs[i]; ++i) {
unsigned Reg = CSRegs[i];
- // Functions which call __builtin_unwind_init get all their registers saved.
- if (F.getRegInfo().isPhysRegUsed(Reg) || F.getMMI().callsUnwindInit()) {
- // If the reg is modified, save it!
+ if (SavedRegs.test(Reg))
CSI.push_back(CalleeSavedInfo(Reg));
- }
}
+ const TargetFrameLowering *TFI = F.getSubtarget().getFrameLowering();
+ MachineFrameInfo *MFI = F.getFrameInfo();
if (!TFI->assignCalleeSavedSpillSlots(F, RegInfo, CSI)) {
// If target doesn't implement this, use generic code.
@@ -1033,12 +1026,8 @@ PEI::scavengeFrameVirtualRegs(MachineFunction &Fn) {
// Replace this reference to the virtual register with the
// scratch register.
assert (ScratchReg && "Missing scratch register!");
- MachineRegisterInfo &MRI = Fn.getRegInfo();
Fn.getRegInfo().replaceRegWith(Reg, ScratchReg);
- // Make sure MRI now accounts this register as used.
- MRI.setPhysRegUsed(ScratchReg);
-
// Because this instruction was processed by the RS before this
// register was allocated, make sure that the RS now records the
// register as being used.
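
calculateCalleeSavedRegisters is split in two: the target hook TargetFrameLowering::determineCalleeSaves decides which registers need saving (the default roughly marks every callee-saved register the function modifies), and PEI only assigns the spill slots. A hedged sketch of a target override; MyTargetFrameLowering and MyTarget::LR are placeholders, not from this patch:

void MyTargetFrameLowering::determineCalleeSaves(MachineFunction &MF,
                                                 BitVector &SavedRegs,
                                                 RegScavenger *RS) const {
  // Start from the default: mark modified callee-saved registers.
  TargetFrameLowering::determineCalleeSaves(MF, SavedRegs, RS);
  // A target may then force extra saves, e.g. a link register.
  SavedRegs.set(MyTarget::LR);
}
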
diff --git a/lib/CodeGen/RegAllocFast.cpp b/lib/CodeGen/RegAllocFast.cpp
index fd3d4d78968b..660bb4f0e34d 100644
--- a/lib/CodeGen/RegAllocFast.cpp
+++ b/lib/CodeGen/RegAllocFast.cpp
@@ -986,10 +986,6 @@ void RAFast::AllocateBasicBlock() {
}
}
- for (UsedInInstrSet::iterator
- I = UsedInInstr.begin(), E = UsedInInstr.end(); I != E; ++I)
- MRI->setRegUnitUsed(*I);
-
// Track registers defined by instruction - early clobbers and tied uses at
// this point.
UsedInInstr.clear();
@@ -1050,10 +1046,6 @@ void RAFast::AllocateBasicBlock() {
killVirtReg(VirtDead[i]);
VirtDead.clear();
- for (UsedInInstrSet::iterator
- I = UsedInInstr.begin(), E = UsedInInstr.end(); I != E; ++I)
- MRI->setRegUnitUsed(*I);
-
if (CopyDst && CopyDst == CopySrc && CopyDstSub == CopySrcSub) {
DEBUG(dbgs() << "-- coalescing: " << *MI);
Coalesced.push_back(MI);
@@ -1103,12 +1095,6 @@ bool RAFast::runOnMachineFunction(MachineFunction &Fn) {
AllocateBasicBlock();
}
- // Add the clobber lists for all the instructions we skipped earlier.
- for (const MCInstrDesc *Desc : SkippedInstrs)
- if (const uint16_t *Defs = Desc->getImplicitDefs())
- while (*Defs)
- MRI->setPhysRegUsed(*Defs++);
-
// All machine operands and other references to virtual registers have been
// replaced. Remove the virtual registers.
MRI->clearVirtRegs();
diff --git a/lib/CodeGen/RegAllocGreedy.cpp b/lib/CodeGen/RegAllocGreedy.cpp
index 26f42c93323a..7ebcf7f54856 100644
--- a/lib/CodeGen/RegAllocGreedy.cpp
+++ b/lib/CodeGen/RegAllocGreedy.cpp
@@ -400,6 +400,8 @@ private:
typedef SmallVector<HintInfo, 4> HintsInfo;
BlockFrequency getBrokenHintFreq(const HintsInfo &, unsigned);
void collectHintInfo(unsigned, HintsInfo &);
+
+ bool isUnusedCalleeSavedReg(unsigned PhysReg) const;
};
} // end anonymous namespace
@@ -816,6 +818,16 @@ void RAGreedy::evictInterference(LiveInterval &VirtReg, unsigned PhysReg,
}
}
+/// Returns true if the given \p PhysReg is a callee saved register and has not
+/// been used for allocation yet.
+bool RAGreedy::isUnusedCalleeSavedReg(unsigned PhysReg) const {
+ unsigned CSR = RegClassInfo.getLastCalleeSavedAlias(PhysReg);
+ if (CSR == 0)
+ return false;
+
+ return !Matrix->isPhysRegUsed(PhysReg);
+}
+
/// tryEvict - Try to evict all interferences for a physreg.
/// @param VirtReg Currently unassigned virtual register.
/// @param Order Physregs to try.
@@ -861,13 +873,12 @@ unsigned RAGreedy::tryEvict(LiveInterval &VirtReg,
continue;
// The first use of a callee-saved register in a function has cost 1.
// Don't start using a CSR when the CostPerUseLimit is low.
- if (CostPerUseLimit == 1)
- if (unsigned CSR = RegClassInfo.getLastCalleeSavedAlias(PhysReg))
- if (!MRI->isPhysRegUsed(CSR)) {
- DEBUG(dbgs() << PrintReg(PhysReg, TRI) << " would clobber CSR "
- << PrintReg(CSR, TRI) << '\n');
- continue;
- }
+ if (CostPerUseLimit == 1 && isUnusedCalleeSavedReg(PhysReg)) {
+ DEBUG(dbgs() << PrintReg(PhysReg, TRI) << " would clobber CSR "
+ << PrintReg(RegClassInfo.getLastCalleeSavedAlias(PhysReg), TRI)
+ << '\n');
+ continue;
+ }
if (!canEvictInterference(VirtReg, PhysReg, false, BestCost))
continue;
@@ -1348,9 +1359,8 @@ unsigned RAGreedy::calculateRegionSplitCost(LiveInterval &VirtReg,
unsigned BestCand = NoCand;
Order.rewind();
while (unsigned PhysReg = Order.next()) {
- if (unsigned CSR = RegClassInfo.getLastCalleeSavedAlias(PhysReg))
- if (IgnoreCSR && !MRI->isPhysRegUsed(CSR))
- continue;
+ if (IgnoreCSR && isUnusedCalleeSavedReg(PhysReg))
+ continue;
// Discard bad candidates before we run out of interference cache cursors.
// This will only affect register classes with a lot of registers (>32).
@@ -2134,7 +2144,8 @@ unsigned RAGreedy::tryLastChanceRecoloring(LiveInterval &VirtReg,
unsigned ItVirtReg = (*It)->reg;
if (VRM->hasPhys(ItVirtReg))
Matrix->unassign(**It);
- Matrix->assign(**It, VirtRegToPhysReg[ItVirtReg]);
+ unsigned ItPhysReg = VirtRegToPhysReg[ItVirtReg];
+ Matrix->assign(**It, ItPhysReg);
}
}
@@ -2441,16 +2452,11 @@ unsigned RAGreedy::selectOrSplitImpl(LiveInterval &VirtReg,
// First try assigning a free register.
AllocationOrder Order(VirtReg.reg, *VRM, RegClassInfo);
if (unsigned PhysReg = tryAssign(VirtReg, Order, NewVRegs)) {
- // We check other options if we are using a CSR for the first time.
- bool CSRFirstUse = false;
- if (unsigned CSR = RegClassInfo.getLastCalleeSavedAlias(PhysReg))
- if (!MRI->isPhysRegUsed(CSR))
- CSRFirstUse = true;
-
// When NewVRegs is not empty, we may have made decisions such as evicting
// a virtual register; go with the earlier decisions and use the physical
// register.
- if (CSRCost.getFrequency() && CSRFirstUse && NewVRegs.empty()) {
+ if (CSRCost.getFrequency() && isUnusedCalleeSavedReg(PhysReg) &&
+ NewVRegs.empty()) {
unsigned CSRReg = tryAssignCSRFirstTime(VirtReg, Order, PhysReg,
CostPerUseLimit, NewVRegs);
if (CSRReg || !NewVRegs.empty())
diff --git a/lib/CodeGen/RegisterPressure.cpp b/lib/CodeGen/RegisterPressure.cpp
index 450a3051c6ff..c3786e552a13 100644
--- a/lib/CodeGen/RegisterPressure.cpp
+++ b/lib/CodeGen/RegisterPressure.cpp
@@ -77,6 +77,16 @@ void RegPressureTracker::dump() const {
P.dump(TRI);
}
+void PressureDiff::dump(const TargetRegisterInfo &TRI) const {
+ for (const PressureChange &Change : *this) {
+ if (!Change.isValid() || Change.getUnitInc() == 0)
+ continue;
+ dbgs() << " " << TRI.getRegPressureSetName(Change.getPSet())
+ << " " << Change.getUnitInc();
+ }
+ dbgs() << '\n';
+}
+
/// Increase the current pressure as impacted by these registers and bump
/// the high water mark if needed.
void RegPressureTracker::increaseRegPressure(ArrayRef<unsigned> RegUnits) {
@@ -787,6 +797,8 @@ getMaxUpwardPressureDelta(const MachineInstr *MI, PressureDiff *PDiff,
RegPressureDelta Delta2;
getUpwardPressureDelta(MI, *PDiff, Delta2, CriticalPSets, MaxPressureLimit);
if (Delta != Delta2) {
+ dbgs() << "PDiff: ";
+ PDiff->dump(*TRI);
dbgs() << "DELTA: " << *MI;
if (Delta.Excess.isValid())
dbgs() << "Excess1 " << TRI->getRegPressureSetName(Delta.Excess.getPSet())
diff --git a/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
index 6056d93ddc7a..52d620b1d540 100644
--- a/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
+++ b/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
@@ -443,8 +443,9 @@ namespace {
assert(LHSTy.isInteger() && "Shift amount is not an integer type!");
if (LHSTy.isVector())
return LHSTy;
- return LegalTypes ? TLI.getScalarShiftAmountTy(LHSTy)
- : TLI.getPointerTy();
+ auto &DL = DAG.getDataLayout();
+ return LegalTypes ? TLI.getScalarShiftAmountTy(DL, LHSTy)
+ : TLI.getPointerTy(DL);
}
/// This method returns true if we are running before type legalization or
@@ -456,7 +457,7 @@ namespace {
/// Convenience wrapper around TargetLowering::getSetCCResultType
EVT getSetCCResultType(EVT VT) const {
- return TLI.getSetCCResultType(*DAG.getContext(), VT);
+ return TLI.getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT);
}
};
}
@@ -3111,7 +3112,7 @@ SDValue DAGCombiner::visitAND(SDNode *N) {
// For big endian targets, we need to add an offset to the pointer
// to load the correct bytes. For little endian systems, we merely
// need to read fewer bytes from the same pointer.
- if (TLI.isBigEndian()) {
+ if (DAG.getDataLayout().isBigEndian()) {
unsigned LVTStoreBytes = LoadedVT.getStoreSize();
unsigned EVTStoreBytes = ExtVT.getStoreSize();
unsigned PtrOff = LVTStoreBytes - EVTStoreBytes;
@@ -6675,7 +6676,7 @@ SDValue DAGCombiner::ReduceLoadWidth(SDNode *N) {
// For big endian targets, we need to adjust the offset to the pointer to
// load the correct bytes.
- if (TLI.isBigEndian()) {
+ if (DAG.getDataLayout().isBigEndian()) {
unsigned LVTStoreBits = LN0->getMemoryVT().getStoreSizeInBits();
unsigned EVTStoreBits = ExtVT.getStoreSizeInBits();
ShAmt = LVTStoreBits - EVTStoreBits - ShAmt;
@@ -6873,7 +6874,7 @@ SDValue DAGCombiner::visitSIGN_EXTEND_VECTOR_INREG(SDNode *N) {
SDValue DAGCombiner::visitTRUNCATE(SDNode *N) {
SDValue N0 = N->getOperand(0);
EVT VT = N->getValueType(0);
- bool isLE = TLI.isLittleEndian();
+ bool isLE = DAG.getDataLayout().isLittleEndian();
// noop truncate
if (N0.getValueType() == N->getValueType(0))
@@ -6926,7 +6927,7 @@ SDValue DAGCombiner::visitTRUNCATE(SDNode *N) {
SDValue EltNo = N0->getOperand(1);
if (isa<ConstantSDNode>(EltNo) && isTypeLegal(NVT)) {
int Elt = cast<ConstantSDNode>(EltNo)->getZExtValue();
- EVT IndexTy = TLI.getVectorIdxTy();
+ EVT IndexTy = TLI.getVectorIdxTy(DAG.getDataLayout());
int Index = isLE ? (Elt*SizeRatio) : (Elt*SizeRatio + (SizeRatio-1));
SDValue V = DAG.getNode(ISD::BITCAST, SDLoc(N),
@@ -7093,8 +7094,8 @@ SDValue DAGCombiner::CombineConsecutiveLoads(SDNode *N, EVT VT) {
!LD2->isVolatile() &&
DAG.isConsecutiveLoad(LD2, LD1, LD1VT.getSizeInBits()/8, 1)) {
unsigned Align = LD1->getAlignment();
- unsigned NewAlign = TLI.getDataLayout()->
- getABITypeAlignment(VT.getTypeForEVT(*DAG.getContext()));
+ unsigned NewAlign = DAG.getDataLayout().getABITypeAlignment(
+ VT.getTypeForEVT(*DAG.getContext()));
if (NewAlign <= Align &&
(!LegalOperations || TLI.isOperationLegal(ISD::LOAD, VT)))
@@ -7150,13 +7151,13 @@ SDValue DAGCombiner::visitBITCAST(SDNode *N) {
// Do not change the width of a volatile load.
!cast<LoadSDNode>(N0)->isVolatile() &&
// Do not remove the cast if the types differ in endian layout.
- TLI.hasBigEndianPartOrdering(N0.getValueType()) ==
- TLI.hasBigEndianPartOrdering(VT) &&
+ TLI.hasBigEndianPartOrdering(N0.getValueType(), DAG.getDataLayout()) ==
+ TLI.hasBigEndianPartOrdering(VT, DAG.getDataLayout()) &&
(!LegalOperations || TLI.isOperationLegal(ISD::LOAD, VT)) &&
TLI.isLoadBitCastBeneficial(N0.getValueType(), VT)) {
LoadSDNode *LN0 = cast<LoadSDNode>(N0);
- unsigned Align = TLI.getDataLayout()->
- getABITypeAlignment(VT.getTypeForEVT(*DAG.getContext()));
+ unsigned Align = DAG.getDataLayout().getABITypeAlignment(
+ VT.getTypeForEVT(*DAG.getContext()));
unsigned OrigAlign = LN0->getAlignment();
if (Align <= OrigAlign) {
@@ -7368,7 +7369,7 @@ ConstantFoldBITCASTofBUILD_VECTOR(SDNode *BV, EVT DstEltVT) {
SmallVector<SDValue, 8> Ops;
for (unsigned i = 0, e = BV->getNumOperands(); i != e;
i += NumInputsPerOutput) {
- bool isLE = TLI.isLittleEndian();
+ bool isLE = DAG.getDataLayout().isLittleEndian();
APInt NewBits = APInt(DstBitSize, 0);
bool EltIsUndef = true;
for (unsigned j = 0; j != NumInputsPerOutput; ++j) {
@@ -7415,7 +7416,7 @@ ConstantFoldBITCASTofBUILD_VECTOR(SDNode *BV, EVT DstEltVT) {
}
// For big endian targets, swap the order of the pieces of each element.
- if (TLI.isBigEndian())
+ if (DAG.getDataLayout().isBigEndian())
std::reverse(Ops.end()-NumOutputsPerInput, Ops.end());
}
@@ -8373,6 +8374,9 @@ SDValue DAGCombiner::visitFDIV(SDNode *N) {
if (TLI.combineRepeatedFPDivisors(Users.size())) {
SDValue FPOne = DAG.getConstantFP(1.0, DL, VT);
+ // FIXME: This optimization requires some level of fast-math, so the
+ // created reciprocal node should at least have the 'allowReciprocal'
+ // fast-math-flag set.
SDValue Reciprocal = DAG.getNode(ISD::FDIV, DL, VT, FPOne, N1);
// Dividend / Divisor -> Dividend * Reciprocal
@@ -8381,10 +8385,14 @@ SDValue DAGCombiner::visitFDIV(SDNode *N) {
if (Dividend != FPOne) {
SDValue NewNode = DAG.getNode(ISD::FMUL, SDLoc(U), VT, Dividend,
Reciprocal);
- DAG.ReplaceAllUsesWith(U, NewNode.getNode());
+ CombineTo(U, NewNode);
+ } else if (U != Reciprocal.getNode()) {
+ // In the absence of fast-math-flags, this user node is always the
+ // same node as Reciprocal, but with FMF they may be different nodes.
+ CombineTo(U, Reciprocal);
}
}
- return SDValue();
+ return SDValue(N, 0); // N was replaced.
}
}
@@ -8406,30 +8414,29 @@ SDValue DAGCombiner::visitFREM(SDNode *N) {
}
SDValue DAGCombiner::visitFSQRT(SDNode *N) {
- if (DAG.getTarget().Options.UnsafeFPMath &&
- !TLI.isFsqrtCheap()) {
- // Compute this as X * (1/sqrt(X)) = X * (X ** -0.5)
- if (SDValue RV = BuildRsqrtEstimate(N->getOperand(0))) {
- EVT VT = RV.getValueType();
- SDLoc DL(N);
- RV = DAG.getNode(ISD::FMUL, DL, VT, N->getOperand(0), RV);
- AddToWorklist(RV.getNode());
+ if (!DAG.getTarget().Options.UnsafeFPMath || TLI.isFsqrtCheap())
+ return SDValue();
- // Unfortunately, RV is now NaN if the input was exactly 0.
- // Select out this case and force the answer to 0.
- SDValue Zero = DAG.getConstantFP(0.0, DL, VT);
- SDValue ZeroCmp =
- DAG.getSetCC(DL, TLI.getSetCCResultType(*DAG.getContext(), VT),
- N->getOperand(0), Zero, ISD::SETEQ);
- AddToWorklist(ZeroCmp.getNode());
- AddToWorklist(RV.getNode());
+ // Compute this as X * (1/sqrt(X)) = X * (X ** -0.5)
+ SDValue RV = BuildRsqrtEstimate(N->getOperand(0));
+ if (!RV)
+ return SDValue();
+
+ EVT VT = RV.getValueType();
+ SDLoc DL(N);
+ RV = DAG.getNode(ISD::FMUL, DL, VT, N->getOperand(0), RV);
+ AddToWorklist(RV.getNode());
- RV = DAG.getNode(VT.isVector() ? ISD::VSELECT : ISD::SELECT,
- DL, VT, ZeroCmp, Zero, RV);
- return RV;
- }
- }
- return SDValue();
+ // Unfortunately, RV is now NaN if the input was exactly 0.
+ // Select out this case and force the answer to 0.
+ SDValue Zero = DAG.getConstantFP(0.0, DL, VT);
+ EVT CCVT = TLI.getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT);
+ SDValue ZeroCmp = DAG.getSetCC(DL, CCVT, N->getOperand(0), Zero, ISD::SETEQ);
+ AddToWorklist(ZeroCmp.getNode());
+ AddToWorklist(RV.getNode());
+
+ return DAG.getNode(VT.isVector() ? ISD::VSELECT : ISD::SELECT, DL, VT,
+ ZeroCmp, Zero, RV);
}
SDValue DAGCombiner::visitFCOPYSIGN(SDNode *N) {
@@ -9144,7 +9151,8 @@ static bool canFoldInAddressingMode(SDNode *N, SDNode *Use,
} else
return false;
- return TLI.isLegalAddressingMode(AM, VT.getTypeForEVT(*DAG.getContext()), AS);
+ return TLI.isLegalAddressingMode(DAG.getDataLayout(), AM,
+ VT.getTypeForEVT(*DAG.getContext()), AS);
}
/// Try turning a load/store into a pre-indexed load/store when the base
@@ -9869,8 +9877,7 @@ struct LoadedSlice {
/// \pre DAG != nullptr.
uint64_t getOffsetFromBase() const {
assert(DAG && "Missing context.");
- bool IsBigEndian =
- DAG->getTargetLoweringInfo().getDataLayout()->isBigEndian();
+ bool IsBigEndian = DAG->getDataLayout().isBigEndian();
assert(!(Shift & 0x7) && "Shifts not aligned on Bytes are not supported.");
uint64_t Offset = Shift / 8;
unsigned TySizeInBytes = Origin->getValueSizeInBits(0) / 8;
@@ -9953,7 +9960,7 @@ struct LoadedSlice {
// Check if it will be merged with the load.
// 1. Check the alignment constraint.
- unsigned RequiredAlignment = TLI.getDataLayout()->getABITypeAlignment(
+ unsigned RequiredAlignment = DAG->getDataLayout().getABITypeAlignment(
ResVT.getTypeForEVT(*DAG->getContext()));
if (RequiredAlignment > getAlignment())
@@ -10321,7 +10328,7 @@ ShrinkLoadReplaceStoreWithStore(const std::pair<unsigned, unsigned> &MaskInfo,
unsigned StOffset;
unsigned NewAlign = St->getAlignment();
- if (DAG.getTargetLoweringInfo().isLittleEndian())
+ if (DAG.getDataLayout().isLittleEndian())
StOffset = ByteShift;
else
StOffset = IVal.getValueType().getStoreSize() - ByteShift - NumBytes;
@@ -10434,12 +10441,12 @@ SDValue DAGCombiner::ReduceLoadOpStoreWidth(SDNode *N) {
uint64_t PtrOff = ShAmt / 8;
// For big endian targets, we need to adjust the offset to the pointer to
// load the correct bytes.
- if (TLI.isBigEndian())
+ if (DAG.getDataLayout().isBigEndian())
PtrOff = (BitWidth + 7 - NewBW) / 8 - PtrOff;
unsigned NewAlign = MinAlign(LD->getAlignment(), PtrOff);
Type *NewVTTy = NewVT.getTypeForEVT(*DAG.getContext());
- if (NewAlign < TLI.getDataLayout()->getABITypeAlignment(NewVTTy))
+ if (NewAlign < DAG.getDataLayout().getABITypeAlignment(NewVTTy))
return SDValue();
SDValue NewPtr = DAG.getNode(ISD::ADD, SDLoc(LD),
@@ -10503,7 +10510,7 @@ SDValue DAGCombiner::TransformFPLoadStorePair(SDNode *N) {
unsigned LDAlign = LD->getAlignment();
unsigned STAlign = ST->getAlignment();
Type *IntVTTy = IntVT.getTypeForEVT(*DAG.getContext());
- unsigned ABIAlign = TLI.getDataLayout()->getABITypeAlignment(IntVTTy);
+ unsigned ABIAlign = DAG.getDataLayout().getABITypeAlignment(IntVTTy);
if (LDAlign < ABIAlign || STAlign < ABIAlign)
return SDValue();
@@ -10685,7 +10692,7 @@ bool DAGCombiner::MergeStoresOfConstantsOrVecElts(
// Construct a single integer constant which is made of the smaller
// constant inputs.
- bool IsLE = TLI.isLittleEndian();
+ bool IsLE = DAG.getDataLayout().isLittleEndian();
for (unsigned i = 0; i < NumElem ; ++i) {
unsigned Idx = IsLE ? (NumElem - 1 - i) : i;
StoreSDNode *St = cast<StoreSDNode>(StoreNodes[Idx].MemNode);
@@ -10743,7 +10750,7 @@ static bool allowableAlignment(const SelectionDAG &DAG,
return true;
Type *Ty = EVTTy.getTypeForEVT(*DAG.getContext());
- unsigned ABIAlignment = TLI.getDataLayout()->getPrefTypeAlignment(Ty);
+ unsigned ABIAlignment = DAG.getDataLayout().getPrefTypeAlignment(Ty);
return (Align >= ABIAlignment);
}
@@ -11205,8 +11212,8 @@ SDValue DAGCombiner::visitSTORE(SDNode *N) {
ST->isUnindexed()) {
unsigned OrigAlign = ST->getAlignment();
EVT SVT = Value.getOperand(0).getValueType();
- unsigned Align = TLI.getDataLayout()->
- getABITypeAlignment(SVT.getTypeForEVT(*DAG.getContext()));
+ unsigned Align = DAG.getDataLayout().getABITypeAlignment(
+ SVT.getTypeForEVT(*DAG.getContext()));
if (Align <= OrigAlign &&
((!LegalOperations && !ST->isVolatile()) ||
TLI.isOperationLegalOrCustom(ISD::STORE, SVT)))
@@ -11265,7 +11272,8 @@ SDValue DAGCombiner::visitSTORE(SDNode *N) {
uint64_t Val = CFP->getValueAPF().bitcastToAPInt().getZExtValue();
SDValue Lo = DAG.getConstant(Val & 0xFFFFFFFF, SDLoc(CFP), MVT::i32);
SDValue Hi = DAG.getConstant(Val >> 32, SDLoc(CFP), MVT::i32);
- if (TLI.isBigEndian()) std::swap(Lo, Hi);
+ if (DAG.getDataLayout().isBigEndian())
+ std::swap(Lo, Hi);
unsigned Alignment = ST->getAlignment();
bool isVolatile = ST->isVolatile();
@@ -11514,7 +11522,7 @@ SDValue DAGCombiner::ReplaceExtractVectorEltOfLoadWithNarrowedLoad(
EVT ResultVT = EVE->getValueType(0);
EVT VecEltVT = InVecVT.getVectorElementType();
unsigned Align = OriginalLoad->getAlignment();
- unsigned NewAlign = TLI.getDataLayout()->getABITypeAlignment(
+ unsigned NewAlign = DAG.getDataLayout().getABITypeAlignment(
VecEltVT.getTypeForEVT(*DAG.getContext()));
if (NewAlign > Align || !TLI.isOperationLegalOrCustom(ISD::LOAD, VecEltVT))
@@ -11648,7 +11656,7 @@ SDValue DAGCombiner::visitEXTRACT_VECTOR_ELT(SDNode *N) {
// scalar_to_vector here as well.
if (!LegalOperations) {
- EVT IndexTy = TLI.getVectorIdxTy();
+ EVT IndexTy = TLI.getVectorIdxTy(DAG.getDataLayout());
return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SDLoc(N), NVT, SVInVec,
DAG.getConstant(OrigElt, SDLoc(SVOp), IndexTy));
}
@@ -11825,7 +11833,7 @@ SDValue DAGCombiner::reduceBuildVecExtToExtBuildVec(SDNode *N) {
if (!ValidTypes)
return SDValue();
- bool isLE = TLI.isLittleEndian();
+ bool isLE = DAG.getDataLayout().isLittleEndian();
unsigned ElemRatio = OutScalarTy.getSizeInBits()/SourceType.getSizeInBits();
assert(ElemRatio > 1 && "Invalid element size ratio");
SDValue Filler = AllAnyExt ? DAG.getUNDEF(SourceType):
@@ -12079,10 +12087,13 @@ SDValue DAGCombiner::visitBUILD_VECTOR(SDNode *N) {
// Try to replace VecIn1 with two extract_subvectors
// No need to update the masks, they should still be correct.
- VecIn2 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, VecIn1,
- DAG.getConstant(VT.getVectorNumElements(), dl, TLI.getVectorIdxTy()));
- VecIn1 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, VecIn1,
- DAG.getConstant(0, dl, TLI.getVectorIdxTy()));
+ VecIn2 = DAG.getNode(
+ ISD::EXTRACT_SUBVECTOR, dl, VT, VecIn1,
+ DAG.getConstant(VT.getVectorNumElements(), dl,
+ TLI.getVectorIdxTy(DAG.getDataLayout())));
+ VecIn1 = DAG.getNode(
+ ISD::EXTRACT_SUBVECTOR, dl, VT, VecIn1,
+ DAG.getConstant(0, dl, TLI.getVectorIdxTy(DAG.getDataLayout())));
} else
return SDValue();
}
@@ -13354,12 +13365,13 @@ SDValue DAGCombiner::SimplifySelectCC(SDLoc DL, SDValue N0, SDValue N1,
const_cast<ConstantFP*>(TV->getConstantFPValue())
};
Type *FPTy = Elts[0]->getType();
- const DataLayout &TD = *TLI.getDataLayout();
+ const DataLayout &TD = DAG.getDataLayout();
// Create a ConstantArray of the two constants.
Constant *CA = ConstantArray::get(ArrayType::get(FPTy, 2), Elts);
- SDValue CPIdx = DAG.getConstantPool(CA, TLI.getPointerTy(),
- TD.getPrefTypeAlignment(FPTy));
+ SDValue CPIdx =
+ DAG.getConstantPool(CA, TLI.getPointerTy(DAG.getDataLayout()),
+ TD.getPrefTypeAlignment(FPTy));
unsigned Alignment = cast<ConstantPoolSDNode>(CPIdx)->getAlignment();
// Get the offsets to the 0 and 1 element of the array so that we can
@@ -13832,6 +13844,15 @@ bool DAGCombiner::isAlias(LSBaseSDNode *Op0, LSBaseSDNode *Op1) const {
// If they are both volatile then they cannot be reordered.
if (Op0->isVolatile() && Op1->isVolatile()) return true;
+ // If one operation reads from invariant memory and the other may store, they
+ // cannot alias. These should really be checking the equivalent of mayWrite,
+ // but it only matters for memory nodes other than load/store.
+ if (Op0->isInvariant() && Op1->writeMem())
+ return false;
+
+ if (Op1->isInvariant() && Op0->writeMem())
+ return false;
+
// Gather base node and offset information.
SDValue Base1, Base2;
int64_t Offset1, Offset2;
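
Most of the DAGCombiner churn is mechanical: endianness, pointer-type and alignment queries that used to be answered by the TargetLowering object now take the module DataLayout, fetched once from the SelectionDAG. A before/after sketch of the recurring pattern, assuming TLI and DAG are the combiner's members and Ty is a Type*:

// Before: routed through TargetLowering, DataLayout held as a pointer.
bool WasBigEndian = TLI.isBigEndian();
EVT OldPtrVT = TLI.getPointerTy();
unsigned OldAlign = TLI.getDataLayout()->getABITypeAlignment(Ty);
// After: the DataLayout comes from the DAG and is passed in where needed.
const DataLayout &DL = DAG.getDataLayout();
bool IsBigEndian = DL.isBigEndian();
EVT PtrVT = TLI.getPointerTy(DL);
unsigned Align = DL.getABITypeAlignment(Ty);
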
diff --git a/lib/CodeGen/SelectionDAG/FastISel.cpp b/lib/CodeGen/SelectionDAG/FastISel.cpp
index 5452b1721bb4..2b9ba2c1b534 100644
--- a/lib/CodeGen/SelectionDAG/FastISel.cpp
+++ b/lib/CodeGen/SelectionDAG/FastISel.cpp
@@ -166,7 +166,7 @@ bool FastISel::hasTrivialKill(const Value *V) {
}
unsigned FastISel::getRegForValue(const Value *V) {
- EVT RealVT = TLI.getValueType(V->getType(), /*AllowUnknown=*/true);
+ EVT RealVT = TLI.getValueType(DL, V->getType(), /*AllowUnknown=*/true);
// Don't handle non-simple values in FastISel.
if (!RealVT.isSimple())
return 0;
@@ -228,7 +228,7 @@ unsigned FastISel::materializeConstant(const Value *V, MVT VT) {
if (!Reg) {
// Try to emit the constant by using an integer constant with a cast.
const APFloat &Flt = CF->getValueAPF();
- EVT IntVT = TLI.getPointerTy();
+ EVT IntVT = TLI.getPointerTy(DL);
uint64_t x[2];
uint32_t IntBitWidth = IntVT.getSizeInBits();
@@ -321,7 +321,7 @@ std::pair<unsigned, bool> FastISel::getRegForGEPIndex(const Value *Idx) {
bool IdxNIsKill = hasTrivialKill(Idx);
// If the index is smaller or larger than intptr_t, truncate or extend it.
- MVT PtrVT = TLI.getPointerTy();
+ MVT PtrVT = TLI.getPointerTy(DL);
EVT IdxVT = EVT::getEVT(Idx->getType(), /*HandleUnknown=*/false);
if (IdxVT.bitsLT(PtrVT)) {
IdxN = fastEmit_r(IdxVT.getSimpleVT(), PtrVT, ISD::SIGN_EXTEND, IdxN,
@@ -493,7 +493,7 @@ bool FastISel::selectGetElementPtr(const User *I) {
// FIXME: What's a good SWAG number for MaxOffs?
uint64_t MaxOffs = 2048;
Type *Ty = I->getOperand(0)->getType();
- MVT VT = TLI.getPointerTy();
+ MVT VT = TLI.getPointerTy(DL);
for (GetElementPtrInst::const_op_iterator OI = I->op_begin() + 1,
E = I->op_end();
OI != E; ++OI) {
@@ -908,10 +908,10 @@ bool FastISel::lowerCallTo(CallLoweringInfo &CLI) {
// Handle the incoming return values from the call.
CLI.clearIns();
SmallVector<EVT, 4> RetTys;
- ComputeValueVTs(TLI, CLI.RetTy, RetTys);
+ ComputeValueVTs(TLI, DL, CLI.RetTy, RetTys);
SmallVector<ISD::OutputArg, 4> Outs;
- GetReturnInfo(CLI.RetTy, getReturnAttrs(CLI), Outs, TLI);
+ GetReturnInfo(CLI.RetTy, getReturnAttrs(CLI), Outs, TLI, DL);
bool CanLowerReturn = TLI.CanLowerReturn(
CLI.CallConv, *FuncInfo.MF, CLI.IsVarArg, Outs, CLI.RetTy->getContext());
@@ -976,7 +976,7 @@ bool FastISel::lowerCallTo(CallLoweringInfo &CLI) {
// not there, but there are cases it cannot get right.
unsigned FrameAlign = Arg.Alignment;
if (!FrameAlign)
- FrameAlign = TLI.getByValTypeAlignment(ElementTy);
+ FrameAlign = TLI.getByValTypeAlignment(ElementTy, DL);
Flags.setByValSize(FrameSize);
Flags.setByValAlign(FrameAlign);
}
@@ -1245,8 +1245,8 @@ bool FastISel::selectIntrinsicCall(const IntrinsicInst *II) {
}
bool FastISel::selectCast(const User *I, unsigned Opcode) {
- EVT SrcVT = TLI.getValueType(I->getOperand(0)->getType());
- EVT DstVT = TLI.getValueType(I->getType());
+ EVT SrcVT = TLI.getValueType(DL, I->getOperand(0)->getType());
+ EVT DstVT = TLI.getValueType(DL, I->getType());
if (SrcVT == MVT::Other || !SrcVT.isSimple() || DstVT == MVT::Other ||
!DstVT.isSimple())
@@ -1288,8 +1288,8 @@ bool FastISel::selectBitCast(const User *I) {
}
// Bitcasts of other values become reg-reg copies or BITCAST operators.
- EVT SrcEVT = TLI.getValueType(I->getOperand(0)->getType());
- EVT DstEVT = TLI.getValueType(I->getType());
+ EVT SrcEVT = TLI.getValueType(DL, I->getOperand(0)->getType());
+ EVT DstEVT = TLI.getValueType(DL, I->getType());
if (SrcEVT == MVT::Other || DstEVT == MVT::Other ||
!TLI.isTypeLegal(SrcEVT) || !TLI.isTypeLegal(DstEVT))
// Unhandled type. Halt "fast" selection and bail.
@@ -1413,7 +1413,7 @@ bool FastISel::selectFNeg(const User *I) {
bool OpRegIsKill = hasTrivialKill(I);
// If the target has ISD::FNEG, use it.
- EVT VT = TLI.getValueType(I->getType());
+ EVT VT = TLI.getValueType(DL, I->getType());
unsigned ResultReg = fastEmit_r(VT.getSimpleVT(), VT.getSimpleVT(), ISD::FNEG,
OpReg, OpRegIsKill);
if (ResultReg) {
@@ -1456,7 +1456,7 @@ bool FastISel::selectExtractValue(const User *U) {
// Make sure we only try to handle extracts with a legal result. But also
// allow i1 because it's easy.
- EVT RealVT = TLI.getValueType(EVI->getType(), /*AllowUnknown=*/true);
+ EVT RealVT = TLI.getValueType(DL, EVI->getType(), /*AllowUnknown=*/true);
if (!RealVT.isSimple())
return false;
MVT VT = RealVT.getSimpleVT();
@@ -1480,7 +1480,7 @@ bool FastISel::selectExtractValue(const User *U) {
unsigned VTIndex = ComputeLinearIndex(AggTy, EVI->getIndices());
SmallVector<EVT, 4> AggValueVTs;
- ComputeValueVTs(TLI, AggTy, AggValueVTs);
+ ComputeValueVTs(TLI, DL, AggTy, AggValueVTs);
for (unsigned i = 0; i < VTIndex; i++)
ResultReg += TLI.getNumRegisters(FuncInfo.Fn->getContext(), AggValueVTs[i]);
@@ -1582,8 +1582,8 @@ bool FastISel::selectOperator(const User *I, unsigned Opcode) {
case Instruction::IntToPtr: // Deliberate fall-through.
case Instruction::PtrToInt: {
- EVT SrcVT = TLI.getValueType(I->getOperand(0)->getType());
- EVT DstVT = TLI.getValueType(I->getType());
+ EVT SrcVT = TLI.getValueType(DL, I->getOperand(0)->getType());
+ EVT DstVT = TLI.getValueType(DL, I->getType());
if (DstVT.bitsGT(SrcVT))
return selectCast(I, ISD::ZERO_EXTEND);
if (DstVT.bitsLT(SrcVT))
@@ -1612,7 +1612,7 @@ FastISel::FastISel(FunctionLoweringInfo &FuncInfo,
bool SkipTargetIndependentISel)
: FuncInfo(FuncInfo), MF(FuncInfo.MF), MRI(FuncInfo.MF->getRegInfo()),
MFI(*FuncInfo.MF->getFrameInfo()), MCP(*FuncInfo.MF->getConstantPool()),
- TM(FuncInfo.MF->getTarget()), DL(*TM.getDataLayout()),
+ TM(FuncInfo.MF->getTarget()), DL(MF->getDataLayout()),
TII(*MF->getSubtarget().getInstrInfo()),
TLI(*MF->getSubtarget().getTargetLowering()),
TRI(*MF->getSubtarget().getRegisterInfo()), LibInfo(LibInfo),
@@ -2037,7 +2037,7 @@ bool FastISel::handlePHINodesInSuccessorBlocks(const BasicBlock *LLVMBB) {
// own moves. Second, this check is necessary because FastISel doesn't
// use CreateRegs to create registers, so it always creates
// exactly one register for each non-void instruction.
- EVT VT = TLI.getValueType(PN->getType(), /*AllowUnknown=*/true);
+ EVT VT = TLI.getValueType(DL, PN->getType(), /*AllowUnknown=*/true);
if (VT == MVT::Other || !TLI.isTypeLegal(VT)) {
// Handle integer promotions, though, because they're common and easy.
if (!(VT == MVT::i1 || VT == MVT::i8 || VT == MVT::i16)) {
diff --git a/lib/CodeGen/SelectionDAG/FunctionLoweringInfo.cpp b/lib/CodeGen/SelectionDAG/FunctionLoweringInfo.cpp
index ecaa2c972719..cc306cbf5ae4 100644
--- a/lib/CodeGen/SelectionDAG/FunctionLoweringInfo.cpp
+++ b/lib/CodeGen/SelectionDAG/FunctionLoweringInfo.cpp
@@ -90,7 +90,8 @@ void FunctionLoweringInfo::set(const Function &fn, MachineFunction &mf,
// Check whether the function can return without sret-demotion.
SmallVector<ISD::OutputArg, 4> Outs;
- GetReturnInfo(Fn->getReturnType(), Fn->getAttributes(), Outs, *TLI);
+ GetReturnInfo(Fn->getReturnType(), Fn->getAttributes(), Outs, *TLI,
+ mf.getDataLayout());
CanLowerReturn = TLI->CanLowerReturn(Fn->getCallingConv(), *MF,
Fn->isVarArg(), Outs, Fn->getContext());
@@ -106,9 +107,9 @@ void FunctionLoweringInfo::set(const Function &fn, MachineFunction &mf,
if (AI->isStaticAlloca()) {
const ConstantInt *CUI = cast<ConstantInt>(AI->getArraySize());
Type *Ty = AI->getAllocatedType();
- uint64_t TySize = TLI->getDataLayout()->getTypeAllocSize(Ty);
+ uint64_t TySize = MF->getDataLayout().getTypeAllocSize(Ty);
unsigned Align =
- std::max((unsigned)TLI->getDataLayout()->getPrefTypeAlignment(Ty),
+ std::max((unsigned)MF->getDataLayout().getPrefTypeAlignment(Ty),
AI->getAlignment());
TySize *= CUI->getZExtValue(); // Get total allocated size.
@@ -118,10 +119,10 @@ void FunctionLoweringInfo::set(const Function &fn, MachineFunction &mf,
MF->getFrameInfo()->CreateStackObject(TySize, Align, false, AI);
} else {
- unsigned Align = std::max(
- (unsigned)TLI->getDataLayout()->getPrefTypeAlignment(
- AI->getAllocatedType()),
- AI->getAlignment());
+ unsigned Align =
+ std::max((unsigned)MF->getDataLayout().getPrefTypeAlignment(
+ AI->getAllocatedType()),
+ AI->getAlignment());
unsigned StackAlign =
MF->getSubtarget().getFrameLowering()->getStackAlignment();
if (Align <= StackAlign)
@@ -138,7 +139,7 @@ void FunctionLoweringInfo::set(const Function &fn, MachineFunction &mf,
unsigned SP = TLI->getStackPointerRegisterToSaveRestore();
const TargetRegisterInfo *TRI = MF->getSubtarget().getRegisterInfo();
std::vector<TargetLowering::AsmOperandInfo> Ops =
- TLI->ParseConstraints(TRI, CS);
+ TLI->ParseConstraints(Fn->getParent()->getDataLayout(), TRI, CS);
for (size_t I = 0, E = Ops.size(); I != E; ++I) {
TargetLowering::AsmOperandInfo &Op = Ops[I];
if (Op.Type == InlineAsm::isClobber) {
@@ -148,7 +149,7 @@ void FunctionLoweringInfo::set(const Function &fn, MachineFunction &mf,
TLI->getRegForInlineAsmConstraint(TRI, Op.ConstraintCode,
Op.ConstraintVT);
if (PhysReg.first == SP)
- MF->getFrameInfo()->setHasInlineAsmWithSPAdjust(true);
+ MF->getFrameInfo()->setHasOpaqueSPAdjustment(true);
}
}
}
@@ -236,7 +237,7 @@ void FunctionLoweringInfo::set(const Function &fn, MachineFunction &mf,
assert(PHIReg && "PHI node does not have an assigned virtual register!");
SmallVector<EVT, 4> ValueVTs;
- ComputeValueVTs(*TLI, PN->getType(), ValueVTs);
+ ComputeValueVTs(*TLI, MF->getDataLayout(), PN->getType(), ValueVTs);
for (unsigned vti = 0, vte = ValueVTs.size(); vti != vte; ++vti) {
EVT VT = ValueVTs[vti];
unsigned NumRegisters = TLI->getNumRegisters(Fn->getContext(), VT);
@@ -366,7 +367,7 @@ unsigned FunctionLoweringInfo::CreateRegs(Type *Ty) {
const TargetLowering *TLI = MF->getSubtarget().getTargetLowering();
SmallVector<EVT, 4> ValueVTs;
- ComputeValueVTs(*TLI, Ty, ValueVTs);
+ ComputeValueVTs(*TLI, MF->getDataLayout(), Ty, ValueVTs);
unsigned FirstReg = 0;
for (unsigned Value = 0, e = ValueVTs.size(); Value != e; ++Value) {
@@ -413,7 +414,7 @@ void FunctionLoweringInfo::ComputePHILiveOutRegInfo(const PHINode *PN) {
return;
SmallVector<EVT, 1> ValueVTs;
- ComputeValueVTs(*TLI, Ty, ValueVTs);
+ ComputeValueVTs(*TLI, MF->getDataLayout(), Ty, ValueVTs);
assert(ValueVTs.size() == 1 &&
"PHIs with non-vector integer types should have a single VT.");
EVT IntVT = ValueVTs[0];
diff --git a/lib/CodeGen/SelectionDAG/InstrEmitter.cpp b/lib/CodeGen/SelectionDAG/InstrEmitter.cpp
index 42595cb010c2..5ec10308dc28 100644
--- a/lib/CodeGen/SelectionDAG/InstrEmitter.cpp
+++ b/lib/CodeGen/SelectionDAG/InstrEmitter.cpp
@@ -406,10 +406,10 @@ void InstrEmitter::AddOperand(MachineInstrBuilder &MIB,
Type *Type = CP->getType();
// MachineConstantPool wants an explicit alignment.
if (Align == 0) {
- Align = MF->getTarget().getDataLayout()->getPrefTypeAlignment(Type);
+ Align = MF->getDataLayout().getPrefTypeAlignment(Type);
if (Align == 0) {
// Alignment of vector types. FIXME!
- Align = MF->getTarget().getDataLayout()->getTypeAllocSize(Type);
+ Align = MF->getDataLayout().getTypeAllocSize(Type);
}
}
diff --git a/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp b/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp
index c0d7871bf08b..21ab07234c81 100644
--- a/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp
+++ b/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp
@@ -65,7 +65,7 @@ class SelectionDAGLegalize {
SmallSetVector<SDNode *, 16> *UpdatedNodes;
EVT getSetCCResultType(EVT VT) const {
- return TLI.getSetCCResultType(*DAG.getContext(), VT);
+ return TLI.getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT);
}
// Libcall insertion helpers.
@@ -269,7 +269,8 @@ SelectionDAGLegalize::ExpandConstantFP(ConstantFPSDNode *CFP, bool UseCP) {
}
}
- SDValue CPIdx = DAG.getConstantPool(LLVMC, TLI.getPointerTy());
+ SDValue CPIdx =
+ DAG.getConstantPool(LLVMC, TLI.getPointerTy(DAG.getDataLayout()));
unsigned Alignment = cast<ConstantPoolSDNode>(CPIdx)->getAlignment();
if (Extend) {
SDValue Result =
@@ -331,7 +332,8 @@ static void ExpandUnalignedStore(StoreSDNode *ST, SelectionDAG &DAG,
SDValue Store = DAG.getTruncStore(Chain, dl,
Val, StackPtr, MachinePointerInfo(),
StoredVT, false, false, 0);
- SDValue Increment = DAG.getConstant(RegBytes, dl, TLI.getPointerTy(AS));
+ SDValue Increment = DAG.getConstant(
+ RegBytes, dl, TLI.getPointerTy(DAG.getDataLayout(), AS));
SmallVector<SDValue, 8> Stores;
unsigned Offset = 0;
@@ -385,24 +387,27 @@ static void ExpandUnalignedStore(StoreSDNode *ST, SelectionDAG &DAG,
int IncrementSize = NumBits / 8;
// Divide the stored value in two parts.
- SDValue ShiftAmount = DAG.getConstant(NumBits, dl,
- TLI.getShiftAmountTy(Val.getValueType()));
+ SDValue ShiftAmount =
+ DAG.getConstant(NumBits, dl, TLI.getShiftAmountTy(Val.getValueType(),
+ DAG.getDataLayout()));
SDValue Lo = Val;
SDValue Hi = DAG.getNode(ISD::SRL, dl, VT, Val, ShiftAmount);
// Store the two parts
SDValue Store1, Store2;
- Store1 = DAG.getTruncStore(Chain, dl, TLI.isLittleEndian()?Lo:Hi, Ptr,
- ST->getPointerInfo(), NewStoredVT,
+ Store1 = DAG.getTruncStore(Chain, dl,
+ DAG.getDataLayout().isLittleEndian() ? Lo : Hi,
+ Ptr, ST->getPointerInfo(), NewStoredVT,
ST->isVolatile(), ST->isNonTemporal(), Alignment);
Ptr = DAG.getNode(ISD::ADD, dl, Ptr.getValueType(), Ptr,
- DAG.getConstant(IncrementSize, dl, TLI.getPointerTy(AS)));
+ DAG.getConstant(IncrementSize, dl,
+ TLI.getPointerTy(DAG.getDataLayout(), AS)));
Alignment = MinAlign(Alignment, IncrementSize);
- Store2 = DAG.getTruncStore(Chain, dl, TLI.isLittleEndian()?Hi:Lo, Ptr,
- ST->getPointerInfo().getWithOffset(IncrementSize),
- NewStoredVT, ST->isVolatile(), ST->isNonTemporal(),
- Alignment, ST->getAAInfo());
+ Store2 = DAG.getTruncStore(
+ Chain, dl, DAG.getDataLayout().isLittleEndian() ? Hi : Lo, Ptr,
+ ST->getPointerInfo().getWithOffset(IncrementSize), NewStoredVT,
+ ST->isVolatile(), ST->isNonTemporal(), Alignment, ST->getAAInfo());
SDValue Result =
DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Store1, Store2);
@@ -448,7 +453,8 @@ ExpandUnalignedLoad(LoadSDNode *LD, SelectionDAG &DAG,
// Make sure the stack slot is also aligned for the register type.
SDValue StackBase = DAG.CreateStackTemporary(LoadedVT, RegVT);
- SDValue Increment = DAG.getConstant(RegBytes, dl, TLI.getPointerTy());
+ SDValue Increment =
+ DAG.getConstant(RegBytes, dl, TLI.getPointerTy(DAG.getDataLayout()));
SmallVector<SDValue, 8> Stores;
SDValue StackPtr = StackBase;
unsigned Offset = 0;
@@ -522,7 +528,7 @@ ExpandUnalignedLoad(LoadSDNode *LD, SelectionDAG &DAG,
// Load the value in two parts
SDValue Lo, Hi;
- if (TLI.isLittleEndian()) {
+ if (DAG.getDataLayout().isLittleEndian()) {
Lo = DAG.getExtLoad(ISD::ZEXTLOAD, dl, VT, Chain, Ptr, LD->getPointerInfo(),
NewLoadedVT, LD->isVolatile(),
LD->isNonTemporal(), LD->isInvariant(), Alignment,
@@ -549,8 +555,9 @@ ExpandUnalignedLoad(LoadSDNode *LD, SelectionDAG &DAG,
}
// aggregate the two parts
- SDValue ShiftAmount = DAG.getConstant(NumBits, dl,
- TLI.getShiftAmountTy(Hi.getValueType()));
+ SDValue ShiftAmount =
+ DAG.getConstant(NumBits, dl, TLI.getShiftAmountTy(Hi.getValueType(),
+ DAG.getDataLayout()));
SDValue Result = DAG.getNode(ISD::SHL, dl, VT, Hi, ShiftAmount);
Result = DAG.getNode(ISD::OR, dl, VT, Result, Lo);
@@ -581,7 +588,7 @@ PerformInsertVectorEltInMemory(SDValue Vec, SDValue Val, SDValue Idx,
EVT VT = Tmp1.getValueType();
EVT EltVT = VT.getVectorElementType();
EVT IdxVT = Tmp3.getValueType();
- EVT PtrVT = TLI.getPointerTy();
+ EVT PtrVT = TLI.getPointerTy(DAG.getDataLayout());
SDValue StackPtr = DAG.CreateStackTemporary(VT);
int SPFI = cast<FrameIndexSDNode>(StackPtr.getNode())->getIndex();
@@ -677,7 +684,8 @@ SDValue SelectionDAGLegalize::OptimizeFloatStore(StoreSDNode* ST) {
const APInt &IntVal = CFP->getValueAPF().bitcastToAPInt();
SDValue Lo = DAG.getConstant(IntVal.trunc(32), dl, MVT::i32);
SDValue Hi = DAG.getConstant(IntVal.lshr(32).trunc(32), dl, MVT::i32);
- if (TLI.isBigEndian()) std::swap(Lo, Hi);
+ if (DAG.getDataLayout().isBigEndian())
+ std::swap(Lo, Hi);
Lo = DAG.getStore(Chain, dl, Lo, Ptr, ST->getPointerInfo(), isVolatile,
isNonTemporal, Alignment, AAInfo);
@@ -724,7 +732,7 @@ void SelectionDAGLegalize::LegalizeStoreOps(SDNode *Node) {
unsigned Align = ST->getAlignment();
if (!TLI.allowsMisalignedMemoryAccesses(ST->getMemoryVT(), AS, Align)) {
Type *Ty = ST->getMemoryVT().getTypeForEVT(*DAG.getContext());
- unsigned ABIAlignment= TLI.getDataLayout()->getABITypeAlignment(Ty);
+ unsigned ABIAlignment = DAG.getDataLayout().getABITypeAlignment(Ty);
if (Align < ABIAlignment)
ExpandUnalignedStore(cast<StoreSDNode>(Node), DAG, TLI, this);
}
@@ -756,6 +764,7 @@ void SelectionDAGLegalize::LegalizeStoreOps(SDNode *Node) {
EVT StVT = ST->getMemoryVT();
unsigned StWidth = StVT.getSizeInBits();
+ auto &DL = DAG.getDataLayout();
if (StWidth != StVT.getStoreSizeInBits()) {
// Promote to a byte-sized store with upper bits zero if not
@@ -782,7 +791,7 @@ void SelectionDAGLegalize::LegalizeStoreOps(SDNode *Node) {
SDValue Lo, Hi;
unsigned IncrementSize;
- if (TLI.isLittleEndian()) {
+ if (DL.isLittleEndian()) {
// TRUNCSTORE:i24 X -> TRUNCSTORE:i16 X, TRUNCSTORE@+2:i8 (srl X, 16)
// Store the bottom RoundWidth bits.
Lo = DAG.getTruncStore(Chain, dl, Value, Ptr, ST->getPointerInfo(),
@@ -795,9 +804,10 @@ void SelectionDAGLegalize::LegalizeStoreOps(SDNode *Node) {
Ptr = DAG.getNode(ISD::ADD, dl, Ptr.getValueType(), Ptr,
DAG.getConstant(IncrementSize, dl,
Ptr.getValueType()));
- Hi = DAG.getNode(ISD::SRL, dl, Value.getValueType(), Value,
- DAG.getConstant(RoundWidth, dl,
- TLI.getShiftAmountTy(Value.getValueType())));
+ Hi = DAG.getNode(
+ ISD::SRL, dl, Value.getValueType(), Value,
+ DAG.getConstant(RoundWidth, dl,
+ TLI.getShiftAmountTy(Value.getValueType(), DL)));
Hi = DAG.getTruncStore(Chain, dl, Hi, Ptr,
ST->getPointerInfo().getWithOffset(IncrementSize),
ExtraVT, isVolatile, isNonTemporal,
@@ -806,9 +816,10 @@ void SelectionDAGLegalize::LegalizeStoreOps(SDNode *Node) {
// Big endian - avoid unaligned stores.
// TRUNCSTORE:i24 X -> TRUNCSTORE:i16 (srl X, 8), TRUNCSTORE@+2:i8 X
// Store the top RoundWidth bits.
- Hi = DAG.getNode(ISD::SRL, dl, Value.getValueType(), Value,
- DAG.getConstant(ExtraWidth, dl,
- TLI.getShiftAmountTy(Value.getValueType())));
+ Hi = DAG.getNode(
+ ISD::SRL, dl, Value.getValueType(), Value,
+ DAG.getConstant(ExtraWidth, dl,
+ TLI.getShiftAmountTy(Value.getValueType(), DL)));
Hi = DAG.getTruncStore(Chain, dl, Hi, Ptr, ST->getPointerInfo(),
RoundVT, isVolatile, isNonTemporal, Alignment,
AAInfo);
@@ -838,7 +849,7 @@ void SelectionDAGLegalize::LegalizeStoreOps(SDNode *Node) {
// expand it.
if (!TLI.allowsMisalignedMemoryAccesses(ST->getMemoryVT(), AS, Align)) {
Type *Ty = ST->getMemoryVT().getTypeForEVT(*DAG.getContext());
- unsigned ABIAlignment= TLI.getDataLayout()->getABITypeAlignment(Ty);
+ unsigned ABIAlignment = DL.getABITypeAlignment(Ty);
if (Align < ABIAlignment)
ExpandUnalignedStore(cast<StoreSDNode>(Node), DAG, TLI, this);
}
@@ -890,8 +901,7 @@ void SelectionDAGLegalize::LegalizeLoadOps(SDNode *Node) {
// expand it.
if (!TLI.allowsMisalignedMemoryAccesses(LD->getMemoryVT(), AS, Align)) {
Type *Ty = LD->getMemoryVT().getTypeForEVT(*DAG.getContext());
- unsigned ABIAlignment =
- TLI.getDataLayout()->getABITypeAlignment(Ty);
+ unsigned ABIAlignment = DAG.getDataLayout().getABITypeAlignment(Ty);
if (Align < ABIAlignment){
ExpandUnalignedLoad(cast<LoadSDNode>(Node), DAG, TLI, RVal, RChain);
}
@@ -995,8 +1005,9 @@ void SelectionDAGLegalize::LegalizeLoadOps(SDNode *Node) {
EVT ExtraVT = EVT::getIntegerVT(*DAG.getContext(), ExtraWidth);
SDValue Lo, Hi, Ch;
unsigned IncrementSize;
+ auto &DL = DAG.getDataLayout();
- if (TLI.isLittleEndian()) {
+ if (DL.isLittleEndian()) {
// EXTLOAD:i24 -> ZEXTLOAD:i16 | (shl EXTLOAD@+2:i8, 16)
// Load the bottom RoundWidth bits.
Lo = DAG.getExtLoad(ISD::ZEXTLOAD, dl, Node->getValueType(0),
@@ -1020,9 +1031,10 @@ void SelectionDAGLegalize::LegalizeLoadOps(SDNode *Node) {
Hi.getValue(1));
// Move the top bits to the right place.
- Hi = DAG.getNode(ISD::SHL, dl, Hi.getValueType(), Hi,
- DAG.getConstant(RoundWidth, dl,
- TLI.getShiftAmountTy(Hi.getValueType())));
+ Hi = DAG.getNode(
+ ISD::SHL, dl, Hi.getValueType(), Hi,
+ DAG.getConstant(RoundWidth, dl,
+ TLI.getShiftAmountTy(Hi.getValueType(), DL)));
// Join the hi and lo parts.
Value = DAG.getNode(ISD::OR, dl, Node->getValueType(0), Lo, Hi);
@@ -1051,9 +1063,10 @@ void SelectionDAGLegalize::LegalizeLoadOps(SDNode *Node) {
Hi.getValue(1));
// Move the top bits to the right place.
- Hi = DAG.getNode(ISD::SHL, dl, Hi.getValueType(), Hi,
- DAG.getConstant(ExtraWidth, dl,
- TLI.getShiftAmountTy(Hi.getValueType())));
+ Hi = DAG.getNode(
+ ISD::SHL, dl, Hi.getValueType(), Hi,
+ DAG.getConstant(ExtraWidth, dl,
+ TLI.getShiftAmountTy(Hi.getValueType(), DL)));
// Join the hi and lo parts.
Value = DAG.getNode(ISD::OR, dl, Node->getValueType(0), Lo, Hi);
@@ -1086,7 +1099,7 @@ void SelectionDAGLegalize::LegalizeLoadOps(SDNode *Node) {
unsigned Align = LD->getAlignment();
if (!TLI.allowsMisalignedMemoryAccesses(MemVT, AS, Align)) {
Type *Ty = LD->getMemoryVT().getTypeForEVT(*DAG.getContext());
- unsigned ABIAlignment = TLI.getDataLayout()->getABITypeAlignment(Ty);
+ unsigned ABIAlignment = DAG.getDataLayout().getABITypeAlignment(Ty);
if (Align < ABIAlignment){
ExpandUnalignedLoad(cast<LoadSDNode>(Node), DAG, TLI, Value, Chain);
}
@@ -1439,7 +1452,7 @@ SDValue SelectionDAGLegalize::ExpandExtractFromVectorThroughStack(SDValue Op) {
Idx = DAG.getNode(ISD::MUL, dl, Idx.getValueType(), Idx,
DAG.getConstant(EltSize, SDLoc(Vec), Idx.getValueType()));
- Idx = DAG.getZExtOrTrunc(Idx, dl, TLI.getPointerTy());
+ Idx = DAG.getZExtOrTrunc(Idx, dl, TLI.getPointerTy(DAG.getDataLayout()));
StackPtr = DAG.getNode(ISD::ADD, dl, Idx.getValueType(), Idx, StackPtr);
SDValue NewLoad;
@@ -1491,7 +1504,7 @@ SDValue SelectionDAGLegalize::ExpandInsertToVectorThroughStack(SDValue Op) {
Idx = DAG.getNode(ISD::MUL, dl, Idx.getValueType(), Idx,
DAG.getConstant(EltSize, SDLoc(Vec), Idx.getValueType()));
- Idx = DAG.getZExtOrTrunc(Idx, dl, TLI.getPointerTy());
+ Idx = DAG.getZExtOrTrunc(Idx, dl, TLI.getPointerTy(DAG.getDataLayout()));
SDValue SubStackPtr = DAG.getNode(ISD::ADD, dl, Idx.getValueType(), Idx,
StackPtr);
@@ -1569,15 +1582,16 @@ SDValue SelectionDAGLegalize::ExpandFCOPYSIGN(SDNode* Node) {
// Convert to an integer with the same sign bit.
SignBit = DAG.getNode(ISD::BITCAST, dl, IVT, Tmp2);
} else {
+ auto &DL = DAG.getDataLayout();
// Store the float to memory, then load the sign part out as an integer.
- MVT LoadTy = TLI.getPointerTy();
+ MVT LoadTy = TLI.getPointerTy(DL);
// First create a temporary that is aligned for both the load and store.
SDValue StackPtr = DAG.CreateStackTemporary(FloatVT, LoadTy);
// Then store the float to it.
SDValue Ch =
DAG.getStore(DAG.getEntryNode(), dl, Tmp2, StackPtr, MachinePointerInfo(),
false, false, 0);
- if (TLI.isBigEndian()) {
+ if (DL.isBigEndian()) {
assert(FloatVT.isByteSized() && "Unsupported floating point type!");
// Load out a legal integer with the same sign bit as the float.
SignBit = DAG.getLoad(LoadTy, dl, Ch, StackPtr, MachinePointerInfo(),
@@ -1599,9 +1613,10 @@ SDValue SelectionDAGLegalize::ExpandFCOPYSIGN(SDNode* Node) {
(FloatVT.getSizeInBits() - 8 * ByteOffset);
assert(BitShift < LoadTy.getSizeInBits() && "Pointer advanced wrong?");
if (BitShift)
- SignBit = DAG.getNode(ISD::SHL, dl, LoadTy, SignBit,
- DAG.getConstant(BitShift, dl,
- TLI.getShiftAmountTy(SignBit.getValueType())));
+ SignBit = DAG.getNode(
+ ISD::SHL, dl, LoadTy, SignBit,
+ DAG.getConstant(BitShift, dl,
+ TLI.getShiftAmountTy(SignBit.getValueType(), DL)));
}
}
// Now get the sign bit proper, by seeing whether the value is negative.
@@ -1777,9 +1792,8 @@ SDValue SelectionDAGLegalize::EmitStackConvert(SDValue SrcOp,
EVT DestVT,
SDLoc dl) {
// Create the stack frame object.
- unsigned SrcAlign =
- TLI.getDataLayout()->getPrefTypeAlignment(SrcOp.getValueType().
- getTypeForEVT(*DAG.getContext()));
+ unsigned SrcAlign = DAG.getDataLayout().getPrefTypeAlignment(
+ SrcOp.getValueType().getTypeForEVT(*DAG.getContext()));
SDValue FIPtr = DAG.CreateStackTemporary(SlotVT, SrcAlign);
FrameIndexSDNode *StackPtrFI = cast<FrameIndexSDNode>(FIPtr);
@@ -1790,7 +1804,7 @@ SDValue SelectionDAGLegalize::EmitStackConvert(SDValue SrcOp,
unsigned SlotSize = SlotVT.getSizeInBits();
unsigned DestSize = DestVT.getSizeInBits();
Type *DestType = DestVT.getTypeForEVT(*DAG.getContext());
- unsigned DestAlign = TLI.getDataLayout()->getPrefTypeAlignment(DestType);
+ unsigned DestAlign = DAG.getDataLayout().getPrefTypeAlignment(DestType);
// Emit a store to the stack slot. Use a truncstore if the input value is
// later than DestVT.
@@ -1994,7 +2008,8 @@ SDValue SelectionDAGLegalize::ExpandBUILD_VECTOR(SDNode *Node) {
}
}
Constant *CP = ConstantVector::get(CV);
- SDValue CPIdx = DAG.getConstantPool(CP, TLI.getPointerTy());
+ SDValue CPIdx =
+ DAG.getConstantPool(CP, TLI.getPointerTy(DAG.getDataLayout()));
unsigned Alignment = cast<ConstantPoolSDNode>(CPIdx)->getAlignment();
return DAG.getLoad(VT, dl, DAG.getEntryNode(), CPIdx,
MachinePointerInfo::getConstantPool(),
@@ -2058,7 +2073,7 @@ SDValue SelectionDAGLegalize::ExpandLibCall(RTLIB::Libcall LC, SDNode *Node,
Args.push_back(Entry);
}
SDValue Callee = DAG.getExternalSymbol(TLI.getLibcallName(LC),
- TLI.getPointerTy());
+ TLI.getPointerTy(DAG.getDataLayout()));
Type *RetTy = Node->getValueType(0).getTypeForEVT(*DAG.getContext());
@@ -2106,7 +2121,7 @@ SDValue SelectionDAGLegalize::ExpandLibCall(RTLIB::Libcall LC, EVT RetVT,
Args.push_back(Entry);
}
SDValue Callee = DAG.getExternalSymbol(TLI.getLibcallName(LC),
- TLI.getPointerTy());
+ TLI.getPointerTy(DAG.getDataLayout()));
Type *RetTy = RetVT.getTypeForEVT(*DAG.getContext());
@@ -2140,7 +2155,7 @@ SelectionDAGLegalize::ExpandChainLibCall(RTLIB::Libcall LC,
Args.push_back(Entry);
}
SDValue Callee = DAG.getExternalSymbol(TLI.getLibcallName(LC),
- TLI.getPointerTy());
+ TLI.getPointerTy(DAG.getDataLayout()));
Type *RetTy = Node->getValueType(0).getTypeForEVT(*DAG.getContext());
@@ -2277,7 +2292,7 @@ SelectionDAGLegalize::ExpandDivRemLibCall(SDNode *Node,
Args.push_back(Entry);
SDValue Callee = DAG.getExternalSymbol(TLI.getLibcallName(LC),
- TLI.getPointerTy());
+ TLI.getPointerTy(DAG.getDataLayout()));
SDLoc dl(Node);
TargetLowering::CallLoweringInfo CLI(DAG);
@@ -2389,7 +2404,7 @@ SelectionDAGLegalize::ExpandSinCosLibCall(SDNode *Node,
Args.push_back(Entry);
SDValue Callee = DAG.getExternalSymbol(TLI.getLibcallName(LC),
- TLI.getPointerTy());
+ TLI.getPointerTy(DAG.getDataLayout()));
SDLoc dl(Node);
TargetLowering::CallLoweringInfo CLI(DAG);
@@ -2426,7 +2441,7 @@ SDValue SelectionDAGLegalize::ExpandLegalINT_TO_FP(bool isSigned,
SDValue Hi = StackSlot;
SDValue Lo = DAG.getNode(ISD::ADD, dl, StackSlot.getValueType(),
StackSlot, WordOff);
- if (TLI.isLittleEndian())
+ if (DAG.getDataLayout().isLittleEndian())
std::swap(Hi, Lo);
// if signed map to unsigned space
@@ -2509,8 +2524,8 @@ SDValue SelectionDAGLegalize::ExpandLegalINT_TO_FP(bool isSigned,
if (!isSigned) {
SDValue Fast = DAG.getNode(ISD::SINT_TO_FP, dl, MVT::f32, Op0);
- SDValue ShiftConst =
- DAG.getConstant(1, dl, TLI.getShiftAmountTy(Op0.getValueType()));
+ SDValue ShiftConst = DAG.getConstant(
+ 1, dl, TLI.getShiftAmountTy(Op0.getValueType(), DAG.getDataLayout()));
SDValue Shr = DAG.getNode(ISD::SRL, dl, MVT::i64, Op0, ShiftConst);
SDValue AndConst = DAG.getConstant(1, dl, MVT::i64);
SDValue And = DAG.getNode(ISD::AND, dl, MVT::i64, Op0, AndConst);
@@ -2545,7 +2560,7 @@ SDValue SelectionDAGLegalize::ExpandLegalINT_TO_FP(bool isSigned,
MVT::i64),
ISD::SETUGE);
SDValue Sel2 = DAG.getSelect(dl, MVT::i64, Ge, Sel, Op0);
- EVT SHVT = TLI.getShiftAmountTy(Sel2.getValueType());
+ EVT SHVT = TLI.getShiftAmountTy(Sel2.getValueType(), DAG.getDataLayout());
SDValue Sh = DAG.getNode(ISD::SRL, dl, MVT::i64, Sel2,
DAG.getConstant(32, dl, SHVT));
@@ -2584,11 +2599,13 @@ SDValue SelectionDAGLegalize::ExpandLegalINT_TO_FP(bool isSigned,
case MVT::i32: FF = 0x4F800000ULL; break; // 2^32 (as a float)
case MVT::i64: FF = 0x5F800000ULL; break; // 2^64 (as a float)
}
- if (TLI.isLittleEndian()) FF <<= 32;
+ if (DAG.getDataLayout().isLittleEndian())
+ FF <<= 32;
Constant *FudgeFactor = ConstantInt::get(
Type::getInt64Ty(*DAG.getContext()), FF);
- SDValue CPIdx = DAG.getConstantPool(FudgeFactor, TLI.getPointerTy());
+ SDValue CPIdx =
+ DAG.getConstantPool(FudgeFactor, TLI.getPointerTy(DAG.getDataLayout()));
unsigned Alignment = cast<ConstantPoolSDNode>(CPIdx)->getAlignment();
CPIdx = DAG.getNode(ISD::ADD, dl, CPIdx.getValueType(), CPIdx, CstOffset);
Alignment = std::min(Alignment, 4u);
@@ -2699,7 +2716,7 @@ SDValue SelectionDAGLegalize::PromoteLegalFP_TO_INT(SDValue LegalOp,
/// Open code the operations for BSWAP of the specified operation.
SDValue SelectionDAGLegalize::ExpandBSWAP(SDValue Op, SDLoc dl) {
EVT VT = Op.getValueType();
- EVT SHVT = TLI.getShiftAmountTy(VT);
+ EVT SHVT = TLI.getShiftAmountTy(VT, DAG.getDataLayout());
SDValue Tmp1, Tmp2, Tmp3, Tmp4, Tmp5, Tmp6, Tmp7, Tmp8;
switch (VT.getSimpleVT().SimpleTy) {
default: llvm_unreachable("Unhandled Expand type in BSWAP!");
@@ -2756,7 +2773,7 @@ SDValue SelectionDAGLegalize::ExpandBitCount(unsigned Opc, SDValue Op,
default: llvm_unreachable("Cannot expand this yet!");
case ISD::CTPOP: {
EVT VT = Op.getValueType();
- EVT ShVT = TLI.getShiftAmountTy(VT);
+ EVT ShVT = TLI.getShiftAmountTy(VT, DAG.getDataLayout());
unsigned Len = VT.getSizeInBits();
assert(VT.isInteger() && Len <= 128 && Len % 8 == 0 &&
@@ -2814,7 +2831,7 @@ SDValue SelectionDAGLegalize::ExpandBitCount(unsigned Opc, SDValue Op,
//
// Ref: "Hacker's Delight" by Henry Warren
EVT VT = Op.getValueType();
- EVT ShVT = TLI.getShiftAmountTy(VT);
+ EVT ShVT = TLI.getShiftAmountTy(VT, DAG.getDataLayout());
unsigned len = VT.getSizeInBits();
for (unsigned i = 0; (1U << i) <= (len / 2); ++i) {
SDValue Tmp3 = DAG.getConstant(1ULL << i, dl, ShVT);
@@ -2903,10 +2920,12 @@ void SelectionDAGLegalize::ExpandNode(SDNode *Node) {
TargetLowering::ArgListTy Args;
TargetLowering::CallLoweringInfo CLI(DAG);
- CLI.setDebugLoc(dl).setChain(Node->getOperand(0))
- .setCallee(CallingConv::C, Type::getVoidTy(*DAG.getContext()),
- DAG.getExternalSymbol("__sync_synchronize",
- TLI.getPointerTy()), std::move(Args), 0);
+ CLI.setDebugLoc(dl)
+ .setChain(Node->getOperand(0))
+ .setCallee(CallingConv::C, Type::getVoidTy(*DAG.getContext()),
+ DAG.getExternalSymbol("__sync_synchronize",
+ TLI.getPointerTy(DAG.getDataLayout())),
+ std::move(Args), 0);
std::pair<SDValue, SDValue> CallResult = TLI.LowerCallTo(CLI);
@@ -3002,10 +3021,12 @@ void SelectionDAGLegalize::ExpandNode(SDNode *Node) {
// If this operation is not supported, lower it to 'abort()' call
TargetLowering::ArgListTy Args;
TargetLowering::CallLoweringInfo CLI(DAG);
- CLI.setDebugLoc(dl).setChain(Node->getOperand(0))
- .setCallee(CallingConv::C, Type::getVoidTy(*DAG.getContext()),
- DAG.getExternalSymbol("abort", TLI.getPointerTy()),
- std::move(Args), 0);
+ CLI.setDebugLoc(dl)
+ .setChain(Node->getOperand(0))
+ .setCallee(CallingConv::C, Type::getVoidTy(*DAG.getContext()),
+ DAG.getExternalSymbol("abort",
+ TLI.getPointerTy(DAG.getDataLayout())),
+ std::move(Args), 0);
std::pair<SDValue, SDValue> CallResult = TLI.LowerCallTo(CLI);
Results.push_back(CallResult.second);
@@ -3028,7 +3049,7 @@ void SelectionDAGLegalize::ExpandNode(SDNode *Node) {
// SAR. However, it is doubtful that any exist.
EVT ExtraVT = cast<VTSDNode>(Node->getOperand(1))->getVT();
EVT VT = Node->getValueType(0);
- EVT ShiftAmountTy = TLI.getShiftAmountTy(VT);
+ EVT ShiftAmountTy = TLI.getShiftAmountTy(VT, DAG.getDataLayout());
if (VT.isVector())
ShiftAmountTy = VT;
unsigned BitsDiff = VT.getScalarType().getSizeInBits() -
@@ -3092,9 +3113,9 @@ void SelectionDAGLegalize::ExpandNode(SDNode *Node) {
Tmp2 = Node->getOperand(1);
unsigned Align = Node->getConstantOperandVal(3);
- SDValue VAListLoad = DAG.getLoad(TLI.getPointerTy(), dl, Tmp1, Tmp2,
- MachinePointerInfo(V),
- false, false, false, 0);
+ SDValue VAListLoad =
+ DAG.getLoad(TLI.getPointerTy(DAG.getDataLayout()), dl, Tmp1, Tmp2,
+ MachinePointerInfo(V), false, false, false, 0);
SDValue VAList = VAListLoad;
if (Align > TLI.getMinStackArgumentAlignment()) {
@@ -3111,10 +3132,9 @@ void SelectionDAGLegalize::ExpandNode(SDNode *Node) {
// Increment the pointer, VAList, to the next vaarg
Tmp3 = DAG.getNode(ISD::ADD, dl, VAList.getValueType(), VAList,
- DAG.getConstant(TLI.getDataLayout()->
- getTypeAllocSize(VT.getTypeForEVT(*DAG.getContext())),
- dl,
- VAList.getValueType()));
+ DAG.getConstant(DAG.getDataLayout().getTypeAllocSize(
+ VT.getTypeForEVT(*DAG.getContext())),
+ dl, VAList.getValueType()));
// Store the incremented VAList to the legalized pointer
Tmp3 = DAG.getStore(VAListLoad.getValue(1), dl, Tmp3, Tmp2,
MachinePointerInfo(V), false, false, 0);
@@ -3129,9 +3149,9 @@ void SelectionDAGLegalize::ExpandNode(SDNode *Node) {
// output, returning the chain.
const Value *VD = cast<SrcValueSDNode>(Node->getOperand(3))->getValue();
const Value *VS = cast<SrcValueSDNode>(Node->getOperand(4))->getValue();
- Tmp1 = DAG.getLoad(TLI.getPointerTy(), dl, Node->getOperand(0),
- Node->getOperand(2), MachinePointerInfo(VS),
- false, false, false, 0);
+ Tmp1 = DAG.getLoad(TLI.getPointerTy(DAG.getDataLayout()), dl,
+ Node->getOperand(0), Node->getOperand(2),
+ MachinePointerInfo(VS), false, false, false, 0);
Tmp1 = DAG.getStore(Tmp1.getValue(1), dl, Tmp1, Node->getOperand(1),
MachinePointerInfo(VD), false, false, 0);
Results.push_back(Tmp1);
@@ -3226,14 +3246,14 @@ void SelectionDAGLegalize::ExpandNode(SDNode *Node) {
}
unsigned Idx = Mask[i];
if (Idx < NumElems)
- Ops.push_back(DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, EltVT,
- Op0,
- DAG.getConstant(Idx, dl, TLI.getVectorIdxTy())));
+ Ops.push_back(DAG.getNode(
+ ISD::EXTRACT_VECTOR_ELT, dl, EltVT, Op0,
+ DAG.getConstant(Idx, dl, TLI.getVectorIdxTy(DAG.getDataLayout()))));
else
- Ops.push_back(DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, EltVT,
- Op1,
- DAG.getConstant(Idx - NumElems, dl,
- TLI.getVectorIdxTy())));
+ Ops.push_back(DAG.getNode(
+ ISD::EXTRACT_VECTOR_ELT, dl, EltVT, Op1,
+ DAG.getConstant(Idx - NumElems, dl,
+ TLI.getVectorIdxTy(DAG.getDataLayout()))));
}
Tmp1 = DAG.getNode(ISD::BUILD_VECTOR, dl, VT, Ops);
@@ -3247,8 +3267,10 @@ void SelectionDAGLegalize::ExpandNode(SDNode *Node) {
if (cast<ConstantSDNode>(Node->getOperand(1))->getZExtValue()) {
// 1 -> Hi
Tmp1 = DAG.getNode(ISD::SRL, dl, OpTy, Node->getOperand(0),
- DAG.getConstant(OpTy.getSizeInBits()/2, dl,
- TLI.getShiftAmountTy(Node->getOperand(0).getValueType())));
+ DAG.getConstant(OpTy.getSizeInBits() / 2, dl,
+ TLI.getShiftAmountTy(
+ Node->getOperand(0).getValueType(),
+ DAG.getDataLayout())));
Tmp1 = DAG.getNode(ISD::TRUNCATE, dl, Node->getValueType(0), Tmp1);
} else {
// 0 -> Lo
@@ -3646,8 +3668,9 @@ void SelectionDAGLegalize::ExpandNode(SDNode *Node) {
TLI.expandMUL(Node, Lo, Hi, HalfType, DAG)) {
Lo = DAG.getNode(ISD::ZERO_EXTEND, dl, VT, Lo);
Hi = DAG.getNode(ISD::ANY_EXTEND, dl, VT, Hi);
- SDValue Shift = DAG.getConstant(HalfType.getSizeInBits(), dl,
- TLI.getShiftAmountTy(HalfType));
+ SDValue Shift =
+ DAG.getConstant(HalfType.getSizeInBits(), dl,
+ TLI.getShiftAmountTy(HalfType, DAG.getDataLayout()));
Hi = DAG.getNode(ISD::SHL, dl, VT, Hi, Shift);
Results.push_back(DAG.getNode(ISD::OR, dl, VT, Lo, Hi));
break;
@@ -3759,12 +3782,14 @@ void SelectionDAGLegalize::ExpandNode(SDNode *Node) {
// The high part is obtained by SRA'ing all but one of the bits of low
// part.
unsigned LoSize = VT.getSizeInBits();
- SDValue HiLHS = DAG.getNode(ISD::SRA, dl, VT, RHS,
- DAG.getConstant(LoSize - 1, dl,
- TLI.getPointerTy()));
- SDValue HiRHS = DAG.getNode(ISD::SRA, dl, VT, LHS,
- DAG.getConstant(LoSize - 1, dl,
- TLI.getPointerTy()));
+ SDValue HiLHS =
+ DAG.getNode(ISD::SRA, dl, VT, RHS,
+ DAG.getConstant(LoSize - 1, dl,
+ TLI.getPointerTy(DAG.getDataLayout())));
+ SDValue HiRHS =
+ DAG.getNode(ISD::SRA, dl, VT, LHS,
+ DAG.getConstant(LoSize - 1, dl,
+ TLI.getPointerTy(DAG.getDataLayout())));
// Here we're passing the 2 arguments explicitly as 4 arguments that are
// pre-lowered to the correct types. This all depends upon WideVT not
@@ -3785,8 +3810,9 @@ void SelectionDAGLegalize::ExpandNode(SDNode *Node) {
}
if (isSigned) {
- Tmp1 = DAG.getConstant(VT.getSizeInBits() - 1, dl,
- TLI.getShiftAmountTy(BottomHalf.getValueType()));
+ Tmp1 = DAG.getConstant(
+ VT.getSizeInBits() - 1, dl,
+ TLI.getShiftAmountTy(BottomHalf.getValueType(), DAG.getDataLayout()));
Tmp1 = DAG.getNode(ISD::SRA, dl, VT, BottomHalf, Tmp1);
TopHalf = DAG.getSetCC(dl, getSetCCResultType(VT), TopHalf, Tmp1,
ISD::SETNE);
@@ -3802,9 +3828,10 @@ void SelectionDAGLegalize::ExpandNode(SDNode *Node) {
EVT PairTy = Node->getValueType(0);
Tmp1 = DAG.getNode(ISD::ZERO_EXTEND, dl, PairTy, Node->getOperand(0));
Tmp2 = DAG.getNode(ISD::ANY_EXTEND, dl, PairTy, Node->getOperand(1));
- Tmp2 = DAG.getNode(ISD::SHL, dl, PairTy, Tmp2,
- DAG.getConstant(PairTy.getSizeInBits()/2, dl,
- TLI.getShiftAmountTy(PairTy)));
+ Tmp2 = DAG.getNode(
+ ISD::SHL, dl, PairTy, Tmp2,
+ DAG.getConstant(PairTy.getSizeInBits() / 2, dl,
+ TLI.getShiftAmountTy(PairTy, DAG.getDataLayout())));
Results.push_back(DAG.getNode(ISD::OR, dl, PairTy, Tmp1, Tmp2));
break;
}
@@ -3828,9 +3855,9 @@ void SelectionDAGLegalize::ExpandNode(SDNode *Node) {
SDValue Table = Node->getOperand(1);
SDValue Index = Node->getOperand(2);
- EVT PTy = TLI.getPointerTy();
+ EVT PTy = TLI.getPointerTy(DAG.getDataLayout());
- const DataLayout &TD = *TLI.getDataLayout();
+ const DataLayout &TD = DAG.getDataLayout();
unsigned EntrySize =
DAG.getMachineFunction().getJumpTableInfo()->getEntrySize(TD);
@@ -3936,7 +3963,8 @@ void SelectionDAGLegalize::ExpandNode(SDNode *Node) {
assert(!TLI.isOperationExpand(ISD::SELECT, VT) &&
"Cannot expand ISD::SELECT_CC when ISD::SELECT also needs to be "
"expanded.");
- EVT CCVT = TLI.getSetCCResultType(*DAG.getContext(), CmpVT);
+ EVT CCVT =
+ TLI.getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), CmpVT);
SDValue Cond = DAG.getNode(ISD::SETCC, dl, CCVT, Tmp1, Tmp2, CC);
Results.push_back(DAG.getSelect(dl, VT, Cond, Tmp3, Tmp4));
break;
@@ -4036,14 +4064,12 @@ void SelectionDAGLegalize::ExpandNode(SDNode *Node) {
SmallVector<SDValue, 8> Scalars;
for (unsigned Idx = 0; Idx < NumElem; Idx++) {
- SDValue Ex = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl,
- VT.getScalarType(),
- Node->getOperand(0),
- DAG.getConstant(Idx, dl, TLI.getVectorIdxTy()));
- SDValue Sh = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl,
- VT.getScalarType(),
- Node->getOperand(1),
- DAG.getConstant(Idx, dl, TLI.getVectorIdxTy()));
+ SDValue Ex = DAG.getNode(
+ ISD::EXTRACT_VECTOR_ELT, dl, VT.getScalarType(), Node->getOperand(0),
+ DAG.getConstant(Idx, dl, TLI.getVectorIdxTy(DAG.getDataLayout())));
+ SDValue Sh = DAG.getNode(
+ ISD::EXTRACT_VECTOR_ELT, dl, VT.getScalarType(), Node->getOperand(1),
+ DAG.getConstant(Idx, dl, TLI.getVectorIdxTy(DAG.getDataLayout())));
Scalars.push_back(DAG.getNode(Node->getOpcode(), dl,
VT.getScalarType(), Ex, Sh));
}
@@ -4114,9 +4140,10 @@ void SelectionDAGLegalize::PromoteNode(SDNode *Node) {
unsigned DiffBits = NVT.getSizeInBits() - OVT.getSizeInBits();
Tmp1 = DAG.getNode(ISD::ZERO_EXTEND, dl, NVT, Node->getOperand(0));
Tmp1 = DAG.getNode(ISD::BSWAP, dl, NVT, Tmp1);
- Tmp1 = DAG.getNode(ISD::SRL, dl, NVT, Tmp1,
- DAG.getConstant(DiffBits, dl,
- TLI.getShiftAmountTy(NVT)));
+ Tmp1 = DAG.getNode(
+ ISD::SRL, dl, NVT, Tmp1,
+ DAG.getConstant(DiffBits, dl,
+ TLI.getShiftAmountTy(NVT, DAG.getDataLayout())));
Results.push_back(Tmp1);
break;
}
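The LegalizeDAG.cpp hunks above all make the same substitution: the DataLayout is obtained from the SelectionDAG rather than from TargetLowering, and the type queries (getPointerTy, getShiftAmountTy, getVectorIdxTy, getSetCCResultType) now receive it as an explicit argument. A minimal sketch of the resulting call pattern, assuming a hypothetical standalone helper that is not part of this patch:

#include "llvm/CodeGen/SelectionDAG.h"
#include "llvm/Target/TargetLowering.h"
using namespace llvm;

// Hypothetical helper (illustration only): shift Value right by Bits using the
// post-patch query pattern.
static SDValue getLogicalShiftRight(SelectionDAG &DAG, const TargetLowering &TLI,
                                    SDValue Value, unsigned Bits, SDLoc dl) {
  const DataLayout &DL = DAG.getDataLayout();   // previously *TLI.getDataLayout()
  EVT ShTy = TLI.getShiftAmountTy(Value.getValueType(), DL);
  return DAG.getNode(ISD::SRL, dl, Value.getValueType(), Value,
                     DAG.getConstant(Bits, dl, ShTy));
}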
diff --git a/lib/CodeGen/SelectionDAG/LegalizeFloatTypes.cpp b/lib/CodeGen/SelectionDAG/LegalizeFloatTypes.cpp
index 37fdf4453fd4..3c50a4155731 100644
--- a/lib/CodeGen/SelectionDAG/LegalizeFloatTypes.cpp
+++ b/lib/CodeGen/SelectionDAG/LegalizeFloatTypes.cpp
@@ -218,29 +218,35 @@ SDValue DAGTypeLegalizer::SoftenFloatRes_FCOPYSIGN(SDNode *N) {
unsigned RSize = RVT.getSizeInBits();
// First get the sign bit of second operand.
- SDValue SignBit = DAG.getNode(ISD::SHL, dl, RVT, DAG.getConstant(1, dl, RVT),
- DAG.getConstant(RSize - 1, dl,
- TLI.getShiftAmountTy(RVT)));
+ SDValue SignBit = DAG.getNode(
+ ISD::SHL, dl, RVT, DAG.getConstant(1, dl, RVT),
+ DAG.getConstant(RSize - 1, dl,
+ TLI.getShiftAmountTy(RVT, DAG.getDataLayout())));
SignBit = DAG.getNode(ISD::AND, dl, RVT, RHS, SignBit);
// Shift right or sign-extend it if the two operands have different types.
int SizeDiff = RVT.getSizeInBits() - LVT.getSizeInBits();
if (SizeDiff > 0) {
- SignBit = DAG.getNode(ISD::SRL, dl, RVT, SignBit,
- DAG.getConstant(SizeDiff, dl,
- TLI.getShiftAmountTy(SignBit.getValueType())));
+ SignBit =
+ DAG.getNode(ISD::SRL, dl, RVT, SignBit,
+ DAG.getConstant(SizeDiff, dl,
+ TLI.getShiftAmountTy(SignBit.getValueType(),
+ DAG.getDataLayout())));
SignBit = DAG.getNode(ISD::TRUNCATE, dl, LVT, SignBit);
} else if (SizeDiff < 0) {
SignBit = DAG.getNode(ISD::ANY_EXTEND, dl, LVT, SignBit);
- SignBit = DAG.getNode(ISD::SHL, dl, LVT, SignBit,
- DAG.getConstant(-SizeDiff, dl,
- TLI.getShiftAmountTy(SignBit.getValueType())));
+ SignBit =
+ DAG.getNode(ISD::SHL, dl, LVT, SignBit,
+ DAG.getConstant(-SizeDiff, dl,
+ TLI.getShiftAmountTy(SignBit.getValueType(),
+ DAG.getDataLayout())));
}
// Clear the sign bit of the first operand.
- SDValue Mask = DAG.getNode(ISD::SHL, dl, LVT, DAG.getConstant(1, dl, LVT),
- DAG.getConstant(LSize - 1, dl,
- TLI.getShiftAmountTy(LVT)));
+ SDValue Mask = DAG.getNode(
+ ISD::SHL, dl, LVT, DAG.getConstant(1, dl, LVT),
+ DAG.getConstant(LSize - 1, dl,
+ TLI.getShiftAmountTy(LVT, DAG.getDataLayout())));
Mask = DAG.getNode(ISD::SUB, dl, LVT, Mask, DAG.getConstant(1, dl, LVT));
LHS = DAG.getNode(ISD::AND, dl, LVT, LHS, Mask);
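For reference, the bit manipulation that SoftenFloatRes_FCOPYSIGN assembles out of SHL/SRL/AND nodes above can be written on plain integers; the concrete widths (64-bit RHS, 32-bit LHS) are assumed purely for illustration and are not taken from the patch:

#include <cstdint>

// Sign of the wider RHS is isolated, shifted down by the width difference, and
// combined with the LHS whose own sign bit has been cleared.
static uint32_t copysignBits(uint32_t LHS, uint64_t RHS) {
  uint64_t SignBit = RHS & (uint64_t(1) << 63); // sign bit of the second operand
  uint32_t Sign32  = uint32_t(SignBit >> 32);   // SizeDiff = 64 - 32, shift right
  uint32_t Mask    = (uint32_t(1) << 31) - 1;   // clears the sign bit of LHS
  return (LHS & Mask) | Sign32;
}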
diff --git a/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp b/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp
index f41202c4f8a4..9f060a09a0f3 100644
--- a/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp
+++ b/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp
@@ -282,7 +282,7 @@ SDValue DAGTypeLegalizer::PromoteIntRes_BITCAST(SDNode *N) {
Lo = BitConvertToInteger(Lo);
Hi = BitConvertToInteger(Hi);
- if (TLI.isBigEndian())
+ if (DAG.getDataLayout().isBigEndian())
std::swap(Lo, Hi);
InOp = DAG.getNode(ISD::ANY_EXTEND, dl,
@@ -310,8 +310,10 @@ SDValue DAGTypeLegalizer::PromoteIntRes_BSWAP(SDNode *N) {
SDLoc dl(N);
unsigned DiffBits = NVT.getScalarSizeInBits() - OVT.getScalarSizeInBits();
- return DAG.getNode(ISD::SRL, dl, NVT, DAG.getNode(ISD::BSWAP, dl, NVT, Op),
- DAG.getConstant(DiffBits, dl, TLI.getShiftAmountTy(NVT)));
+ return DAG.getNode(
+ ISD::SRL, dl, NVT, DAG.getNode(ISD::BSWAP, dl, NVT, Op),
+ DAG.getConstant(DiffBits, dl,
+ TLI.getShiftAmountTy(NVT, DAG.getDataLayout())));
}
SDValue DAGTypeLegalizer::PromoteIntRes_BUILD_PAIR(SDNode *N) {
@@ -799,7 +801,7 @@ SDValue DAGTypeLegalizer::PromoteIntRes_VAARG(SDNode *N) {
}
// Handle endianness of the load.
- if (TLI.isBigEndian())
+ if (DAG.getDataLayout().isBigEndian())
std::reverse(Parts.begin(), Parts.end());
// Assemble the parts in the promoted type.
@@ -809,8 +811,8 @@ SDValue DAGTypeLegalizer::PromoteIntRes_VAARG(SDNode *N) {
SDValue Part = DAG.getNode(ISD::ZERO_EXTEND, dl, NVT, Parts[i]);
// Shift it to the right position and "or" it in.
Part = DAG.getNode(ISD::SHL, dl, NVT, Part,
- DAG.getConstant(i*RegVT.getSizeInBits(), dl,
- TLI.getPointerTy()));
+ DAG.getConstant(i * RegVT.getSizeInBits(), dl,
+ TLI.getPointerTy(DAG.getDataLayout())));
Res = DAG.getNode(ISD::OR, dl, NVT, Res, Part);
}
@@ -1004,7 +1006,7 @@ SDValue DAGTypeLegalizer::PromoteIntOp_BUILD_PAIR(SDNode *N) {
Hi = DAG.getNode(ISD::SHL, dl, N->getValueType(0), Hi,
DAG.getConstant(OVT.getSizeInBits(), dl,
- TLI.getPointerTy()));
+ TLI.getPointerTy(DAG.getDataLayout())));
return DAG.getNode(ISD::OR, dl, N->getValueType(0), Lo, Hi);
}
@@ -1063,7 +1065,7 @@ SDValue DAGTypeLegalizer::PromoteIntOp_INSERT_VECTOR_ELT(SDNode *N,
// Promote the index.
SDValue Idx = DAG.getZExtOrTrunc(N->getOperand(2), SDLoc(N),
- TLI.getVectorIdxTy());
+ TLI.getVectorIdxTy(DAG.getDataLayout()));
return SDValue(DAG.UpdateNodeOperands(N, N->getOperand(0),
N->getOperand(1), Idx), 0);
}
@@ -1356,9 +1358,9 @@ std::pair <SDValue, SDValue> DAGTypeLegalizer::ExpandAtomic(SDNode *Node) {
return ExpandChainLibCall(LC, Node, false);
}
-/// ExpandShiftByConstant - N is a shift by a value that needs to be expanded,
+/// N is a shift by a value that needs to be expanded,
/// and the shift amount is a constant 'Amt'. Expand the operation.
-void DAGTypeLegalizer::ExpandShiftByConstant(SDNode *N, unsigned Amt,
+void DAGTypeLegalizer::ExpandShiftByConstant(SDNode *N, const APInt &Amt,
SDValue &Lo, SDValue &Hi) {
SDLoc DL(N);
// Expand the incoming operand to be shifted, so that we have its parts
@@ -1379,9 +1381,9 @@ void DAGTypeLegalizer::ExpandShiftByConstant(SDNode *N, unsigned Amt,
EVT ShTy = N->getOperand(1).getValueType();
if (N->getOpcode() == ISD::SHL) {
- if (Amt > VTBits) {
+ if (Amt.ugt(VTBits)) {
Lo = Hi = DAG.getConstant(0, DL, NVT);
- } else if (Amt > NVTBits) {
+ } else if (Amt.ugt(NVTBits)) {
Lo = DAG.getConstant(0, DL, NVT);
Hi = DAG.getNode(ISD::SHL, DL,
NVT, InL, DAG.getConstant(Amt - NVTBits, DL, ShTy));
@@ -1403,16 +1405,15 @@ void DAGTypeLegalizer::ExpandShiftByConstant(SDNode *N, unsigned Amt,
DAG.getNode(ISD::SHL, DL, NVT, InH,
DAG.getConstant(Amt, DL, ShTy)),
DAG.getNode(ISD::SRL, DL, NVT, InL,
- DAG.getConstant(NVTBits - Amt, DL, ShTy)));
+ DAG.getConstant(-Amt + NVTBits, DL, ShTy)));
}
return;
}
if (N->getOpcode() == ISD::SRL) {
- if (Amt > VTBits) {
- Lo = DAG.getConstant(0, DL, NVT);
- Hi = DAG.getConstant(0, DL, NVT);
- } else if (Amt > NVTBits) {
+ if (Amt.ugt(VTBits)) {
+ Lo = Hi = DAG.getConstant(0, DL, NVT);
+ } else if (Amt.ugt(NVTBits)) {
Lo = DAG.getNode(ISD::SRL, DL,
NVT, InH, DAG.getConstant(Amt - NVTBits, DL, ShTy));
Hi = DAG.getConstant(0, DL, NVT);
@@ -1424,19 +1425,19 @@ void DAGTypeLegalizer::ExpandShiftByConstant(SDNode *N, unsigned Amt,
DAG.getNode(ISD::SRL, DL, NVT, InL,
DAG.getConstant(Amt, DL, ShTy)),
DAG.getNode(ISD::SHL, DL, NVT, InH,
- DAG.getConstant(NVTBits - Amt, DL, ShTy)));
+ DAG.getConstant(-Amt + NVTBits, DL, ShTy)));
Hi = DAG.getNode(ISD::SRL, DL, NVT, InH, DAG.getConstant(Amt, DL, ShTy));
}
return;
}
assert(N->getOpcode() == ISD::SRA && "Unknown shift!");
- if (Amt > VTBits) {
+ if (Amt.ugt(VTBits)) {
Hi = Lo = DAG.getNode(ISD::SRA, DL, NVT, InH,
DAG.getConstant(NVTBits - 1, DL, ShTy));
- } else if (Amt > NVTBits) {
+ } else if (Amt.ugt(NVTBits)) {
Lo = DAG.getNode(ISD::SRA, DL, NVT, InH,
- DAG.getConstant(Amt-NVTBits, DL, ShTy));
+ DAG.getConstant(Amt - NVTBits, DL, ShTy));
Hi = DAG.getNode(ISD::SRA, DL, NVT, InH,
DAG.getConstant(NVTBits - 1, DL, ShTy));
} else if (Amt == NVTBits) {
@@ -1448,7 +1449,7 @@ void DAGTypeLegalizer::ExpandShiftByConstant(SDNode *N, unsigned Amt,
DAG.getNode(ISD::SRL, DL, NVT, InL,
DAG.getConstant(Amt, DL, ShTy)),
DAG.getNode(ISD::SHL, DL, NVT, InH,
- DAG.getConstant(NVTBits - Amt, DL, ShTy)));
+ DAG.getConstant(-Amt + NVTBits, DL, ShTy)));
Hi = DAG.getNode(ISD::SRA, DL, NVT, InH, DAG.getConstant(Amt, DL, ShTy));
}
}
@@ -1808,7 +1809,8 @@ void DAGTypeLegalizer::ExpandIntRes_AssertSext(SDNode *N,
Lo = DAG.getNode(ISD::AssertSext, dl, NVT, Lo, DAG.getValueType(EVT));
// The high part replicates the sign bit of Lo, make it explicit.
Hi = DAG.getNode(ISD::SRA, dl, NVT, Lo,
- DAG.getConstant(NVTBits - 1, dl, TLI.getPointerTy()));
+ DAG.getConstant(NVTBits - 1, dl,
+ TLI.getPointerTy(DAG.getDataLayout())));
}
}
@@ -1975,7 +1977,8 @@ void DAGTypeLegalizer::ExpandIntRes_LOAD(LoadSDNode *N,
// lo part.
unsigned LoSize = Lo.getValueType().getSizeInBits();
Hi = DAG.getNode(ISD::SRA, dl, NVT, Lo,
- DAG.getConstant(LoSize - 1, dl, TLI.getPointerTy()));
+ DAG.getConstant(LoSize - 1, dl,
+ TLI.getPointerTy(DAG.getDataLayout())));
} else if (ExtType == ISD::ZEXTLOAD) {
// The high part is just a zero.
Hi = DAG.getConstant(0, dl, NVT);
@@ -1984,7 +1987,7 @@ void DAGTypeLegalizer::ExpandIntRes_LOAD(LoadSDNode *N,
// The high part is undefined.
Hi = DAG.getUNDEF(NVT);
}
- } else if (TLI.isLittleEndian()) {
+ } else if (DAG.getDataLayout().isLittleEndian()) {
// Little-endian - low bits are at low addresses.
Lo = DAG.getLoad(NVT, dl, Ch, Ptr, N->getPointerInfo(),
isVolatile, isNonTemporal, isInvariant, Alignment,
@@ -2039,15 +2042,16 @@ void DAGTypeLegalizer::ExpandIntRes_LOAD(LoadSDNode *N,
if (ExcessBits < NVT.getSizeInBits()) {
// Transfer low bits from the bottom of Hi to the top of Lo.
- Lo = DAG.getNode(ISD::OR, dl, NVT, Lo,
- DAG.getNode(ISD::SHL, dl, NVT, Hi,
- DAG.getConstant(ExcessBits, dl,
- TLI.getPointerTy())));
+ Lo = DAG.getNode(
+ ISD::OR, dl, NVT, Lo,
+ DAG.getNode(ISD::SHL, dl, NVT, Hi,
+ DAG.getConstant(ExcessBits, dl,
+ TLI.getPointerTy(DAG.getDataLayout()))));
// Move high bits to the right position in Hi.
- Hi = DAG.getNode(ExtType == ISD::SEXTLOAD ? ISD::SRA : ISD::SRL, dl,
- NVT, Hi,
+ Hi = DAG.getNode(ExtType == ISD::SEXTLOAD ? ISD::SRA : ISD::SRL, dl, NVT,
+ Hi,
DAG.getConstant(NVT.getSizeInBits() - ExcessBits, dl,
- TLI.getPointerTy()));
+ TLI.getPointerTy(DAG.getDataLayout())));
}
}
@@ -2173,7 +2177,7 @@ void DAGTypeLegalizer::ExpandIntRes_Shift(SDNode *N,
// If we can emit an efficient shift operation, do so now. Check to see if
// the RHS is a constant.
if (ConstantSDNode *CN = dyn_cast<ConstantSDNode>(N->getOperand(1)))
- return ExpandShiftByConstant(N, CN->getZExtValue(), Lo, Hi);
+ return ExpandShiftByConstant(N, CN->getAPIntValue(), Lo, Hi);
// If we can determine that the high bit of the shift is zero or one, even if
// the low bits are variable, emit this shift in an optimized form.
@@ -2206,7 +2210,7 @@ void DAGTypeLegalizer::ExpandIntRes_Shift(SDNode *N,
// have an illegal type. Fix that first by casting the operand, otherwise
// the new SHL_PARTS operation would need further legalization.
SDValue ShiftOp = N->getOperand(1);
- EVT ShiftTy = TLI.getShiftAmountTy(VT);
+ EVT ShiftTy = TLI.getShiftAmountTy(VT, DAG.getDataLayout());
assert(ShiftTy.getScalarType().getSizeInBits() >=
Log2_32_Ceil(VT.getScalarType().getSizeInBits()) &&
"ShiftAmountTy is too small to cover the range of this type!");
@@ -2276,8 +2280,9 @@ void DAGTypeLegalizer::ExpandIntRes_SIGN_EXTEND(SDNode *N,
Lo = DAG.getNode(ISD::SIGN_EXTEND, dl, NVT, N->getOperand(0));
// The high part is obtained by SRA'ing all but one of the bits of low part.
unsigned LoSize = NVT.getSizeInBits();
- Hi = DAG.getNode(ISD::SRA, dl, NVT, Lo,
- DAG.getConstant(LoSize - 1, dl, TLI.getPointerTy()));
+ Hi = DAG.getNode(
+ ISD::SRA, dl, NVT, Lo,
+ DAG.getConstant(LoSize - 1, dl, TLI.getPointerTy(DAG.getDataLayout())));
} else {
// For example, extension of an i48 to an i64. The operand type necessarily
// promotes to the result type, so will end up being expanded too.
@@ -2312,7 +2317,7 @@ ExpandIntRes_SIGN_EXTEND_INREG(SDNode *N, SDValue &Lo, SDValue &Hi) {
// things like sextinreg V:i64 from i8.
Hi = DAG.getNode(ISD::SRA, dl, Hi.getValueType(), Lo,
DAG.getConstant(Hi.getValueType().getSizeInBits() - 1, dl,
- TLI.getPointerTy()));
+ TLI.getPointerTy(DAG.getDataLayout())));
} else {
// For example, extension of an i48 to an i64. Leave the low part alone,
// sext_inreg the high part.
@@ -2355,10 +2360,10 @@ void DAGTypeLegalizer::ExpandIntRes_TRUNCATE(SDNode *N,
EVT NVT = TLI.getTypeToTransformTo(*DAG.getContext(), N->getValueType(0));
SDLoc dl(N);
Lo = DAG.getNode(ISD::TRUNCATE, dl, NVT, N->getOperand(0));
- Hi = DAG.getNode(ISD::SRL, dl,
- N->getOperand(0).getValueType(), N->getOperand(0),
+ Hi = DAG.getNode(ISD::SRL, dl, N->getOperand(0).getValueType(),
+ N->getOperand(0),
DAG.getConstant(NVT.getSizeInBits(), dl,
- TLI.getPointerTy()));
+ TLI.getPointerTy(DAG.getDataLayout())));
Hi = DAG.getNode(ISD::TRUNCATE, dl, NVT, Hi);
}
@@ -2414,7 +2419,7 @@ void DAGTypeLegalizer::ExpandIntRes_XMULO(SDNode *N,
}
Type *RetTy = VT.getTypeForEVT(*DAG.getContext());
- EVT PtrVT = TLI.getPointerTy();
+ EVT PtrVT = TLI.getPointerTy(DAG.getDataLayout());
Type *PtrTy = PtrVT.getTypeForEVT(*DAG.getContext());
// Replace this with a libcall that will check overflow.
@@ -2845,7 +2850,7 @@ SDValue DAGTypeLegalizer::ExpandIntOp_STORE(StoreSDNode *N, unsigned OpNo) {
Alignment, AAInfo);
}
- if (TLI.isLittleEndian()) {
+ if (DAG.getDataLayout().isLittleEndian()) {
// Little-endian - low bits are at low addresses.
GetExpandedInteger(N->getValue(), Lo, Hi);
@@ -2882,11 +2887,12 @@ SDValue DAGTypeLegalizer::ExpandIntOp_STORE(StoreSDNode *N, unsigned OpNo) {
// Transfer high bits from the top of Lo to the bottom of Hi.
Hi = DAG.getNode(ISD::SHL, dl, NVT, Hi,
DAG.getConstant(NVT.getSizeInBits() - ExcessBits, dl,
- TLI.getPointerTy()));
- Hi = DAG.getNode(ISD::OR, dl, NVT, Hi,
- DAG.getNode(ISD::SRL, dl, NVT, Lo,
- DAG.getConstant(ExcessBits, dl,
- TLI.getPointerTy())));
+ TLI.getPointerTy(DAG.getDataLayout())));
+ Hi = DAG.getNode(
+ ISD::OR, dl, NVT, Hi,
+ DAG.getNode(ISD::SRL, dl, NVT, Lo,
+ DAG.getConstant(ExcessBits, dl,
+ TLI.getPointerTy(DAG.getDataLayout()))));
}
// Store both the high bits and maybe some of the low bits.
@@ -2956,14 +2962,15 @@ SDValue DAGTypeLegalizer::ExpandIntOp_UINT_TO_FP(SDNode *N) {
ISD::SETLT);
// Build a 64 bit pair (0, FF) in the constant pool, with FF in the lo bits.
- SDValue FudgePtr = DAG.getConstantPool(
- ConstantInt::get(*DAG.getContext(), FF.zext(64)),
- TLI.getPointerTy());
+ SDValue FudgePtr =
+ DAG.getConstantPool(ConstantInt::get(*DAG.getContext(), FF.zext(64)),
+ TLI.getPointerTy(DAG.getDataLayout()));
// Get a pointer to FF if the sign bit was set, or to 0 otherwise.
SDValue Zero = DAG.getIntPtrConstant(0, dl);
SDValue Four = DAG.getIntPtrConstant(4, dl);
- if (TLI.isBigEndian()) std::swap(Zero, Four);
+ if (DAG.getDataLayout().isBigEndian())
+ std::swap(Zero, Four);
SDValue Offset = DAG.getSelect(dl, Zero.getValueType(), SignSet,
Zero, Four);
unsigned Alignment = cast<ConstantPoolSDNode>(FudgePtr)->getAlignment();
@@ -3113,9 +3120,9 @@ SDValue DAGTypeLegalizer::PromoteIntRes_CONCAT_VECTORS(SDNode *N) {
for (unsigned i = 0; i < NumOperands; ++i) {
SDValue Op = N->getOperand(i);
for (unsigned j = 0; j < NumElem; ++j) {
- SDValue Ext = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl,
- InElemTy, Op, DAG.getConstant(j, dl,
- TLI.getVectorIdxTy()));
+ SDValue Ext = DAG.getNode(
+ ISD::EXTRACT_VECTOR_ELT, dl, InElemTy, Op,
+ DAG.getConstant(j, dl, TLI.getVectorIdxTy(DAG.getDataLayout())));
Ops[i * NumElem + j] = DAG.getNode(ISD::ANY_EXTEND, dl, OutElemTy, Ext);
}
}
@@ -3142,7 +3149,8 @@ SDValue DAGTypeLegalizer::PromoteIntRes_INSERT_VECTOR_ELT(SDNode *N) {
SDValue DAGTypeLegalizer::PromoteIntOp_EXTRACT_VECTOR_ELT(SDNode *N) {
SDLoc dl(N);
SDValue V0 = GetPromotedInteger(N->getOperand(0));
- SDValue V1 = DAG.getZExtOrTrunc(N->getOperand(1), dl, TLI.getVectorIdxTy());
+ SDValue V1 = DAG.getZExtOrTrunc(N->getOperand(1), dl,
+ TLI.getVectorIdxTy(DAG.getDataLayout()));
SDValue Ext = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl,
V0->getValueType(0).getScalarType(), V0, V1);
@@ -3179,8 +3187,9 @@ SDValue DAGTypeLegalizer::PromoteIntOp_CONCAT_VECTORS(SDNode *N) {
for (unsigned i=0; i<NumElem; ++i) {
// Extract element from incoming vector
- SDValue Ex = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, SclrTy,
- Incoming, DAG.getConstant(i, dl, TLI.getVectorIdxTy()));
+ SDValue Ex = DAG.getNode(
+ ISD::EXTRACT_VECTOR_ELT, dl, SclrTy, Incoming,
+ DAG.getConstant(i, dl, TLI.getVectorIdxTy(DAG.getDataLayout())));
SDValue Tr = DAG.getNode(ISD::TRUNCATE, dl, RetSclrTy, Ex);
NewOps.push_back(Tr);
}
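The ExpandShiftByConstant change in this file switches the constant shift amount from unsigned to const APInt & and compares with APInt::ugt, so an amount that does not fit in 64 bits no longer has to go through getZExtValue(); the NVTBits - Amt expressions become -Amt + NVTBits, which produces the same shift amount in APInt arithmetic. A small sketch of the guarding comparison, with the helper name being illustrative only:

#include "llvm/ADT/APInt.h"
using namespace llvm;

// Mirrors the Amt.ugt(VTBits) checks above: a constant shift amount strictly
// larger than the full width zeroes both expanded halves for SHL/SRL, and the
// APInt comparison stays safe for arbitrarily wide shift-amount constants.
static bool shiftClearsBothHalves(const APInt &Amt, unsigned VTBits) {
  return Amt.ugt(VTBits);
}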
diff --git a/lib/CodeGen/SelectionDAG/LegalizeTypes.cpp b/lib/CodeGen/SelectionDAG/LegalizeTypes.cpp
index 9c297698c1db..a7392fabf1e7 100644
--- a/lib/CodeGen/SelectionDAG/LegalizeTypes.cpp
+++ b/lib/CodeGen/SelectionDAG/LegalizeTypes.cpp
@@ -1006,7 +1006,7 @@ SDValue DAGTypeLegalizer::GetVectorElementPointer(SDValue VecPtr, EVT EltVT,
SDValue Index) {
SDLoc dl(Index);
// Make sure the index type is big enough to compute in.
- Index = DAG.getZExtOrTrunc(Index, dl, TLI.getPointerTy());
+ Index = DAG.getZExtOrTrunc(Index, dl, TLI.getPointerTy(DAG.getDataLayout()));
// Calculate the element offset and add it to the pointer.
unsigned EltSize = EltVT.getSizeInBits() / 8; // FIXME: should be ABI size.
@@ -1030,7 +1030,7 @@ SDValue DAGTypeLegalizer::JoinIntegers(SDValue Lo, SDValue Hi) {
Hi = DAG.getNode(ISD::ANY_EXTEND, dlHi, NVT, Hi);
Hi = DAG.getNode(ISD::SHL, dlHi, NVT, Hi,
DAG.getConstant(LVT.getSizeInBits(), dlHi,
- TLI.getPointerTy()));
+ TLI.getPointerTy(DAG.getDataLayout())));
return DAG.getNode(ISD::OR, dlHi, NVT, Lo, Hi);
}
@@ -1079,7 +1079,7 @@ DAGTypeLegalizer::ExpandChainLibCall(RTLIB::Libcall LC,
Args.push_back(Entry);
}
SDValue Callee = DAG.getExternalSymbol(TLI.getLibcallName(LC),
- TLI.getPointerTy());
+ TLI.getPointerTy(DAG.getDataLayout()));
Type *RetTy = Node->getValueType(0).getTypeForEVT(*DAG.getContext());
@@ -1117,7 +1117,7 @@ void DAGTypeLegalizer::SplitInteger(SDValue Op,
Lo = DAG.getNode(ISD::TRUNCATE, dl, LoVT, Op);
Hi = DAG.getNode(ISD::SRL, dl, Op.getValueType(), Op,
DAG.getConstant(LoVT.getSizeInBits(), dl,
- TLI.getPointerTy()));
+ TLI.getPointerTy(DAG.getDataLayout())));
Hi = DAG.getNode(ISD::TRUNCATE, dl, HiVT, Hi);
}
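JoinIntegers and SplitInteger, touched above only to thread the DataLayout into getPointerTy, compute a simple concatenation and split; written on plain integers, with the 64/32-bit widths assumed only for this example:

#include <cstdint>
#include <utility>

// Hi is shifted left by the low type's width and ORed in; the split is the
// reverse: truncate for Lo, shift right then truncate for Hi.
static uint64_t joinIntegers(uint32_t Lo, uint32_t Hi) {
  return uint64_t(Lo) | (uint64_t(Hi) << 32);
}
static std::pair<uint32_t, uint32_t> splitInteger(uint64_t Op) {
  return {uint32_t(Op), uint32_t(Op >> 32)};
}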
diff --git a/lib/CodeGen/SelectionDAG/LegalizeTypes.h b/lib/CodeGen/SelectionDAG/LegalizeTypes.h
index 2f2778982611..d1131a74cf17 100644
--- a/lib/CodeGen/SelectionDAG/LegalizeTypes.h
+++ b/lib/CodeGen/SelectionDAG/LegalizeTypes.h
@@ -73,7 +73,7 @@ private:
}
EVT getSetCCResultType(EVT VT) const {
- return TLI.getSetCCResultType(*DAG.getContext(), VT);
+ return TLI.getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT);
}
/// IgnoreNodeResults - Pretend all of this node's results are legal.
@@ -167,7 +167,7 @@ private:
SDValue GetVectorElementPointer(SDValue VecPtr, EVT EltVT, SDValue Index);
SDValue JoinIntegers(SDValue Lo, SDValue Hi);
SDValue LibCallify(RTLIB::Libcall LC, SDNode *N, bool isSigned);
-
+
std::pair<SDValue, SDValue> ExpandChainLibCall(RTLIB::Libcall LC,
SDNode *Node, bool isSigned);
std::pair<SDValue, SDValue> ExpandAtomic(SDNode *Node);
@@ -347,7 +347,7 @@ private:
void ExpandIntRes_ATOMIC_LOAD (SDNode *N, SDValue &Lo, SDValue &Hi);
- void ExpandShiftByConstant(SDNode *N, unsigned Amt,
+ void ExpandShiftByConstant(SDNode *N, const APInt &Amt,
SDValue &Lo, SDValue &Hi);
bool ExpandShiftWithKnownAmountBit(SDNode *N, SDValue &Lo, SDValue &Hi);
bool ExpandShiftWithUnknownAmountBit(SDNode *N, SDValue &Lo, SDValue &Hi);
diff --git a/lib/CodeGen/SelectionDAG/LegalizeTypesGeneric.cpp b/lib/CodeGen/SelectionDAG/LegalizeTypesGeneric.cpp
index 330c31ce0eec..14d8f7762086 100644
--- a/lib/CodeGen/SelectionDAG/LegalizeTypesGeneric.cpp
+++ b/lib/CodeGen/SelectionDAG/LegalizeTypesGeneric.cpp
@@ -60,18 +60,20 @@ void DAGTypeLegalizer::ExpandRes_BITCAST(SDNode *N, SDValue &Lo, SDValue &Hi) {
Hi = DAG.getNode(ISD::BITCAST, dl, NOutVT, Hi);
return;
case TargetLowering::TypeExpandInteger:
- case TargetLowering::TypeExpandFloat:
+ case TargetLowering::TypeExpandFloat: {
+ auto &DL = DAG.getDataLayout();
// Convert the expanded pieces of the input.
GetExpandedOp(InOp, Lo, Hi);
- if (TLI.hasBigEndianPartOrdering(InVT) !=
- TLI.hasBigEndianPartOrdering(OutVT))
+ if (TLI.hasBigEndianPartOrdering(InVT, DL) !=
+ TLI.hasBigEndianPartOrdering(OutVT, DL))
std::swap(Lo, Hi);
Lo = DAG.getNode(ISD::BITCAST, dl, NOutVT, Lo);
Hi = DAG.getNode(ISD::BITCAST, dl, NOutVT, Hi);
return;
+ }
case TargetLowering::TypeSplitVector:
GetSplitVector(InOp, Lo, Hi);
- if (TLI.hasBigEndianPartOrdering(OutVT))
+ if (TLI.hasBigEndianPartOrdering(OutVT, DAG.getDataLayout()))
std::swap(Lo, Hi);
Lo = DAG.getNode(ISD::BITCAST, dl, NOutVT, Lo);
Hi = DAG.getNode(ISD::BITCAST, dl, NOutVT, Hi);
@@ -88,7 +90,7 @@ void DAGTypeLegalizer::ExpandRes_BITCAST(SDNode *N, SDValue &Lo, SDValue &Hi) {
EVT LoVT, HiVT;
std::tie(LoVT, HiVT) = DAG.GetSplitDestVTs(InVT);
std::tie(Lo, Hi) = DAG.SplitVector(InOp, dl, LoVT, HiVT);
- if (TLI.hasBigEndianPartOrdering(OutVT))
+ if (TLI.hasBigEndianPartOrdering(OutVT, DAG.getDataLayout()))
std::swap(Lo, Hi);
Lo = DAG.getNode(ISD::BITCAST, dl, NOutVT, Lo);
Hi = DAG.getNode(ISD::BITCAST, dl, NOutVT, Hi);
@@ -119,9 +121,9 @@ void DAGTypeLegalizer::ExpandRes_BITCAST(SDNode *N, SDValue &Lo, SDValue &Hi) {
SmallVector<SDValue, 8> Vals;
for (unsigned i = 0; i < NumElems; ++i)
- Vals.push_back(DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, ElemVT,
- CastInOp, DAG.getConstant(i, dl,
- TLI.getVectorIdxTy())));
+ Vals.push_back(DAG.getNode(
+ ISD::EXTRACT_VECTOR_ELT, dl, ElemVT, CastInOp,
+ DAG.getConstant(i, dl, TLI.getVectorIdxTy(DAG.getDataLayout()))));
// Build Lo, Hi pair by pairing extracted elements if needed.
unsigned Slot = 0;
@@ -131,7 +133,7 @@ void DAGTypeLegalizer::ExpandRes_BITCAST(SDNode *N, SDValue &Lo, SDValue &Hi) {
SDValue LHS = Vals[Slot];
SDValue RHS = Vals[Slot + 1];
- if (TLI.isBigEndian())
+ if (DAG.getDataLayout().isBigEndian())
std::swap(LHS, RHS);
Vals.push_back(DAG.getNode(ISD::BUILD_PAIR, dl,
@@ -143,7 +145,7 @@ void DAGTypeLegalizer::ExpandRes_BITCAST(SDNode *N, SDValue &Lo, SDValue &Hi) {
Lo = Vals[Slot++];
Hi = Vals[Slot++];
- if (TLI.isBigEndian())
+ if (DAG.getDataLayout().isBigEndian())
std::swap(Lo, Hi);
return;
@@ -155,9 +157,8 @@ void DAGTypeLegalizer::ExpandRes_BITCAST(SDNode *N, SDValue &Lo, SDValue &Hi) {
// Create the stack frame object. Make sure it is aligned for both
// the source and expanded destination types.
- unsigned Alignment =
- TLI.getDataLayout()->getPrefTypeAlignment(NOutVT.
- getTypeForEVT(*DAG.getContext()));
+ unsigned Alignment = DAG.getDataLayout().getPrefTypeAlignment(
+ NOutVT.getTypeForEVT(*DAG.getContext()));
SDValue StackPtr = DAG.CreateStackTemporary(InVT, Alignment);
int SPFI = cast<FrameIndexSDNode>(StackPtr.getNode())->getIndex();
MachinePointerInfo PtrInfo = MachinePointerInfo::getFixedStack(SPFI);
@@ -182,7 +183,7 @@ void DAGTypeLegalizer::ExpandRes_BITCAST(SDNode *N, SDValue &Lo, SDValue &Hi) {
false, false, MinAlign(Alignment, IncrementSize));
// Handle endianness of the load.
- if (TLI.hasBigEndianPartOrdering(OutVT))
+ if (TLI.hasBigEndianPartOrdering(OutVT, DAG.getDataLayout()))
std::swap(Lo, Hi);
}
@@ -241,7 +242,7 @@ void DAGTypeLegalizer::ExpandRes_EXTRACT_VECTOR_ELT(SDNode *N, SDValue &Lo,
DAG.getConstant(1, dl, Idx.getValueType()));
Hi = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, NewVT, NewVec, Idx);
- if (TLI.isBigEndian())
+ if (DAG.getDataLayout().isBigEndian())
std::swap(Lo, Hi);
}
@@ -282,7 +283,7 @@ void DAGTypeLegalizer::ExpandRes_NormalLoad(SDNode *N, SDValue &Lo,
Hi.getValue(1));
// Handle endianness of the load.
- if (TLI.hasBigEndianPartOrdering(ValueVT))
+ if (TLI.hasBigEndianPartOrdering(ValueVT, DAG.getDataLayout()))
std::swap(Lo, Hi);
// Modified the chain - switch anything that used the old chain to use
@@ -302,7 +303,7 @@ void DAGTypeLegalizer::ExpandRes_VAARG(SDNode *N, SDValue &Lo, SDValue &Hi) {
Hi = DAG.getVAArg(NVT, dl, Lo.getValue(1), Ptr, N->getOperand(2), 0);
// Handle endianness of the load.
- if (TLI.hasBigEndianPartOrdering(OVT))
+ if (TLI.hasBigEndianPartOrdering(OVT, DAG.getDataLayout()))
std::swap(Lo, Hi);
// Modified the chain - switch anything that used the old chain to use
@@ -325,7 +326,7 @@ void DAGTypeLegalizer::IntegerToVector(SDValue Op, unsigned NumElements,
if (NumElements > 1) {
NumElements >>= 1;
SplitInteger(Op, Parts[0], Parts[1]);
- if (TLI.isBigEndian())
+ if (DAG.getDataLayout().isBigEndian())
std::swap(Parts[0], Parts[1]);
IntegerToVector(Parts[0], NumElements, Ops, EltVT);
IntegerToVector(Parts[1], NumElements, Ops, EltVT);
@@ -389,7 +390,7 @@ SDValue DAGTypeLegalizer::ExpandOp_BUILD_VECTOR(SDNode *N) {
for (unsigned i = 0; i < NumElts; ++i) {
SDValue Lo, Hi;
GetExpandedOp(N->getOperand(i), Lo, Hi);
- if (TLI.isBigEndian())
+ if (DAG.getDataLayout().isBigEndian())
std::swap(Lo, Hi);
NewElts.push_back(Lo);
NewElts.push_back(Hi);
@@ -431,7 +432,7 @@ SDValue DAGTypeLegalizer::ExpandOp_INSERT_VECTOR_ELT(SDNode *N) {
SDValue Lo, Hi;
GetExpandedOp(Val, Lo, Hi);
- if (TLI.isBigEndian())
+ if (DAG.getDataLayout().isBigEndian())
std::swap(Lo, Hi);
SDValue Idx = N->getOperand(2);
@@ -481,7 +482,7 @@ SDValue DAGTypeLegalizer::ExpandOp_NormalStore(SDNode *N, unsigned OpNo) {
SDValue Lo, Hi;
GetExpandedOp(St->getValue(), Lo, Hi);
- if (TLI.hasBigEndianPartOrdering(ValueVT))
+ if (TLI.hasBigEndianPartOrdering(ValueVT, DAG.getDataLayout()))
std::swap(Lo, Hi);
Lo = DAG.getStore(Chain, dl, Lo, Ptr, St->getPointerInfo(),
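The endianness queries in this file now come from the DataLayout, either directly or through hasBigEndianPartOrdering, which also takes it. A sketch of the Lo/Hi part ordering for the plain-endianness case, ignoring any target override of part ordering; the helper is hypothetical:

#include "llvm/IR/DataLayout.h"
#include <cstdint>
#include <utility>
using namespace llvm;

// Split a 64-bit value into two 32-bit halves and return them in memory order.
static std::pair<uint32_t, uint32_t> splitInMemoryOrder(uint64_t V,
                                                        const DataLayout &DL) {
  uint32_t Lo = uint32_t(V);        // least-significant half
  uint32_t Hi = uint32_t(V >> 32);  // most-significant half
  if (DL.isBigEndian())
    std::swap(Lo, Hi);              // big-endian: high half is stored first
  return {Lo, Hi};
}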
diff --git a/lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp b/lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp
index ee844a8a4c58..83d4ad5ea1f4 100644
--- a/lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp
+++ b/lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp
@@ -503,7 +503,7 @@ SDValue VectorLegalizer::ExpandLoad(SDValue Op) {
// Instead, we load all significant words, mask bits off, and concatenate
// them to form each element. Finally, they are extended to destination
// scalar type to build the destination vector.
- EVT WideVT = TLI.getPointerTy();
+ EVT WideVT = TLI.getPointerTy(DAG.getDataLayout());
assert(WideVT.isRound() &&
"Could not handle the sophisticated case when the widest integer is"
@@ -563,7 +563,8 @@ SDValue VectorLegalizer::ExpandLoad(SDValue Op) {
SDValue Lo, Hi, ShAmt;
if (BitOffset < WideBits) {
- ShAmt = DAG.getConstant(BitOffset, dl, TLI.getShiftAmountTy(WideVT));
+ ShAmt = DAG.getConstant(
+ BitOffset, dl, TLI.getShiftAmountTy(WideVT, DAG.getDataLayout()));
Lo = DAG.getNode(ISD::SRL, dl, WideVT, LoadVals[WideIdx], ShAmt);
Lo = DAG.getNode(ISD::AND, dl, WideVT, Lo, SrcEltBitMask);
}
@@ -573,8 +574,9 @@ SDValue VectorLegalizer::ExpandLoad(SDValue Op) {
WideIdx++;
BitOffset -= WideBits;
if (BitOffset > 0) {
- ShAmt = DAG.getConstant(SrcEltBits - BitOffset, dl,
- TLI.getShiftAmountTy(WideVT));
+ ShAmt = DAG.getConstant(
+ SrcEltBits - BitOffset, dl,
+ TLI.getShiftAmountTy(WideVT, DAG.getDataLayout()));
Hi = DAG.getNode(ISD::SHL, dl, WideVT, LoadVals[WideIdx], ShAmt);
Hi = DAG.getNode(ISD::AND, dl, WideVT, Hi, SrcEltBitMask);
}
@@ -592,8 +594,9 @@ SDValue VectorLegalizer::ExpandLoad(SDValue Op) {
Lo = DAG.getZExtOrTrunc(Lo, dl, DstEltVT);
break;
case ISD::SEXTLOAD:
- ShAmt = DAG.getConstant(WideBits - SrcEltBits, dl,
- TLI.getShiftAmountTy(WideVT));
+ ShAmt =
+ DAG.getConstant(WideBits - SrcEltBits, dl,
+ TLI.getShiftAmountTy(WideVT, DAG.getDataLayout()));
Lo = DAG.getNode(ISD::SHL, dl, WideVT, Lo, ShAmt);
Lo = DAG.getNode(ISD::SRA, dl, WideVT, Lo, ShAmt);
Lo = DAG.getSExtOrTrunc(Lo, dl, DstEltVT);
@@ -663,8 +666,9 @@ SDValue VectorLegalizer::ExpandStore(SDValue Op) {
// and save them into memory individually.
SmallVector<SDValue, 8> Stores;
for (unsigned Idx = 0; Idx < NumElem; Idx++) {
- SDValue Ex = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl,
- RegSclVT, Value, DAG.getConstant(Idx, dl, TLI.getVectorIdxTy()));
+ SDValue Ex = DAG.getNode(
+ ISD::EXTRACT_VECTOR_ELT, dl, RegSclVT, Value,
+ DAG.getConstant(Idx, dl, TLI.getVectorIdxTy(DAG.getDataLayout())));
// This scalar TruncStore may be illegal, but we legalize it later.
SDValue Store = DAG.getTruncStore(Chain, dl, Ex, BasePTR,
@@ -803,7 +807,7 @@ SDValue VectorLegalizer::ExpandANY_EXTEND_VECTOR_INREG(SDValue Op) {
// Place the extended lanes into the correct locations.
int ExtLaneScale = NumSrcElements / NumElements;
- int EndianOffset = TLI.isBigEndian() ? ExtLaneScale - 1 : 0;
+ int EndianOffset = DAG.getDataLayout().isBigEndian() ? ExtLaneScale - 1 : 0;
for (int i = 0; i < NumElements; ++i)
ShuffleMask[i * ExtLaneScale + EndianOffset] = i;
@@ -858,7 +862,7 @@ SDValue VectorLegalizer::ExpandZERO_EXTEND_VECTOR_INREG(SDValue Op) {
ShuffleMask.push_back(i);
int ExtLaneScale = NumSrcElements / NumElements;
- int EndianOffset = TLI.isBigEndian() ? ExtLaneScale - 1 : 0;
+ int EndianOffset = DAG.getDataLayout().isBigEndian() ? ExtLaneScale - 1 : 0;
for (int i = 0; i < NumElements; ++i)
ShuffleMask[i * ExtLaneScale + EndianOffset] = NumSrcElements + i;
@@ -995,12 +999,15 @@ SDValue VectorLegalizer::UnrollVSETCC(SDValue Op) {
SDLoc dl(Op);
SmallVector<SDValue, 8> Ops(NumElems);
for (unsigned i = 0; i < NumElems; ++i) {
- SDValue LHSElem = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, TmpEltVT, LHS,
- DAG.getConstant(i, dl, TLI.getVectorIdxTy()));
- SDValue RHSElem = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, TmpEltVT, RHS,
- DAG.getConstant(i, dl, TLI.getVectorIdxTy()));
+ SDValue LHSElem = DAG.getNode(
+ ISD::EXTRACT_VECTOR_ELT, dl, TmpEltVT, LHS,
+ DAG.getConstant(i, dl, TLI.getVectorIdxTy(DAG.getDataLayout())));
+ SDValue RHSElem = DAG.getNode(
+ ISD::EXTRACT_VECTOR_ELT, dl, TmpEltVT, RHS,
+ DAG.getConstant(i, dl, TLI.getVectorIdxTy(DAG.getDataLayout())));
Ops[i] = DAG.getNode(ISD::SETCC, dl,
- TLI.getSetCCResultType(*DAG.getContext(), TmpEltVT),
+ TLI.getSetCCResultType(DAG.getDataLayout(),
+ *DAG.getContext(), TmpEltVT),
LHSElem, RHSElem, CC);
Ops[i] = DAG.getSelect(dl, EltVT, Ops[i],
DAG.getConstant(APInt::getAllOnesValue
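UnrollVSETCC above extracts each lane with an index of type TLI.getVectorIdxTy(DAG.getDataLayout()) and compares in TLI.getSetCCResultType(DAG.getDataLayout(), ...). A compact sketch of one such lane comparison, using a hypothetical helper that is not part of the patch:

#include "llvm/CodeGen/SelectionDAG.h"
#include "llvm/Target/TargetLowering.h"
using namespace llvm;

// Extract lane 'Lane' from both operands and build the scalar SETCC with the
// DataLayout-taking type queries. CC is the condition-code operand.
static SDValue compareLane(SelectionDAG &DAG, const TargetLowering &TLI,
                           SDValue LHS, SDValue RHS, unsigned Lane, EVT EltVT,
                           SDValue CC, SDLoc dl) {
  const DataLayout &DL = DAG.getDataLayout();
  SDValue Idx = DAG.getConstant(Lane, dl, TLI.getVectorIdxTy(DL));
  SDValue L = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, EltVT, LHS, Idx);
  SDValue R = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, EltVT, RHS, Idx);
  EVT CCVT = TLI.getSetCCResultType(DL, *DAG.getContext(), EltVT);
  return DAG.getNode(ISD::SETCC, dl, CCVT, L, R, CC);
}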
diff --git a/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp b/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp
index 905492c202ca..4348ab79f7d1 100644
--- a/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp
+++ b/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp
@@ -251,8 +251,9 @@ SDValue DAGTypeLegalizer::ScalarizeVecRes_UnaryOp(SDNode *N) {
Op = GetScalarizedVector(Op);
} else {
EVT VT = OpVT.getVectorElementType();
- Op = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, Op,
- DAG.getConstant(0, DL, TLI.getVectorIdxTy()));
+ Op = DAG.getNode(
+ ISD::EXTRACT_VECTOR_ELT, DL, VT, Op,
+ DAG.getConstant(0, DL, TLI.getVectorIdxTy(DAG.getDataLayout())));
}
return DAG.getNode(N->getOpcode(), SDLoc(N), DestVT, Op);
}
@@ -384,10 +385,12 @@ SDValue DAGTypeLegalizer::ScalarizeVecRes_VSETCC(SDNode *N) {
RHS = GetScalarizedVector(RHS);
} else {
EVT VT = OpVT.getVectorElementType();
- LHS = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, LHS,
- DAG.getConstant(0, DL, TLI.getVectorIdxTy()));
- RHS = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, RHS,
- DAG.getConstant(0, DL, TLI.getVectorIdxTy()));
+ LHS = DAG.getNode(
+ ISD::EXTRACT_VECTOR_ELT, DL, VT, LHS,
+ DAG.getConstant(0, DL, TLI.getVectorIdxTy(DAG.getDataLayout())));
+ RHS = DAG.getNode(
+ ISD::EXTRACT_VECTOR_ELT, DL, VT, RHS,
+ DAG.getConstant(0, DL, TLI.getVectorIdxTy(DAG.getDataLayout())));
}
// Turn it into a scalar SETCC.
@@ -742,7 +745,7 @@ void DAGTypeLegalizer::SplitVecRes_BITCAST(SDNode *N, SDValue &Lo,
// expanded pieces.
if (LoVT == HiVT) {
GetExpandedOp(InOp, Lo, Hi);
- if (TLI.isBigEndian())
+ if (DAG.getDataLayout().isBigEndian())
std::swap(Lo, Hi);
Lo = DAG.getNode(ISD::BITCAST, dl, LoVT, Lo);
Hi = DAG.getNode(ISD::BITCAST, dl, HiVT, Hi);
@@ -761,12 +764,12 @@ void DAGTypeLegalizer::SplitVecRes_BITCAST(SDNode *N, SDValue &Lo,
// In the general case, convert the input to an integer and split it by hand.
EVT LoIntVT = EVT::getIntegerVT(*DAG.getContext(), LoVT.getSizeInBits());
EVT HiIntVT = EVT::getIntegerVT(*DAG.getContext(), HiVT.getSizeInBits());
- if (TLI.isBigEndian())
+ if (DAG.getDataLayout().isBigEndian())
std::swap(LoIntVT, HiIntVT);
SplitInteger(BitConvertToInteger(InOp), LoIntVT, HiIntVT, Lo, Hi);
- if (TLI.isBigEndian())
+ if (DAG.getDataLayout().isBigEndian())
std::swap(Lo, Hi);
Lo = DAG.getNode(ISD::BITCAST, dl, LoVT, Lo);
Hi = DAG.getNode(ISD::BITCAST, dl, HiVT, Hi);
@@ -819,7 +822,7 @@ void DAGTypeLegalizer::SplitVecRes_EXTRACT_SUBVECTOR(SDNode *N, SDValue &Lo,
uint64_t IdxVal = cast<ConstantSDNode>(Idx)->getZExtValue();
Hi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, HiVT, Vec,
DAG.getConstant(IdxVal + LoVT.getVectorNumElements(), dl,
- TLI.getVectorIdxTy()));
+ TLI.getVectorIdxTy(DAG.getDataLayout())));
}
void DAGTypeLegalizer::SplitVecRes_INSERT_SUBVECTOR(SDNode *N, SDValue &Lo,
@@ -840,7 +843,7 @@ void DAGTypeLegalizer::SplitVecRes_INSERT_SUBVECTOR(SDNode *N, SDValue &Lo,
// Store the new subvector into the specified index.
SDValue SubVecPtr = GetVectorElementPointer(StackPtr, SubVecVT, Idx);
Type *VecType = VecVT.getTypeForEVT(*DAG.getContext());
- unsigned Alignment = TLI.getDataLayout()->getPrefTypeAlignment(VecType);
+ unsigned Alignment = DAG.getDataLayout().getPrefTypeAlignment(VecType);
Store = DAG.getStore(Store, dl, SubVec, SubVecPtr, MachinePointerInfo(),
false, false, 0);
@@ -898,9 +901,10 @@ void DAGTypeLegalizer::SplitVecRes_INSERT_VECTOR_ELT(SDNode *N, SDValue &Lo,
Lo = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl,
Lo.getValueType(), Lo, Elt, Idx);
else
- Hi = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, Hi.getValueType(), Hi, Elt,
- DAG.getConstant(IdxVal - LoNumElts, dl,
- TLI.getVectorIdxTy()));
+ Hi =
+ DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, Hi.getValueType(), Hi, Elt,
+ DAG.getConstant(IdxVal - LoNumElts, dl,
+ TLI.getVectorIdxTy(DAG.getDataLayout())));
return;
}
@@ -919,8 +923,7 @@ void DAGTypeLegalizer::SplitVecRes_INSERT_VECTOR_ELT(SDNode *N, SDValue &Lo,
// so use a truncating store.
SDValue EltPtr = GetVectorElementPointer(StackPtr, EltVT, Idx);
Type *VecType = VecVT.getTypeForEVT(*DAG.getContext());
- unsigned Alignment =
- TLI.getDataLayout()->getPrefTypeAlignment(VecType);
+ unsigned Alignment = DAG.getDataLayout().getPrefTypeAlignment(VecType);
Store = DAG.getTruncStore(Store, dl, Elt, EltPtr, MachinePointerInfo(), EltVT,
false, false, 0);
@@ -1292,10 +1295,9 @@ void DAGTypeLegalizer::SplitVecRes_VECTOR_SHUFFLE(ShuffleVectorSDNode *N,
Idx -= Input * NewElts;
// Extract the vector element by hand.
- SVOps.push_back(DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, EltVT,
- Inputs[Input],
- DAG.getConstant(Idx, dl,
- TLI.getVectorIdxTy())));
+ SVOps.push_back(DAG.getNode(
+ ISD::EXTRACT_VECTOR_ELT, dl, EltVT, Inputs[Input],
+ DAG.getConstant(Idx, dl, TLI.getVectorIdxTy(DAG.getDataLayout()))));
}
// Construct the Lo/Hi output using a BUILD_VECTOR.
@@ -1472,7 +1474,7 @@ SDValue DAGTypeLegalizer::SplitVecOp_BITCAST(SDNode *N) {
Lo = BitConvertToInteger(Lo);
Hi = BitConvertToInteger(Hi);
- if (TLI.isBigEndian())
+ if (DAG.getDataLayout().isBigEndian())
std::swap(Lo, Hi);
return DAG.getNode(ISD::BITCAST, SDLoc(N), N->getValueType(0),
@@ -1763,9 +1765,9 @@ SDValue DAGTypeLegalizer::SplitVecOp_CONCAT_VECTORS(SDNode *N) {
for (const SDValue &Op : N->op_values()) {
for (unsigned i = 0, e = Op.getValueType().getVectorNumElements();
i != e; ++i) {
- Elts.push_back(DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, EltVT,
- Op, DAG.getConstant(i, DL, TLI.getVectorIdxTy())));
-
+ Elts.push_back(DAG.getNode(
+ ISD::EXTRACT_VECTOR_ELT, DL, EltVT, Op,
+ DAG.getConstant(i, DL, TLI.getVectorIdxTy(DAG.getDataLayout()))));
}
}
@@ -1829,10 +1831,11 @@ SDValue DAGTypeLegalizer::SplitVecOp_TruncateHelper(SDNode *N) {
// type. This should normally be something that ends up being legal directly,
// but in theory if a target has very wide vectors and an annoyingly
// restricted set of legal types, this split can chain to build things up.
- return IsFloat ?
- DAG.getNode(ISD::FP_ROUND, DL, OutVT, InterVec,
- DAG.getTargetConstant(0, DL, TLI.getPointerTy())) :
- DAG.getNode(ISD::TRUNCATE, DL, OutVT, InterVec);
+ return IsFloat
+ ? DAG.getNode(ISD::FP_ROUND, DL, OutVT, InterVec,
+ DAG.getTargetConstant(
+ 0, DL, TLI.getPointerTy(DAG.getDataLayout())))
+ : DAG.getNode(ISD::TRUNCATE, DL, OutVT, InterVec);
}
SDValue DAGTypeLegalizer::SplitVecOp_VSETCC(SDNode *N) {
@@ -2062,12 +2065,12 @@ SDValue DAGTypeLegalizer::WidenVecRes_BinaryCanTrap(SDNode *N) {
// }
while (CurNumElts != 0) {
while (CurNumElts >= NumElts) {
- SDValue EOp1 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, InOp1,
- DAG.getConstant(Idx, dl,
- TLI.getVectorIdxTy()));
- SDValue EOp2 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, InOp2,
- DAG.getConstant(Idx, dl,
- TLI.getVectorIdxTy()));
+ SDValue EOp1 = DAG.getNode(
+ ISD::EXTRACT_SUBVECTOR, dl, VT, InOp1,
+ DAG.getConstant(Idx, dl, TLI.getVectorIdxTy(DAG.getDataLayout())));
+ SDValue EOp2 = DAG.getNode(
+ ISD::EXTRACT_SUBVECTOR, dl, VT, InOp2,
+ DAG.getConstant(Idx, dl, TLI.getVectorIdxTy(DAG.getDataLayout())));
ConcatOps[ConcatEnd++] = DAG.getNode(Opcode, dl, VT, EOp1, EOp2);
Idx += NumElts;
CurNumElts -= NumElts;
@@ -2079,14 +2082,12 @@ SDValue DAGTypeLegalizer::WidenVecRes_BinaryCanTrap(SDNode *N) {
if (NumElts == 1) {
for (unsigned i = 0; i != CurNumElts; ++i, ++Idx) {
- SDValue EOp1 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, WidenEltVT,
- InOp1,
- DAG.getConstant(Idx, dl,
- TLI.getVectorIdxTy()));
- SDValue EOp2 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, WidenEltVT,
- InOp2,
- DAG.getConstant(Idx, dl,
- TLI.getVectorIdxTy()));
+ SDValue EOp1 = DAG.getNode(
+ ISD::EXTRACT_VECTOR_ELT, dl, WidenEltVT, InOp1,
+ DAG.getConstant(Idx, dl, TLI.getVectorIdxTy(DAG.getDataLayout())));
+ SDValue EOp2 = DAG.getNode(
+ ISD::EXTRACT_VECTOR_ELT, dl, WidenEltVT, InOp2,
+ DAG.getConstant(Idx, dl, TLI.getVectorIdxTy(DAG.getDataLayout())));
ConcatOps[ConcatEnd++] = DAG.getNode(Opcode, dl, WidenEltVT,
EOp1, EOp2);
}
@@ -2123,9 +2124,9 @@ SDValue DAGTypeLegalizer::WidenVecRes_BinaryCanTrap(SDNode *N) {
SDValue VecOp = DAG.getUNDEF(NextVT);
unsigned NumToInsert = ConcatEnd - Idx - 1;
for (unsigned i = 0, OpIdx = Idx+1; i < NumToInsert; i++, OpIdx++) {
- VecOp = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, NextVT, VecOp,
- ConcatOps[OpIdx],
- DAG.getConstant(i, dl, TLI.getVectorIdxTy()));
+ VecOp = DAG.getNode(
+ ISD::INSERT_VECTOR_ELT, dl, NextVT, VecOp, ConcatOps[OpIdx],
+ DAG.getConstant(i, dl, TLI.getVectorIdxTy(DAG.getDataLayout())));
}
ConcatOps[Idx+1] = VecOp;
ConcatEnd = Idx + 2;
@@ -2211,8 +2212,9 @@ SDValue DAGTypeLegalizer::WidenVecRes_Convert(SDNode *N) {
}
if (InVTNumElts % WidenNumElts == 0) {
- SDValue InVal = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, InWidenVT, InOp,
- DAG.getConstant(0, DL, TLI.getVectorIdxTy()));
+ SDValue InVal = DAG.getNode(
+ ISD::EXTRACT_SUBVECTOR, DL, InWidenVT, InOp,
+ DAG.getConstant(0, DL, TLI.getVectorIdxTy(DAG.getDataLayout())));
// Extract the input and convert the shortened input vector.
if (N->getNumOperands() == 1)
return DAG.getNode(Opcode, DL, WidenVT, InVal);
@@ -2226,8 +2228,9 @@ SDValue DAGTypeLegalizer::WidenVecRes_Convert(SDNode *N) {
unsigned MinElts = std::min(InVTNumElts, WidenNumElts);
unsigned i;
for (i=0; i < MinElts; ++i) {
- SDValue Val = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, InEltVT, InOp,
- DAG.getConstant(i, DL, TLI.getVectorIdxTy()));
+ SDValue Val = DAG.getNode(
+ ISD::EXTRACT_VECTOR_ELT, DL, InEltVT, InOp,
+ DAG.getConstant(i, DL, TLI.getVectorIdxTy(DAG.getDataLayout())));
if (N->getNumOperands() == 1)
Ops[i] = DAG.getNode(Opcode, DL, EltVT, Val);
else
@@ -2453,8 +2456,9 @@ SDValue DAGTypeLegalizer::WidenVecRes_CONCAT_VECTORS(SDNode *N) {
if (InputWidened)
InOp = GetWidenedVector(InOp);
for (unsigned j=0; j < NumInElts; ++j)
- Ops[Idx++] = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, EltVT, InOp,
- DAG.getConstant(j, dl, TLI.getVectorIdxTy()));
+ Ops[Idx++] = DAG.getNode(
+ ISD::EXTRACT_VECTOR_ELT, dl, EltVT, InOp,
+ DAG.getConstant(j, dl, TLI.getVectorIdxTy(DAG.getDataLayout())));
}
SDValue UndefVal = DAG.getUNDEF(EltVT);
for (; Idx < WidenNumElts; ++Idx)
@@ -2511,8 +2515,9 @@ SDValue DAGTypeLegalizer::WidenVecRes_CONVERT_RNDSAT(SDNode *N) {
if (InVTNumElts % WidenNumElts == 0) {
// Extract the input and convert the shortened input vector.
- InOp = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, InWidenVT, InOp,
- DAG.getConstant(0, dl, TLI.getVectorIdxTy()));
+ InOp = DAG.getNode(
+ ISD::EXTRACT_SUBVECTOR, dl, InWidenVT, InOp,
+ DAG.getConstant(0, dl, TLI.getVectorIdxTy(DAG.getDataLayout())));
return DAG.getConvertRndSat(WidenVT, dl, InOp, DTyOp, STyOp, RndOp,
SatOp, CvtCode);
}
@@ -2527,8 +2532,9 @@ SDValue DAGTypeLegalizer::WidenVecRes_CONVERT_RNDSAT(SDNode *N) {
unsigned MinElts = std::min(InVTNumElts, WidenNumElts);
unsigned i;
for (i=0; i < MinElts; ++i) {
- SDValue ExtVal = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, InEltVT, InOp,
- DAG.getConstant(i, dl, TLI.getVectorIdxTy()));
+ SDValue ExtVal = DAG.getNode(
+ ISD::EXTRACT_VECTOR_ELT, dl, InEltVT, InOp,
+ DAG.getConstant(i, dl, TLI.getVectorIdxTy(DAG.getDataLayout())));
Ops[i] = DAG.getConvertRndSat(WidenVT, dl, ExtVal, DTyOp, STyOp, RndOp,
SatOp, CvtCode);
}
@@ -2570,8 +2576,10 @@ SDValue DAGTypeLegalizer::WidenVecRes_EXTRACT_SUBVECTOR(SDNode *N) {
unsigned NumElts = VT.getVectorNumElements();
unsigned i;
for (i=0; i < NumElts; ++i)
- Ops[i] = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, EltVT, InOp,
- DAG.getConstant(IdxVal + i, dl, TLI.getVectorIdxTy()));
+ Ops[i] =
+ DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, EltVT, InOp,
+ DAG.getConstant(IdxVal + i, dl,
+ TLI.getVectorIdxTy(DAG.getDataLayout())));
SDValue UndefVal = DAG.getUNDEF(EltVT);
for (; i < WidenNumElts; ++i)
@@ -2872,12 +2880,13 @@ SDValue DAGTypeLegalizer::WidenVecOp_EXTEND(SDNode *N) {
assert(FixedVT.getVectorNumElements() != InVT.getVectorNumElements() &&
"We can't have the same type as we started with!");
if (FixedVT.getVectorNumElements() > InVT.getVectorNumElements())
- InOp = DAG.getNode(ISD::INSERT_SUBVECTOR, DL, FixedVT,
- DAG.getUNDEF(FixedVT), InOp,
- DAG.getConstant(0, DL, TLI.getVectorIdxTy()));
+ InOp = DAG.getNode(
+ ISD::INSERT_SUBVECTOR, DL, FixedVT, DAG.getUNDEF(FixedVT), InOp,
+ DAG.getConstant(0, DL, TLI.getVectorIdxTy(DAG.getDataLayout())));
else
- InOp = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, FixedVT, InOp,
- DAG.getConstant(0, DL, TLI.getVectorIdxTy()));
+ InOp = DAG.getNode(
+ ISD::EXTRACT_SUBVECTOR, DL, FixedVT, InOp,
+ DAG.getConstant(0, DL, TLI.getVectorIdxTy(DAG.getDataLayout())));
break;
}
}
@@ -2920,10 +2929,11 @@ SDValue DAGTypeLegalizer::WidenVecOp_Convert(SDNode *N) {
unsigned Opcode = N->getOpcode();
SmallVector<SDValue, 16> Ops(NumElts);
for (unsigned i=0; i < NumElts; ++i)
- Ops[i] = DAG.getNode(Opcode, dl, EltVT,
- DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, InEltVT, InOp,
- DAG.getConstant(i, dl,
- TLI.getVectorIdxTy())));
+ Ops[i] = DAG.getNode(
+ Opcode, dl, EltVT,
+ DAG.getNode(
+ ISD::EXTRACT_VECTOR_ELT, dl, InEltVT, InOp,
+ DAG.getConstant(i, dl, TLI.getVectorIdxTy(DAG.getDataLayout()))));
return DAG.getNode(ISD::BUILD_VECTOR, dl, VT, Ops);
}
@@ -2943,8 +2953,9 @@ SDValue DAGTypeLegalizer::WidenVecOp_BITCAST(SDNode *N) {
EVT NewVT = EVT::getVectorVT(*DAG.getContext(), VT, NewNumElts);
if (TLI.isTypeLegal(NewVT)) {
SDValue BitOp = DAG.getNode(ISD::BITCAST, dl, NewVT, InOp);
- return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, BitOp,
- DAG.getConstant(0, dl, TLI.getVectorIdxTy()));
+ return DAG.getNode(
+ ISD::EXTRACT_VECTOR_ELT, dl, VT, BitOp,
+ DAG.getConstant(0, dl, TLI.getVectorIdxTy(DAG.getDataLayout())));
}
}
@@ -2971,8 +2982,9 @@ SDValue DAGTypeLegalizer::WidenVecOp_CONCAT_VECTORS(SDNode *N) {
if (getTypeAction(InOp.getValueType()) == TargetLowering::TypeWidenVector)
InOp = GetWidenedVector(InOp);
for (unsigned j=0; j < NumInElts; ++j)
- Ops[Idx++] = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, EltVT, InOp,
- DAG.getConstant(j, dl, TLI.getVectorIdxTy()));
+ Ops[Idx++] = DAG.getNode(
+ ISD::EXTRACT_VECTOR_ELT, dl, EltVT, InOp,
+ DAG.getConstant(j, dl, TLI.getVectorIdxTy(DAG.getDataLayout())));
}
return DAG.getNode(ISD::BUILD_VECTOR, dl, VT, Ops);
}
@@ -3053,7 +3065,8 @@ SDValue DAGTypeLegalizer::WidenVecOp_SETCC(SDNode *N) {
// Get a new SETCC node to compare the newly widened operands.
// Only some of the compared elements are legal.
- EVT SVT = TLI.getSetCCResultType(*DAG.getContext(), InOp0.getValueType());
+ EVT SVT = TLI.getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(),
+ InOp0.getValueType());
SDValue WideSETCC = DAG.getNode(ISD::SETCC, SDLoc(N),
SVT, InOp0, InOp1, N->getOperand(2));
@@ -3061,9 +3074,9 @@ SDValue DAGTypeLegalizer::WidenVecOp_SETCC(SDNode *N) {
EVT ResVT = EVT::getVectorVT(*DAG.getContext(),
SVT.getVectorElementType(),
N->getValueType(0).getVectorNumElements());
- SDValue CC = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl,
- ResVT, WideSETCC,
- DAG.getConstant(0, dl, TLI.getVectorIdxTy()));
+ SDValue CC = DAG.getNode(
+ ISD::EXTRACT_SUBVECTOR, dl, ResVT, WideSETCC,
+ DAG.getConstant(0, dl, TLI.getVectorIdxTy(DAG.getDataLayout())));
return PromoteTargetBoolean(CC, N->getValueType(0));
}
@@ -3159,8 +3172,9 @@ static SDValue BuildVectorFromScalar(SelectionDAG& DAG, EVT VecTy,
Idx = Idx * LdTy.getSizeInBits() / NewLdTy.getSizeInBits();
LdTy = NewLdTy;
}
- VecOp = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, NewVecVT, VecOp, LdOps[i],
- DAG.getConstant(Idx++, dl, TLI.getVectorIdxTy()));
+ VecOp = DAG.getNode(
+ ISD::INSERT_VECTOR_ELT, dl, NewVecVT, VecOp, LdOps[i],
+ DAG.getConstant(Idx++, dl, TLI.getVectorIdxTy(DAG.getDataLayout())));
}
return DAG.getNode(ISD::BITCAST, dl, VecTy, VecOp);
}
@@ -3407,9 +3421,9 @@ void DAGTypeLegalizer::GenWidenVectorStores(SmallVectorImpl<SDValue> &StChain,
if (NewVT.isVector()) {
unsigned NumVTElts = NewVT.getVectorNumElements();
do {
- SDValue EOp = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, NewVT, ValOp,
- DAG.getConstant(Idx, dl,
- TLI.getVectorIdxTy()));
+ SDValue EOp = DAG.getNode(
+ ISD::EXTRACT_SUBVECTOR, dl, NewVT, ValOp,
+ DAG.getConstant(Idx, dl, TLI.getVectorIdxTy(DAG.getDataLayout())));
StChain.push_back(DAG.getStore(Chain, dl, EOp, BasePtr,
ST->getPointerInfo().getWithOffset(Offset),
isVolatile, isNonTemporal,
@@ -3429,8 +3443,10 @@ void DAGTypeLegalizer::GenWidenVectorStores(SmallVectorImpl<SDValue> &StChain,
// Readjust index position based on new vector type
Idx = Idx * ValEltWidth / NewVTWidth;
do {
- SDValue EOp = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, NewVT, VecOp,
- DAG.getConstant(Idx++, dl, TLI.getVectorIdxTy()));
+ SDValue EOp = DAG.getNode(
+ ISD::EXTRACT_VECTOR_ELT, dl, NewVT, VecOp,
+ DAG.getConstant(Idx++, dl,
+ TLI.getVectorIdxTy(DAG.getDataLayout())));
StChain.push_back(DAG.getStore(Chain, dl, EOp, BasePtr,
ST->getPointerInfo().getWithOffset(Offset),
isVolatile, isNonTemporal,
@@ -3476,8 +3492,9 @@ DAGTypeLegalizer::GenWidenVectorTruncStores(SmallVectorImpl<SDValue> &StChain,
EVT ValEltVT = ValVT.getVectorElementType();
unsigned Increment = ValEltVT.getSizeInBits() / 8;
unsigned NumElts = StVT.getVectorNumElements();
- SDValue EOp = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, ValEltVT, ValOp,
- DAG.getConstant(0, dl, TLI.getVectorIdxTy()));
+ SDValue EOp = DAG.getNode(
+ ISD::EXTRACT_VECTOR_ELT, dl, ValEltVT, ValOp,
+ DAG.getConstant(0, dl, TLI.getVectorIdxTy(DAG.getDataLayout())));
StChain.push_back(DAG.getTruncStore(Chain, dl, EOp, BasePtr,
ST->getPointerInfo(), StEltVT,
isVolatile, isNonTemporal, Align,
@@ -3488,8 +3505,9 @@ DAGTypeLegalizer::GenWidenVectorTruncStores(SmallVectorImpl<SDValue> &StChain,
BasePtr,
DAG.getConstant(Offset, dl,
BasePtr.getValueType()));
- SDValue EOp = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, ValEltVT, ValOp,
- DAG.getConstant(0, dl, TLI.getVectorIdxTy()));
+ SDValue EOp = DAG.getNode(
+ ISD::EXTRACT_VECTOR_ELT, dl, ValEltVT, ValOp,
+ DAG.getConstant(0, dl, TLI.getVectorIdxTy(DAG.getDataLayout())));
StChain.push_back(DAG.getTruncStore(Chain, dl, EOp, NewBasePtr,
ST->getPointerInfo().getWithOffset(Offset),
StEltVT, isVolatile, isNonTemporal,
@@ -3525,8 +3543,9 @@ SDValue DAGTypeLegalizer::ModifyToType(SDValue InOp, EVT NVT) {
}
if (WidenNumElts < InNumElts && InNumElts % WidenNumElts)
- return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, NVT, InOp,
- DAG.getConstant(0, dl, TLI.getVectorIdxTy()));
+ return DAG.getNode(
+ ISD::EXTRACT_SUBVECTOR, dl, NVT, InOp,
+ DAG.getConstant(0, dl, TLI.getVectorIdxTy(DAG.getDataLayout())));
// Fall back to extract and build.
SmallVector<SDValue, 16> Ops(WidenNumElts);
@@ -3534,8 +3553,9 @@ SDValue DAGTypeLegalizer::ModifyToType(SDValue InOp, EVT NVT) {
unsigned MinNumElts = std::min(WidenNumElts, InNumElts);
unsigned Idx;
for (Idx = 0; Idx < MinNumElts; ++Idx)
- Ops[Idx] = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, EltVT, InOp,
- DAG.getConstant(Idx, dl, TLI.getVectorIdxTy()));
+ Ops[Idx] = DAG.getNode(
+ ISD::EXTRACT_VECTOR_ELT, dl, EltVT, InOp,
+ DAG.getConstant(Idx, dl, TLI.getVectorIdxTy(DAG.getDataLayout())));
SDValue UndefVal = DAG.getUNDEF(EltVT);
for ( ; Idx < WidenNumElts; ++Idx)
diff --git a/lib/CodeGen/SelectionDAG/ScheduleDAGFast.cpp b/lib/CodeGen/SelectionDAG/ScheduleDAGFast.cpp
index 00cbae3986cd..34e1a7001082 100644
--- a/lib/CodeGen/SelectionDAG/ScheduleDAGFast.cpp
+++ b/lib/CodeGen/SelectionDAG/ScheduleDAGFast.cpp
@@ -725,9 +725,8 @@ void ScheduleDAGLinearize::Schedule() {
SmallVector<SDNode*, 8> Glues;
unsigned DAGSize = 0;
- for (SelectionDAG::allnodes_iterator I = DAG->allnodes_begin(),
- E = DAG->allnodes_end(); I != E; ++I) {
- SDNode *N = I;
+ for (SDNode &Node : DAG->allnodes()) {
+ SDNode *N = &Node;
// Use node id to record degree.
unsigned Degree = N->use_size();
diff --git a/lib/CodeGen/SelectionDAG/ScheduleDAGSDNodes.cpp b/lib/CodeGen/SelectionDAG/ScheduleDAGSDNodes.cpp
index b22d6edd85af..2a6c853a1d11 100644
--- a/lib/CodeGen/SelectionDAG/ScheduleDAGSDNodes.cpp
+++ b/lib/CodeGen/SelectionDAG/ScheduleDAGSDNodes.cpp
@@ -289,9 +289,8 @@ void ScheduleDAGSDNodes::ClusterNeighboringLoads(SDNode *Node) {
/// ClusterNodes - Cluster certain nodes which should be scheduled together.
///
void ScheduleDAGSDNodes::ClusterNodes() {
- for (SelectionDAG::allnodes_iterator NI = DAG->allnodes_begin(),
- E = DAG->allnodes_end(); NI != E; ++NI) {
- SDNode *Node = &*NI;
+ for (SDNode &NI : DAG->allnodes()) {
+ SDNode *Node = &NI;
if (!Node || !Node->isMachineOpcode())
continue;
@@ -308,9 +307,8 @@ void ScheduleDAGSDNodes::BuildSchedUnits() {
// to their associated SUnits by holding SUnits table indices. A value
// of -1 means the SDNode does not yet have an associated SUnit.
unsigned NumNodes = 0;
- for (SelectionDAG::allnodes_iterator NI = DAG->allnodes_begin(),
- E = DAG->allnodes_end(); NI != E; ++NI) {
- NI->setNodeId(-1);
+ for (SDNode &NI : DAG->allnodes()) {
+ NI.setNodeId(-1);
++NumNodes;
}
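The two scheduler files above switch from explicit allnodes_iterator loops to the range form: SelectionDAG::allnodes() yields SDNode references, so taking the element's address recovers the SDNode* the old code obtained from the iterator. A small sketch under the same in-tree assumption (countMachineNodes is a made-up helper):

    // Sketch of the iteration idiom used in Schedule()/ClusterNodes() above.
    #include "llvm/CodeGen/SelectionDAG.h"
    using namespace llvm;

    static unsigned countMachineNodes(SelectionDAG &DAG) {
      unsigned Count = 0;
      for (SDNode &Node : DAG.allnodes()) {
        SDNode *N = &Node;   // takes the node's address where the old loop converted the iterator
        if (N->isMachineOpcode())
          ++Count;
      }
      return Count;
    }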
diff --git a/lib/CodeGen/SelectionDAG/SelectionDAG.cpp b/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
index be5478275f99..14f44ccc60ce 100644
--- a/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
+++ b/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
@@ -151,8 +151,8 @@ bool ISD::isBuildVectorAllZeros(const SDNode *N) {
if (N->getOpcode() != ISD::BUILD_VECTOR) return false;
bool IsAllUndef = true;
- for (unsigned i = 0, e = N->getNumOperands(); i < e; ++i) {
- if (N->getOperand(i).getOpcode() == ISD::UNDEF)
+ for (const SDValue &Op : N->op_values()) {
+ if (Op.getOpcode() == ISD::UNDEF)
continue;
IsAllUndef = false;
// Do not accept build_vectors that aren't all constants or which have non-0
@@ -163,12 +163,11 @@ bool ISD::isBuildVectorAllZeros(const SDNode *N) {
// We only want to check enough bits to cover the vector elements, because
// we care if the resultant vector is all zeros, not whether the individual
// constants are.
- SDValue Zero = N->getOperand(i);
unsigned EltSize = N->getValueType(0).getVectorElementType().getSizeInBits();
- if (ConstantSDNode *CN = dyn_cast<ConstantSDNode>(Zero)) {
+ if (ConstantSDNode *CN = dyn_cast<ConstantSDNode>(Op)) {
if (CN->getAPIntValue().countTrailingZeros() < EltSize)
return false;
- } else if (ConstantFPSDNode *CFPN = dyn_cast<ConstantFPSDNode>(Zero)) {
+ } else if (ConstantFPSDNode *CFPN = dyn_cast<ConstantFPSDNode>(Op)) {
if (CFPN->getValueAPF().bitcastToAPInt().countTrailingZeros() < EltSize)
return false;
} else
@@ -921,7 +920,7 @@ unsigned SelectionDAG::getEVTAlignment(EVT VT) const {
PointerType::get(Type::getInt8Ty(*getContext()), 0) :
VT.getTypeForEVT(*getContext());
- return TLI->getDataLayout()->getABITypeAlignment(Ty);
+ return getDataLayout().getABITypeAlignment(Ty);
}
// EntryNode could meaningfully have debug info if we can find it...
@@ -1184,7 +1183,7 @@ SDValue SelectionDAG::getConstant(const ConstantInt &Val, SDLoc DL, EVT VT,
// EltParts is currently in little endian order. If we actually want
// big-endian order then reverse it now.
- if (TLI->isBigEndian())
+ if (getDataLayout().isBigEndian())
std::reverse(EltParts.begin(), EltParts.end());
// The elements must be reversed when the element order is different
@@ -1234,7 +1233,7 @@ SDValue SelectionDAG::getConstant(const ConstantInt &Val, SDLoc DL, EVT VT,
}
SDValue SelectionDAG::getIntPtrConstant(uint64_t Val, SDLoc DL, bool isTarget) {
- return getConstant(Val, DL, TLI->getPointerTy(), isTarget);
+ return getConstant(Val, DL, TLI->getPointerTy(getDataLayout()), isTarget);
}
SDValue SelectionDAG::getConstantFP(const APFloat& V, SDLoc DL, EVT VT,
@@ -1303,7 +1302,7 @@ SDValue SelectionDAG::getGlobalAddress(const GlobalValue *GV, SDLoc DL,
"Cannot set target flags on target-independent globals");
// Truncate (with sign-extension) the offset value to the pointer size.
- unsigned BitWidth = TLI->getPointerTypeSizeInBits(GV->getType());
+ unsigned BitWidth = getDataLayout().getPointerTypeSizeInBits(GV->getType());
if (BitWidth < 64)
Offset = SignExtend64(Offset, BitWidth);
@@ -1373,7 +1372,7 @@ SDValue SelectionDAG::getConstantPool(const Constant *C, EVT VT,
assert((TargetFlags == 0 || isTarget) &&
"Cannot set target flags on target-independent globals");
if (Alignment == 0)
- Alignment = TLI->getDataLayout()->getPrefTypeAlignment(C->getType());
+ Alignment = getDataLayout().getPrefTypeAlignment(C->getType());
unsigned Opc = isTarget ? ISD::TargetConstantPool : ISD::ConstantPool;
FoldingSetNodeID ID;
AddNodeIDNode(ID, Opc, getVTList(VT), None);
@@ -1400,7 +1399,7 @@ SDValue SelectionDAG::getConstantPool(MachineConstantPoolValue *C, EVT VT,
assert((TargetFlags == 0 || isTarget) &&
"Cannot set target flags on target-independent globals");
if (Alignment == 0)
- Alignment = TLI->getDataLayout()->getPrefTypeAlignment(C->getType());
+ Alignment = getDataLayout().getPrefTypeAlignment(C->getType());
unsigned Opc = isTarget ? ISD::TargetConstantPool : ISD::ConstantPool;
FoldingSetNodeID ID;
AddNodeIDNode(ID, Opc, getVTList(VT), None);
@@ -1850,7 +1849,7 @@ SDValue SelectionDAG::getAddrSpaceCast(SDLoc dl, EVT VT, SDValue Ptr,
/// the target's desired shift amount type.
SDValue SelectionDAG::getShiftAmountOperand(EVT LHSTy, SDValue Op) {
EVT OpTy = Op.getValueType();
- EVT ShTy = TLI->getShiftAmountTy(LHSTy);
+ EVT ShTy = TLI->getShiftAmountTy(LHSTy, getDataLayout());
if (OpTy == ShTy || OpTy.isVector()) return Op;
ISD::NodeType Opcode = OpTy.bitsGT(ShTy) ? ISD::TRUNCATE : ISD::ZERO_EXTEND;
@@ -1864,10 +1863,10 @@ SDValue SelectionDAG::CreateStackTemporary(EVT VT, unsigned minAlign) {
unsigned ByteSize = VT.getStoreSize();
Type *Ty = VT.getTypeForEVT(*getContext());
unsigned StackAlign =
- std::max((unsigned)TLI->getDataLayout()->getPrefTypeAlignment(Ty), minAlign);
+ std::max((unsigned)getDataLayout().getPrefTypeAlignment(Ty), minAlign);
int FrameIdx = FrameInfo->CreateStackObject(ByteSize, StackAlign, false);
- return getFrameIndex(FrameIdx, TLI->getPointerTy());
+ return getFrameIndex(FrameIdx, TLI->getPointerTy(getDataLayout()));
}
/// CreateStackTemporary - Create a stack temporary suitable for holding
@@ -1877,13 +1876,13 @@ SDValue SelectionDAG::CreateStackTemporary(EVT VT1, EVT VT2) {
VT2.getStoreSizeInBits())/8;
Type *Ty1 = VT1.getTypeForEVT(*getContext());
Type *Ty2 = VT2.getTypeForEVT(*getContext());
- const DataLayout *TD = TLI->getDataLayout();
- unsigned Align = std::max(TD->getPrefTypeAlignment(Ty1),
- TD->getPrefTypeAlignment(Ty2));
+ const DataLayout &DL = getDataLayout();
+ unsigned Align =
+ std::max(DL.getPrefTypeAlignment(Ty1), DL.getPrefTypeAlignment(Ty2));
MachineFrameInfo *FrameInfo = getMachineFunction().getFrameInfo();
int FrameIdx = FrameInfo->CreateStackObject(Bytes, Align, false);
- return getFrameIndex(FrameIdx, TLI->getPointerTy());
+ return getFrameIndex(FrameIdx, TLI->getPointerTy(getDataLayout()));
}
SDValue SelectionDAG::FoldSetCC(EVT VT, SDValue N1,
@@ -1916,9 +1915,9 @@ SDValue SelectionDAG::FoldSetCC(EVT VT, SDValue N1,
break;
}
- if (ConstantSDNode *N2C = dyn_cast<ConstantSDNode>(N2.getNode())) {
+ if (ConstantSDNode *N2C = dyn_cast<ConstantSDNode>(N2)) {
const APInt &C2 = N2C->getAPIntValue();
- if (ConstantSDNode *N1C = dyn_cast<ConstantSDNode>(N1.getNode())) {
+ if (ConstantSDNode *N1C = dyn_cast<ConstantSDNode>(N1)) {
const APInt &C1 = N1C->getAPIntValue();
switch (Cond) {
@@ -1936,8 +1935,8 @@ SDValue SelectionDAG::FoldSetCC(EVT VT, SDValue N1,
}
}
}
- if (ConstantFPSDNode *N1C = dyn_cast<ConstantFPSDNode>(N1.getNode())) {
- if (ConstantFPSDNode *N2C = dyn_cast<ConstantFPSDNode>(N2.getNode())) {
+ if (ConstantFPSDNode *N1C = dyn_cast<ConstantFPSDNode>(N1)) {
+ if (ConstantFPSDNode *N2C = dyn_cast<ConstantFPSDNode>(N2)) {
APFloat::cmpResult R = N1C->getValueAPF().compare(N2C->getValueAPF());
switch (Cond) {
default: break;
@@ -2356,15 +2355,24 @@ void SelectionDAG::computeKnownBits(SDValue Op, APInt &KnownZero,
// Output known-0 bits are known if clear or set in both the low clear bits
// common to both LHS & RHS. For example, 8+(X<<3) is known to have the
// low 3 bits clear.
+ // Output known-0 bits are also known if the top bits of each input are
+ // known to be clear. For example, if one input has the top 10 bits clear
+ // and the other has the top 8 bits clear, we know the top 7 bits of the
+ // output must be clear.
computeKnownBits(Op.getOperand(0), KnownZero2, KnownOne2, Depth+1);
- unsigned KnownZeroOut = KnownZero2.countTrailingOnes();
+ unsigned KnownZeroHigh = KnownZero2.countLeadingOnes();
+ unsigned KnownZeroLow = KnownZero2.countTrailingOnes();
computeKnownBits(Op.getOperand(1), KnownZero2, KnownOne2, Depth+1);
- KnownZeroOut = std::min(KnownZeroOut,
+ KnownZeroHigh = std::min(KnownZeroHigh,
+ KnownZero2.countLeadingOnes());
+ KnownZeroLow = std::min(KnownZeroLow,
KnownZero2.countTrailingOnes());
if (Op.getOpcode() == ISD::ADD) {
- KnownZero |= APInt::getLowBitsSet(BitWidth, KnownZeroOut);
+ KnownZero |= APInt::getLowBitsSet(BitWidth, KnownZeroLow);
+ if (KnownZeroHigh > 1)
+ KnownZero |= APInt::getHighBitsSet(BitWidth, KnownZeroHigh - 1);
break;
}
@@ -2372,8 +2380,8 @@ void SelectionDAG::computeKnownBits(SDValue Op, APInt &KnownZero,
// information if we know (at least) that the low two bits are clear. We
// then return to the caller that the low bit is unknown but that other bits
// are known zero.
- if (KnownZeroOut >= 2) // ADDE
- KnownZero |= APInt::getBitsSet(BitWidth, 1, KnownZeroOut);
+ if (KnownZeroLow >= 2) // ADDE
+ KnownZero |= APInt::getBitsSet(BitWidth, 1, KnownZeroLow);
break;
}
case ISD::SREM:
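The new high-bit reasoning in this hunk can be checked with plain integers: when one addend has its top 10 bits clear and the other its top 8 bits clear, at most one carry can propagate past the wider addend, so at least min(10, 8) - 1 = 7 leading bits of the sum stay clear. A standalone check (not LLVM code):

    // Worked example for the KnownZeroHigh rule added above.
    #include <cassert>
    #include <cstdint>

    int main() {
      const unsigned BitWidth = 32;
      uint32_t A = 0x003FFFFF;          // largest value with the top 10 bits clear
      uint32_t B = 0x00FFFFFF;          // largest value with the top 8 bits clear
      uint32_t Sum = A + B;             // 0x013FFFFE
      unsigned KnownZeroHigh = 8;       // min(10, 8)
      uint32_t HighMask = ~uint32_t(0) << (BitWidth - (KnownZeroHigh - 1));
      assert((Sum & HighMask) == 0);    // the top 7 bits of the sum are zero
      return 0;
    }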
@@ -2814,7 +2822,7 @@ SDValue SelectionDAG::getNode(unsigned Opcode, SDLoc DL,
// doesn't create new constants with different values. Nevertheless, the
// opaque flag is preserved during folding to prevent future folding with
// other constants.
- if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Operand.getNode())) {
+ if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Operand)) {
const APInt &Val = C->getAPIntValue();
switch (Opcode) {
default: break;
@@ -2861,7 +2869,7 @@ SDValue SelectionDAG::getNode(unsigned Opcode, SDLoc DL,
}
// Constant fold unary operations with a floating point constant operand.
- if (ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(Operand.getNode())) {
+ if (ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(Operand)) {
APFloat V = C->getValueAPF(); // make copy
switch (Opcode) {
case ISD::FNEG:
@@ -2922,7 +2930,7 @@ SDValue SelectionDAG::getNode(unsigned Opcode, SDLoc DL,
}
// Constant fold unary operations with a vector integer or float operand.
- if (BuildVectorSDNode *BV = dyn_cast<BuildVectorSDNode>(Operand.getNode())) {
+ if (BuildVectorSDNode *BV = dyn_cast<BuildVectorSDNode>(Operand)) {
if (BV->isConstant()) {
switch (Opcode) {
default:
@@ -3278,8 +3286,8 @@ SDValue SelectionDAG::FoldConstantArithmetic(unsigned Opcode, SDLoc DL, EVT VT,
SDValue SelectionDAG::getNode(unsigned Opcode, SDLoc DL, EVT VT, SDValue N1,
SDValue N2, const SDNodeFlags *Flags) {
- ConstantSDNode *N1C = dyn_cast<ConstantSDNode>(N1.getNode());
- ConstantSDNode *N2C = dyn_cast<ConstantSDNode>(N2.getNode());
+ ConstantSDNode *N1C = dyn_cast<ConstantSDNode>(N1);
+ ConstantSDNode *N2C = dyn_cast<ConstantSDNode>(N2);
switch (Opcode) {
default: break;
case ISD::TokenFactor:
@@ -3499,7 +3507,7 @@ SDValue SelectionDAG::getNode(unsigned Opcode, SDLoc DL, EVT VT, SDValue N1,
Ops.push_back(Op);
continue;
}
- if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op.getNode())) {
+ if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
APInt Val = C->getAPIntValue();
Ops.push_back(SignExtendInReg(Val));
continue;
@@ -3554,7 +3562,7 @@ SDValue SelectionDAG::getNode(unsigned Opcode, SDLoc DL, EVT VT, SDValue N1,
// if the indices are known different, extract the element from
// the original vector.
SDValue N1Op2 = N1.getOperand(2);
- ConstantSDNode *N1Op2C = dyn_cast<ConstantSDNode>(N1Op2.getNode());
+ ConstantSDNode *N1Op2C = dyn_cast<ConstantSDNode>(N1Op2);
if (N1Op2C && N2C) {
if (N1Op2C->getZExtValue() == N2C->getZExtValue()) {
@@ -3600,9 +3608,9 @@ SDValue SelectionDAG::getNode(unsigned Opcode, SDLoc DL, EVT VT, SDValue N1,
assert(VT.getSimpleVT() <= N1.getSimpleValueType() &&
"Extract subvector must be from larger vector to smaller vector!");
- if (isa<ConstantSDNode>(Index.getNode())) {
+ if (isa<ConstantSDNode>(Index)) {
assert((VT.getVectorNumElements() +
- cast<ConstantSDNode>(Index.getNode())->getZExtValue()
+ cast<ConstantSDNode>(Index)->getZExtValue()
<= N1.getValueType().getVectorNumElements())
&& "Extract subvector overflow!");
}
@@ -3628,8 +3636,8 @@ SDValue SelectionDAG::getNode(unsigned Opcode, SDLoc DL, EVT VT, SDValue N1,
// Constant fold FP operations.
bool HasFPExceptions = TLI->hasFloatingPointExceptions();
- ConstantFPSDNode *N1CFP = dyn_cast<ConstantFPSDNode>(N1.getNode());
- ConstantFPSDNode *N2CFP = dyn_cast<ConstantFPSDNode>(N2.getNode());
+ ConstantFPSDNode *N1CFP = dyn_cast<ConstantFPSDNode>(N1);
+ ConstantFPSDNode *N2CFP = dyn_cast<ConstantFPSDNode>(N2);
if (N1CFP) {
if (!N2CFP && isCommutativeBinOp(Opcode)) {
// Canonicalize constant to RHS if commutative.
@@ -3787,7 +3795,7 @@ SDValue SelectionDAG::getNode(unsigned Opcode, SDLoc DL, EVT VT, SDValue N1,
SDValue SelectionDAG::getNode(unsigned Opcode, SDLoc DL, EVT VT,
SDValue N1, SDValue N2, SDValue N3) {
// Perform various simplifications.
- ConstantSDNode *N1C = dyn_cast<ConstantSDNode>(N1.getNode());
+ ConstantSDNode *N1C = dyn_cast<ConstantSDNode>(N1);
switch (Opcode) {
case ISD::FMA: {
ConstantFPSDNode *N1CFP = dyn_cast<ConstantFPSDNode>(N1);
@@ -3845,9 +3853,9 @@ SDValue SelectionDAG::getNode(unsigned Opcode, SDLoc DL, EVT VT,
"Dest and insert subvector source types must match!");
assert(N2.getSimpleValueType() <= N1.getSimpleValueType() &&
"Insert subvector must be from smaller vector to larger vector!");
- if (isa<ConstantSDNode>(Index.getNode())) {
+ if (isa<ConstantSDNode>(Index)) {
assert((N2.getValueType().getVectorNumElements() +
- cast<ConstantSDNode>(Index.getNode())->getZExtValue()
+ cast<ConstantSDNode>(Index)->getZExtValue()
<= VT.getVectorNumElements())
&& "Insert subvector overflow!");
}
@@ -3994,7 +4002,7 @@ static SDValue getMemsetStringVal(EVT VT, SDLoc dl, SelectionDAG &DAG,
unsigned NumBytes = std::min(NumVTBytes, unsigned(Str.size()));
APInt Val(NumVTBits, 0);
- if (TLI.isLittleEndian()) {
+ if (DAG.getDataLayout().isLittleEndian()) {
for (unsigned i = 0; i != NumBytes; ++i)
Val |= (uint64_t)(unsigned char)Str[i] << i*8;
} else {
@@ -4066,9 +4074,9 @@ static bool FindOptimalMemOpLowering(std::vector<EVT> &MemOps,
if (VT == MVT::Other) {
unsigned AS = 0;
- if (DstAlign >= TLI.getDataLayout()->getPointerPrefAlignment(AS) ||
+ if (DstAlign >= DAG.getDataLayout().getPointerPrefAlignment(AS) ||
TLI.allowsMisalignedMemoryAccesses(VT, AS, DstAlign)) {
- VT = TLI.getPointerTy();
+ VT = TLI.getPointerTy(DAG.getDataLayout());
} else {
switch (DstAlign & 7) {
case 0: VT = MVT::i64; break;
@@ -4185,14 +4193,14 @@ static SDValue getMemcpyLoadsAndStores(SelectionDAG &DAG, SDLoc dl,
if (DstAlignCanChange) {
Type *Ty = MemOps[0].getTypeForEVT(*DAG.getContext());
- unsigned NewAlign = (unsigned) TLI.getDataLayout()->getABITypeAlignment(Ty);
+ unsigned NewAlign = (unsigned)DAG.getDataLayout().getABITypeAlignment(Ty);
// Don't promote to an alignment that would require dynamic stack
// realignment.
const TargetRegisterInfo *TRI = MF.getSubtarget().getRegisterInfo();
if (!TRI->needsStackRealignment(MF))
- while (NewAlign > Align &&
- TLI.getDataLayout()->exceedsNaturalStackAlignment(NewAlign))
+ while (NewAlign > Align &&
+ DAG.getDataLayout().exceedsNaturalStackAlignment(NewAlign))
NewAlign /= 2;
if (NewAlign > Align) {
@@ -4294,7 +4302,7 @@ static SDValue getMemmoveLoadsAndStores(SelectionDAG &DAG, SDLoc dl,
if (DstAlignCanChange) {
Type *Ty = MemOps[0].getTypeForEVT(*DAG.getContext());
- unsigned NewAlign = (unsigned) TLI.getDataLayout()->getABITypeAlignment(Ty);
+ unsigned NewAlign = (unsigned)DAG.getDataLayout().getABITypeAlignment(Ty);
if (NewAlign > Align) {
// Give the stack frame object a larger alignment if needed.
if (MFI->getObjectAlignment(FI->getIndex()) < NewAlign)
@@ -4385,7 +4393,7 @@ static SDValue getMemsetStores(SelectionDAG &DAG, SDLoc dl,
if (DstAlignCanChange) {
Type *Ty = MemOps[0].getTypeForEVT(*DAG.getContext());
- unsigned NewAlign = (unsigned) TLI.getDataLayout()->getABITypeAlignment(Ty);
+ unsigned NewAlign = (unsigned)DAG.getDataLayout().getABITypeAlignment(Ty);
if (NewAlign > Align) {
// Give the stack frame object a larger alignment if needed.
if (MFI->getObjectAlignment(FI->getIndex()) < NewAlign)
@@ -4488,19 +4496,21 @@ SDValue SelectionDAG::getMemcpy(SDValue Chain, SDLoc dl, SDValue Dst,
// Emit a library call.
TargetLowering::ArgListTy Args;
TargetLowering::ArgListEntry Entry;
- Entry.Ty = TLI->getDataLayout()->getIntPtrType(*getContext());
+ Entry.Ty = getDataLayout().getIntPtrType(*getContext());
Entry.Node = Dst; Args.push_back(Entry);
Entry.Node = Src; Args.push_back(Entry);
Entry.Node = Size; Args.push_back(Entry);
// FIXME: pass in SDLoc
TargetLowering::CallLoweringInfo CLI(*this);
- CLI.setDebugLoc(dl).setChain(Chain)
- .setCallee(TLI->getLibcallCallingConv(RTLIB::MEMCPY),
- Type::getVoidTy(*getContext()),
- getExternalSymbol(TLI->getLibcallName(RTLIB::MEMCPY),
- TLI->getPointerTy()), std::move(Args), 0)
- .setDiscardResult()
- .setTailCall(isTailCall);
+ CLI.setDebugLoc(dl)
+ .setChain(Chain)
+ .setCallee(TLI->getLibcallCallingConv(RTLIB::MEMCPY),
+ Type::getVoidTy(*getContext()),
+ getExternalSymbol(TLI->getLibcallName(RTLIB::MEMCPY),
+ TLI->getPointerTy(getDataLayout())),
+ std::move(Args), 0)
+ .setDiscardResult()
+ .setTailCall(isTailCall);
std::pair<SDValue,SDValue> CallResult = TLI->LowerCallTo(CLI);
return CallResult.second;
@@ -4544,19 +4554,21 @@ SDValue SelectionDAG::getMemmove(SDValue Chain, SDLoc dl, SDValue Dst,
// Emit a library call.
TargetLowering::ArgListTy Args;
TargetLowering::ArgListEntry Entry;
- Entry.Ty = TLI->getDataLayout()->getIntPtrType(*getContext());
+ Entry.Ty = getDataLayout().getIntPtrType(*getContext());
Entry.Node = Dst; Args.push_back(Entry);
Entry.Node = Src; Args.push_back(Entry);
Entry.Node = Size; Args.push_back(Entry);
// FIXME: pass in SDLoc
TargetLowering::CallLoweringInfo CLI(*this);
- CLI.setDebugLoc(dl).setChain(Chain)
- .setCallee(TLI->getLibcallCallingConv(RTLIB::MEMMOVE),
- Type::getVoidTy(*getContext()),
- getExternalSymbol(TLI->getLibcallName(RTLIB::MEMMOVE),
- TLI->getPointerTy()), std::move(Args), 0)
- .setDiscardResult()
- .setTailCall(isTailCall);
+ CLI.setDebugLoc(dl)
+ .setChain(Chain)
+ .setCallee(TLI->getLibcallCallingConv(RTLIB::MEMMOVE),
+ Type::getVoidTy(*getContext()),
+ getExternalSymbol(TLI->getLibcallName(RTLIB::MEMMOVE),
+ TLI->getPointerTy(getDataLayout())),
+ std::move(Args), 0)
+ .setDiscardResult()
+ .setTailCall(isTailCall);
std::pair<SDValue,SDValue> CallResult = TLI->LowerCallTo(CLI);
return CallResult.second;
@@ -4594,7 +4606,7 @@ SDValue SelectionDAG::getMemset(SDValue Chain, SDLoc dl, SDValue Dst,
}
// Emit a library call.
- Type *IntPtrTy = TLI->getDataLayout()->getIntPtrType(*getContext());
+ Type *IntPtrTy = getDataLayout().getIntPtrType(*getContext());
TargetLowering::ArgListTy Args;
TargetLowering::ArgListEntry Entry;
Entry.Node = Dst; Entry.Ty = IntPtrTy;
@@ -4608,13 +4620,15 @@ SDValue SelectionDAG::getMemset(SDValue Chain, SDLoc dl, SDValue Dst,
// FIXME: pass in SDLoc
TargetLowering::CallLoweringInfo CLI(*this);
- CLI.setDebugLoc(dl).setChain(Chain)
- .setCallee(TLI->getLibcallCallingConv(RTLIB::MEMSET),
- Type::getVoidTy(*getContext()),
- getExternalSymbol(TLI->getLibcallName(RTLIB::MEMSET),
- TLI->getPointerTy()), std::move(Args), 0)
- .setDiscardResult()
- .setTailCall(isTailCall);
+ CLI.setDebugLoc(dl)
+ .setChain(Chain)
+ .setCallee(TLI->getLibcallCallingConv(RTLIB::MEMSET),
+ Type::getVoidTy(*getContext()),
+ getExternalSymbol(TLI->getLibcallName(RTLIB::MEMSET),
+ TLI->getPointerTy(getDataLayout())),
+ std::move(Args), 0)
+ .setDiscardResult()
+ .setTailCall(isTailCall);
std::pair<SDValue,SDValue> CallResult = TLI->LowerCallTo(CLI);
return CallResult.second;
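The getMemcpy/getMemmove/getMemset fallbacks above all emit a runtime library call through the same CallLoweringInfo builder chain; only the RTLIB entry differs. A trimmed sketch of that shared shape, assuming the in-tree types (emitMemLibcall is a made-up name, and the tail-call flag is omitted):

    // Sketch of the shared libcall-emission pattern; returns the new chain.
    #include "llvm/CodeGen/SelectionDAG.h"
    #include "llvm/IR/Type.h"
    #include "llvm/Target/TargetLowering.h"
    using namespace llvm;

    static SDValue emitMemLibcall(SelectionDAG &DAG, SDLoc dl, RTLIB::Libcall LC,
                                  TargetLowering::ArgListTy Args, SDValue Chain) {
      const TargetLowering &TLI = DAG.getTargetLoweringInfo();
      TargetLowering::CallLoweringInfo CLI(DAG);
      CLI.setDebugLoc(dl)
          .setChain(Chain)
          .setCallee(TLI.getLibcallCallingConv(LC),
                     Type::getVoidTy(*DAG.getContext()),
                     DAG.getExternalSymbol(TLI.getLibcallName(LC),
                                           TLI.getPointerTy(DAG.getDataLayout())),
                     std::move(Args), 0)
          .setDiscardResult();
      return TLI.LowerCallTo(CLI).second;
    }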
@@ -6656,7 +6670,7 @@ bool SDNode::hasAnyUseOfValue(unsigned Value) const {
/// isOnlyUserOf - Return true if this node is the only use of N.
///
-bool SDNode::isOnlyUserOf(SDNode *N) const {
+bool SDNode::isOnlyUserOf(const SDNode *N) const {
bool Seen = false;
for (SDNode::use_iterator I = N->use_begin(), E = N->use_end(); I != E; ++I) {
SDNode *User = *I;
@@ -6671,16 +6685,16 @@ bool SDNode::isOnlyUserOf(SDNode *N) const {
/// isOperand - Return true if this node is an operand of N.
///
-bool SDValue::isOperandOf(SDNode *N) const {
+bool SDValue::isOperandOf(const SDNode *N) const {
for (const SDValue &Op : N->op_values())
if (*this == Op)
return true;
return false;
}
-bool SDNode::isOperandOf(SDNode *N) const {
- for (unsigned i = 0, e = N->NumOperands; i != e; ++i)
- if (this == N->OperandList[i].getNode())
+bool SDNode::isOperandOf(const SDNode *N) const {
+ for (const SDValue &Op : N->op_values())
+ if (this == Op.getNode())
return true;
return false;
}
@@ -6784,10 +6798,9 @@ SDValue SelectionDAG::UnrollVectorOp(SDNode *N, unsigned ResNE) {
if (OperandVT.isVector()) {
// A vector operand; extract a single element.
EVT OperandEltVT = OperandVT.getVectorElementType();
- Operands[j] = getNode(ISD::EXTRACT_VECTOR_ELT, dl,
- OperandEltVT,
- Operand,
- getConstant(i, dl, TLI->getVectorIdxTy()));
+ Operands[j] =
+ getNode(ISD::EXTRACT_VECTOR_ELT, dl, OperandEltVT, Operand,
+ getConstant(i, dl, TLI->getVectorIdxTy(getDataLayout())));
} else {
// A scalar operand; just use it as is.
Operands[j] = Operand;
@@ -6891,10 +6904,10 @@ unsigned SelectionDAG::InferPtrAlignment(SDValue Ptr) const {
const GlobalValue *GV;
int64_t GVOffset = 0;
if (TLI->isGAPlusOffset(Ptr.getNode(), GV, GVOffset)) {
- unsigned PtrWidth = TLI->getPointerTypeSizeInBits(GV->getType());
+ unsigned PtrWidth = getDataLayout().getPointerTypeSizeInBits(GV->getType());
APInt KnownZero(PtrWidth, 0), KnownOne(PtrWidth, 0);
llvm::computeKnownBits(const_cast<GlobalValue *>(GV), KnownZero, KnownOne,
- *TLI->getDataLayout());
+ getDataLayout());
unsigned AlignBits = KnownZero.countTrailingOnes();
unsigned Align = AlignBits ? 1 << std::min(31U, AlignBits) : 0;
if (Align)
@@ -6950,10 +6963,10 @@ SelectionDAG::SplitVector(const SDValue &N, const SDLoc &DL, const EVT &LoVT,
"More vector elements requested than available!");
SDValue Lo, Hi;
Lo = getNode(ISD::EXTRACT_SUBVECTOR, DL, LoVT, N,
- getConstant(0, DL, TLI->getVectorIdxTy()));
+ getConstant(0, DL, TLI->getVectorIdxTy(getDataLayout())));
Hi = getNode(ISD::EXTRACT_SUBVECTOR, DL, HiVT, N,
getConstant(LoVT.getVectorNumElements(), DL,
- TLI->getVectorIdxTy()));
+ TLI->getVectorIdxTy(getDataLayout())));
return std::make_pair(Lo, Hi);
}
@@ -6965,7 +6978,7 @@ void SelectionDAG::ExtractVectorElements(SDValue Op,
Count = VT.getVectorNumElements();
EVT EltVT = VT.getVectorElementType();
- EVT IdxTy = TLI->getVectorIdxTy();
+ EVT IdxTy = TLI->getVectorIdxTy(getDataLayout());
SDLoc SL(Op);
for (unsigned i = Start, e = Start + Count; i != e; ++i) {
Args.push_back(getNode(ISD::EXTRACT_VECTOR_ELT, SL, EltVT,
@@ -7080,14 +7093,12 @@ SDValue BuildVectorSDNode::getSplatValue(BitVector *UndefElements) const {
ConstantSDNode *
BuildVectorSDNode::getConstantSplatNode(BitVector *UndefElements) const {
- return dyn_cast_or_null<ConstantSDNode>(
- getSplatValue(UndefElements).getNode());
+ return dyn_cast_or_null<ConstantSDNode>(getSplatValue(UndefElements));
}
ConstantFPSDNode *
BuildVectorSDNode::getConstantFPSplatNode(BitVector *UndefElements) const {
- return dyn_cast_or_null<ConstantFPSDNode>(
- getSplatValue(UndefElements).getNode());
+ return dyn_cast_or_null<ConstantFPSDNode>(getSplatValue(UndefElements));
}
bool BuildVectorSDNode::isConstant() const {
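A second recurring cleanup in SelectionDAG.cpp drops the explicit .getNode() before isa/cast/dyn_cast: the cast templates accept an SDValue directly through its simplify_type specialization. A short sketch (isNonZeroConstant is a made-up helper):

    // Sketch: dyn_cast works on the SDValue itself, so V.getNode() is redundant.
    #include "llvm/CodeGen/SelectionDAGNodes.h"
    using namespace llvm;

    static bool isNonZeroConstant(SDValue V) {
      if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(V))
        return !C->isNullValue();
      return false;
    }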
diff --git a/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp b/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
index 4897082f89aa..2c3c0eb101a0 100644
--- a/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
+++ b/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
@@ -146,7 +146,7 @@ static SDValue getCopyFromParts(SelectionDAG &DAG, SDLoc DL,
Hi = DAG.getNode(ISD::BITCAST, DL, HalfVT, Parts[1]);
}
- if (TLI.isBigEndian())
+ if (DAG.getDataLayout().isBigEndian())
std::swap(Lo, Hi);
Val = DAG.getNode(ISD::BUILD_PAIR, DL, RoundVT, Lo, Hi);
@@ -160,13 +160,14 @@ static SDValue getCopyFromParts(SelectionDAG &DAG, SDLoc DL,
// Combine the round and odd parts.
Lo = Val;
- if (TLI.isBigEndian())
+ if (DAG.getDataLayout().isBigEndian())
std::swap(Lo, Hi);
EVT TotalVT = EVT::getIntegerVT(*DAG.getContext(), NumParts * PartBits);
Hi = DAG.getNode(ISD::ANY_EXTEND, DL, TotalVT, Hi);
- Hi = DAG.getNode(ISD::SHL, DL, TotalVT, Hi,
- DAG.getConstant(Lo.getValueType().getSizeInBits(), DL,
- TLI.getPointerTy()));
+ Hi =
+ DAG.getNode(ISD::SHL, DL, TotalVT, Hi,
+ DAG.getConstant(Lo.getValueType().getSizeInBits(), DL,
+ TLI.getPointerTy(DAG.getDataLayout())));
Lo = DAG.getNode(ISD::ZERO_EXTEND, DL, TotalVT, Lo);
Val = DAG.getNode(ISD::OR, DL, TotalVT, Lo, Hi);
}
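The Hi/Lo assembly in getCopyFromParts above is ordinary shift-and-or arithmetic: the odd (high) part is any-extended, shifted left by the bit width of the round (low) part, and OR'd with the zero-extended low part, after swapping the two when the part ordering is big-endian. A standalone check with plain integers (not LLVM code):

    // Worked example of the part-combining step: a 24-bit value split into a
    // 16-bit round part and an 8-bit odd part is rebuilt as (Hi << 16) | Lo.
    #include <cassert>
    #include <cstdint>

    int main() {
      uint32_t Original = 0xABCDEF;       // 24-bit value
      uint32_t Lo = Original & 0xFFFF;    // round part, low 16 bits: 0xCDEF
      uint32_t Hi = Original >> 16;       // odd part, high 8 bits: 0xAB
      uint32_t Val = (Hi << 16) | Lo;     // SHL by Lo's width, then OR
      assert(Val == Original);
      return 0;
    }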
@@ -177,7 +178,7 @@ static SDValue getCopyFromParts(SelectionDAG &DAG, SDLoc DL,
SDValue Lo, Hi;
Lo = DAG.getNode(ISD::BITCAST, DL, EVT(MVT::f64), Parts[0]);
Hi = DAG.getNode(ISD::BITCAST, DL, EVT(MVT::f64), Parts[1]);
- if (TLI.hasBigEndianPartOrdering(ValueVT))
+ if (TLI.hasBigEndianPartOrdering(ValueVT, DAG.getDataLayout()))
std::swap(Lo, Hi);
Val = DAG.getNode(ISD::BUILD_PAIR, DL, ValueVT, Lo, Hi);
} else {
@@ -211,8 +212,9 @@ static SDValue getCopyFromParts(SelectionDAG &DAG, SDLoc DL,
if (PartEVT.isFloatingPoint() && ValueVT.isFloatingPoint()) {
// FP_ROUND's are always exact here.
if (ValueVT.bitsLT(Val.getValueType()))
- return DAG.getNode(ISD::FP_ROUND, DL, ValueVT, Val,
- DAG.getTargetConstant(1, DL, TLI.getPointerTy()));
+ return DAG.getNode(
+ ISD::FP_ROUND, DL, ValueVT, Val,
+ DAG.getTargetConstant(1, DL, TLI.getPointerTy(DAG.getDataLayout())));
return DAG.getNode(ISD::FP_EXTEND, DL, ValueVT, Val);
}
@@ -305,8 +307,9 @@ static SDValue getCopyFromPartsVector(SelectionDAG &DAG, SDLoc DL,
if (PartEVT.getVectorElementType() == ValueVT.getVectorElementType()) {
assert(PartEVT.getVectorNumElements() > ValueVT.getVectorNumElements() &&
"Cannot narrow, it would be a lossy transformation");
- return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, ValueVT, Val,
- DAG.getConstant(0, DL, TLI.getVectorIdxTy()));
+ return DAG.getNode(
+ ISD::EXTRACT_SUBVECTOR, DL, ValueVT, Val,
+ DAG.getConstant(0, DL, TLI.getVectorIdxTy(DAG.getDataLayout())));
}
// Vector/Vector bitcast.
@@ -362,10 +365,10 @@ static void getCopyToParts(SelectionDAG &DAG, SDLoc DL,
if (ValueVT.isVector())
return getCopyToPartsVector(DAG, DL, Val, Parts, NumParts, PartVT, V);
- const TargetLowering &TLI = DAG.getTargetLoweringInfo();
unsigned PartBits = PartVT.getSizeInBits();
unsigned OrigNumParts = NumParts;
- assert(TLI.isTypeLegal(PartVT) && "Copying to an illegal type!");
+ assert(DAG.getTargetLoweringInfo().isTypeLegal(PartVT) &&
+ "Copying to an illegal type!");
if (NumParts == 0)
return;
@@ -433,7 +436,7 @@ static void getCopyToParts(SelectionDAG &DAG, SDLoc DL,
DAG.getIntPtrConstant(RoundBits, DL));
getCopyToParts(DAG, DL, OddVal, Parts + RoundParts, OddParts, PartVT, V);
- if (TLI.isBigEndian())
+ if (DAG.getDataLayout().isBigEndian())
// The odd parts were reversed by getCopyToParts - unreverse them.
std::reverse(Parts + RoundParts, Parts + NumParts);
@@ -468,7 +471,7 @@ static void getCopyToParts(SelectionDAG &DAG, SDLoc DL,
}
}
- if (TLI.isBigEndian())
+ if (DAG.getDataLayout().isBigEndian())
std::reverse(Parts, Parts + OrigNumParts);
}
@@ -497,9 +500,9 @@ static void getCopyToPartsVector(SelectionDAG &DAG, SDLoc DL,
// undef elements.
SmallVector<SDValue, 16> Ops;
for (unsigned i = 0, e = ValueVT.getVectorNumElements(); i != e; ++i)
- Ops.push_back(DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL,
- ElementVT, Val, DAG.getConstant(i, DL,
- TLI.getVectorIdxTy())));
+ Ops.push_back(DAG.getNode(
+ ISD::EXTRACT_VECTOR_ELT, DL, ElementVT, Val,
+ DAG.getConstant(i, DL, TLI.getVectorIdxTy(DAG.getDataLayout()))));
for (unsigned i = ValueVT.getVectorNumElements(),
e = PartVT.getVectorNumElements(); i != e; ++i)
@@ -524,9 +527,9 @@ static void getCopyToPartsVector(SelectionDAG &DAG, SDLoc DL,
// Vector -> scalar conversion.
assert(ValueVT.getVectorNumElements() == 1 &&
"Only trivial vector-to-scalar conversions should get here!");
- Val = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL,
- PartVT, Val,
- DAG.getConstant(0, DL, TLI.getVectorIdxTy()));
+ Val = DAG.getNode(
+ ISD::EXTRACT_VECTOR_ELT, DL, PartVT, Val,
+ DAG.getConstant(0, DL, TLI.getVectorIdxTy(DAG.getDataLayout())));
bool Smaller = ValueVT.bitsLE(PartVT);
Val = DAG.getNode((Smaller ? ISD::TRUNCATE : ISD::ANY_EXTEND),
@@ -554,14 +557,14 @@ static void getCopyToPartsVector(SelectionDAG &DAG, SDLoc DL,
SmallVector<SDValue, 8> Ops(NumIntermediates);
for (unsigned i = 0; i != NumIntermediates; ++i) {
if (IntermediateVT.isVector())
- Ops[i] = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL,
- IntermediateVT, Val,
- DAG.getConstant(i * (NumElements / NumIntermediates), DL,
- TLI.getVectorIdxTy()));
+ Ops[i] =
+ DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, IntermediateVT, Val,
+ DAG.getConstant(i * (NumElements / NumIntermediates), DL,
+ TLI.getVectorIdxTy(DAG.getDataLayout())));
else
- Ops[i] = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL,
- IntermediateVT, Val,
- DAG.getConstant(i, DL, TLI.getVectorIdxTy()));
+ Ops[i] = DAG.getNode(
+ ISD::EXTRACT_VECTOR_ELT, DL, IntermediateVT, Val,
+ DAG.getConstant(i, DL, TLI.getVectorIdxTy(DAG.getDataLayout())));
}
// Split the intermediate operands into legal parts.
@@ -588,14 +591,14 @@ RegsForValue::RegsForValue(const SmallVector<unsigned, 4> &regs, MVT regvt,
EVT valuevt)
: ValueVTs(1, valuevt), RegVTs(1, regvt), Regs(regs) {}
-RegsForValue::RegsForValue(LLVMContext &Context, const TargetLowering &tli,
- unsigned Reg, Type *Ty) {
- ComputeValueVTs(tli, Ty, ValueVTs);
+RegsForValue::RegsForValue(LLVMContext &Context, const TargetLowering &TLI,
+ const DataLayout &DL, unsigned Reg, Type *Ty) {
+ ComputeValueVTs(TLI, DL, Ty, ValueVTs);
for (unsigned Value = 0, e = ValueVTs.size(); Value != e; ++Value) {
EVT ValueVT = ValueVTs[Value];
- unsigned NumRegs = tli.getNumRegisters(Context, ValueVT);
- MVT RegisterVT = tli.getRegisterType(Context, ValueVT);
+ unsigned NumRegs = TLI.getNumRegisters(Context, ValueVT);
+ MVT RegisterVT = TLI.getRegisterType(Context, ValueVT);
for (unsigned i = 0; i != NumRegs; ++i)
Regs.push_back(Reg + i);
RegVTs.push_back(RegisterVT);
@@ -796,7 +799,7 @@ void RegsForValue::AddInlineAsmOperands(unsigned Code, bool HasMatching,
if (TheReg == SP && Code == InlineAsm::Kind_Clobber) {
// If we clobbered the stack pointer, MFI should know about it.
assert(DAG.getMachineFunction().getFrameInfo()->
- hasInlineAsmWithSPAdjust());
+ hasOpaqueSPAdjustment());
}
}
}
@@ -807,7 +810,7 @@ void SelectionDAGBuilder::init(GCFunctionInfo *gfi, AliasAnalysis &aa,
AA = &aa;
GFI = gfi;
LibInfo = li;
- DL = DAG.getTarget().getDataLayout();
+ DL = &DAG.getDataLayout();
Context = DAG.getContext();
LPadToCallSiteMap.clear();
}
@@ -964,8 +967,8 @@ SDValue SelectionDAGBuilder::getCopyFromRegs(const Value *V, Type *Ty) {
if (It != FuncInfo.ValueMap.end()) {
unsigned InReg = It->second;
- RegsForValue RFV(*DAG.getContext(), DAG.getTargetLoweringInfo(), InReg,
- Ty);
+ RegsForValue RFV(*DAG.getContext(), DAG.getTargetLoweringInfo(),
+ DAG.getDataLayout(), InReg, Ty);
SDValue Chain = DAG.getEntryNode();
Result = RFV.getCopyFromRegs(DAG, FuncInfo, getCurSDLoc(), Chain, nullptr, V);
resolveDanglingDebugInfo(V, Result);
@@ -1031,7 +1034,7 @@ SDValue SelectionDAGBuilder::getValueImpl(const Value *V) {
const TargetLowering &TLI = DAG.getTargetLoweringInfo();
if (const Constant *C = dyn_cast<Constant>(V)) {
- EVT VT = TLI.getValueType(V->getType(), true);
+ EVT VT = TLI.getValueType(DAG.getDataLayout(), V->getType(), true);
if (const ConstantInt *CI = dyn_cast<ConstantInt>(C))
return DAG.getConstant(*CI, getCurSDLoc(), VT);
@@ -1041,7 +1044,8 @@ SDValue SelectionDAGBuilder::getValueImpl(const Value *V) {
if (isa<ConstantPointerNull>(C)) {
unsigned AS = V->getType()->getPointerAddressSpace();
- return DAG.getConstant(0, getCurSDLoc(), TLI.getPointerTy(AS));
+ return DAG.getConstant(0, getCurSDLoc(),
+ TLI.getPointerTy(DAG.getDataLayout(), AS));
}
if (const ConstantFP *CFP = dyn_cast<ConstantFP>(C))
@@ -1095,7 +1099,7 @@ SDValue SelectionDAGBuilder::getValueImpl(const Value *V) {
"Unknown struct or array constant!");
SmallVector<EVT, 4> ValueVTs;
- ComputeValueVTs(TLI, C->getType(), ValueVTs);
+ ComputeValueVTs(TLI, DAG.getDataLayout(), C->getType(), ValueVTs);
unsigned NumElts = ValueVTs.size();
if (NumElts == 0)
return SDValue(); // empty struct
@@ -1127,7 +1131,8 @@ SDValue SelectionDAGBuilder::getValueImpl(const Value *V) {
Ops.push_back(getValue(CV->getOperand(i)));
} else {
assert(isa<ConstantAggregateZero>(C) && "Unknown vector constant!");
- EVT EltVT = TLI.getValueType(VecTy->getElementType());
+ EVT EltVT =
+ TLI.getValueType(DAG.getDataLayout(), VecTy->getElementType());
SDValue Op;
if (EltVT.isFloatingPoint())
@@ -1147,13 +1152,15 @@ SDValue SelectionDAGBuilder::getValueImpl(const Value *V) {
DenseMap<const AllocaInst*, int>::iterator SI =
FuncInfo.StaticAllocaMap.find(AI);
if (SI != FuncInfo.StaticAllocaMap.end())
- return DAG.getFrameIndex(SI->second, TLI.getPointerTy());
+ return DAG.getFrameIndex(SI->second,
+ TLI.getPointerTy(DAG.getDataLayout()));
}
// If this is an instruction which fast-isel has deferred, select it now.
if (const Instruction *Inst = dyn_cast<Instruction>(V)) {
unsigned InReg = FuncInfo.InitializeRegForValue(Inst);
- RegsForValue RFV(*DAG.getContext(), TLI, InReg, Inst->getType());
+ RegsForValue RFV(*DAG.getContext(), TLI, DAG.getDataLayout(), InReg,
+ Inst->getType());
SDValue Chain = DAG.getEntryNode();
return RFV.getCopyFromRegs(DAG, FuncInfo, getCurSDLoc(), Chain, nullptr, V);
}
@@ -1163,6 +1170,7 @@ SDValue SelectionDAGBuilder::getValueImpl(const Value *V) {
void SelectionDAGBuilder::visitRet(const ReturnInst &I) {
const TargetLowering &TLI = DAG.getTargetLoweringInfo();
+ auto &DL = DAG.getDataLayout();
SDValue Chain = getControlRoot();
SmallVector<ISD::OutputArg, 8> Outs;
SmallVector<SDValue, 8> OutVals;
@@ -1175,7 +1183,7 @@ void SelectionDAGBuilder::visitRet(const ReturnInst &I) {
// Leave Outs empty so that LowerReturn won't try to load return
// registers the usual way.
SmallVector<EVT, 1> PtrValueVTs;
- ComputeValueVTs(TLI, PointerType::getUnqual(F->getReturnType()),
+ ComputeValueVTs(TLI, DL, PointerType::getUnqual(F->getReturnType()),
PtrValueVTs);
SDValue RetPtr = DAG.getRegister(DemoteReg, PtrValueVTs[0]);
@@ -1183,7 +1191,7 @@ void SelectionDAGBuilder::visitRet(const ReturnInst &I) {
SmallVector<EVT, 4> ValueVTs;
SmallVector<uint64_t, 4> Offsets;
- ComputeValueVTs(TLI, I.getOperand(0)->getType(), ValueVTs, &Offsets);
+ ComputeValueVTs(TLI, DL, I.getOperand(0)->getType(), ValueVTs, &Offsets);
unsigned NumValues = ValueVTs.size();
SmallVector<SDValue, 4> Chains(NumValues);
@@ -1203,7 +1211,7 @@ void SelectionDAGBuilder::visitRet(const ReturnInst &I) {
MVT::Other, Chains);
} else if (I.getNumOperands() != 0) {
SmallVector<EVT, 4> ValueVTs;
- ComputeValueVTs(TLI, I.getOperand(0)->getType(), ValueVTs);
+ ComputeValueVTs(TLI, DL, I.getOperand(0)->getType(), ValueVTs);
unsigned NumValues = ValueVTs.size();
if (NumValues) {
SDValue RetOp = getValue(I.getOperand(0));
@@ -1692,7 +1700,7 @@ void SelectionDAGBuilder::visitSwitchCase(CaseBlock &CB,
void SelectionDAGBuilder::visitJumpTable(JumpTable &JT) {
// Emit the code for the jump table
assert(JT.Reg != -1U && "Should lower JT Header first!");
- EVT PTy = DAG.getTargetLoweringInfo().getPointerTy();
+ EVT PTy = DAG.getTargetLoweringInfo().getPointerTy(DAG.getDataLayout());
SDValue Index = DAG.getCopyFromReg(getControlRoot(), getCurSDLoc(),
JT.Reg, PTy);
SDValue Table = DAG.getJumpTable(JT.JTI, PTy);
@@ -1723,9 +1731,10 @@ void SelectionDAGBuilder::visitJumpTableHeader(JumpTable &JT,
// This value may be smaller or larger than the target's pointer type, and
// therefore require extension or truncating.
const TargetLowering &TLI = DAG.getTargetLoweringInfo();
- SwitchOp = DAG.getZExtOrTrunc(Sub, dl, TLI.getPointerTy());
+ SwitchOp = DAG.getZExtOrTrunc(Sub, dl, TLI.getPointerTy(DAG.getDataLayout()));
- unsigned JumpTableReg = FuncInfo.CreateReg(TLI.getPointerTy());
+ unsigned JumpTableReg =
+ FuncInfo.CreateReg(TLI.getPointerTy(DAG.getDataLayout()));
SDValue CopyTo = DAG.getCopyToReg(getControlRoot(), dl,
JumpTableReg, SwitchOp);
JT.Reg = JumpTableReg;
@@ -1733,11 +1742,10 @@ void SelectionDAGBuilder::visitJumpTableHeader(JumpTable &JT,
// Emit the range check for the jump table, and branch to the default block
// for the switch statement if the value being switched on exceeds the largest
// case in the switch.
- SDValue CMP =
- DAG.getSetCC(dl, TLI.getSetCCResultType(*DAG.getContext(),
- Sub.getValueType()),
- Sub, DAG.getConstant(JTH.Last - JTH.First, dl, VT),
- ISD::SETUGT);
+ SDValue CMP = DAG.getSetCC(
+ dl, TLI.getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(),
+ Sub.getValueType()),
+ Sub, DAG.getConstant(JTH.Last - JTH.First, dl, VT), ISD::SETUGT);
SDValue BrCond = DAG.getNode(ISD::BRCOND, dl,
MVT::Other, CopyTo, CMP,
@@ -1762,7 +1770,7 @@ void SelectionDAGBuilder::visitSPDescriptorParent(StackProtectorDescriptor &SPD,
// First create the loads to the guard/stack slot for the comparison.
const TargetLowering &TLI = DAG.getTargetLoweringInfo();
- EVT PtrTy = TLI.getPointerTy();
+ EVT PtrTy = TLI.getPointerTy(DAG.getDataLayout());
MachineFrameInfo *MFI = ParentBB->getParent()->getFrameInfo();
int FI = MFI->getStackProtectorIndex();
@@ -1771,8 +1779,7 @@ void SelectionDAGBuilder::visitSPDescriptorParent(StackProtectorDescriptor &SPD,
SDValue GuardPtr = getValue(IRGuard);
SDValue StackSlotPtr = DAG.getFrameIndex(FI, PtrTy);
- unsigned Align =
- TLI.getDataLayout()->getPrefTypeAlignment(IRGuard->getType());
+ unsigned Align = DL->getPrefTypeAlignment(IRGuard->getType());
SDValue Guard;
SDLoc dl = getCurSDLoc();
@@ -1799,10 +1806,10 @@ void SelectionDAGBuilder::visitSPDescriptorParent(StackProtectorDescriptor &SPD,
EVT VT = Guard.getValueType();
SDValue Sub = DAG.getNode(ISD::SUB, dl, VT, Guard, StackSlot);
- SDValue Cmp =
- DAG.getSetCC(dl, TLI.getSetCCResultType(*DAG.getContext(),
- Sub.getValueType()),
- Sub, DAG.getConstant(0, dl, VT), ISD::SETNE);
+ SDValue Cmp = DAG.getSetCC(dl, TLI.getSetCCResultType(DAG.getDataLayout(),
+ *DAG.getContext(),
+ Sub.getValueType()),
+ Sub, DAG.getConstant(0, dl, VT), ISD::SETNE);
// If the sub is not 0, then we know the guard/stackslot do not equal, so
// branch to failure MBB.
@@ -1848,10 +1855,10 @@ void SelectionDAGBuilder::visitBitTestHeader(BitTestBlock &B,
// Check range
const TargetLowering &TLI = DAG.getTargetLoweringInfo();
- SDValue RangeCmp =
- DAG.getSetCC(dl, TLI.getSetCCResultType(*DAG.getContext(),
- Sub.getValueType()),
- Sub, DAG.getConstant(B.Range, dl, VT), ISD::SETUGT);
+ SDValue RangeCmp = DAG.getSetCC(
+ dl, TLI.getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(),
+ Sub.getValueType()),
+ Sub, DAG.getConstant(B.Range, dl, VT), ISD::SETUGT);
// Determine the type of the test operands.
bool UsePtrType = false;
@@ -1867,7 +1874,7 @@ void SelectionDAGBuilder::visitBitTestHeader(BitTestBlock &B,
}
}
if (UsePtrType) {
- VT = TLI.getPointerTy();
+ VT = TLI.getPointerTy(DAG.getDataLayout());
Sub = DAG.getZExtOrTrunc(Sub, dl, VT);
}
@@ -1909,13 +1916,15 @@ void SelectionDAGBuilder::visitBitTestCase(BitTestBlock &BB,
// Testing for a single bit; just compare the shift count with what it
// would need to be to shift a 1 bit in that position.
Cmp = DAG.getSetCC(
- dl, TLI.getSetCCResultType(*DAG.getContext(), VT), ShiftOp,
- DAG.getConstant(countTrailingZeros(B.Mask), dl, VT), ISD::SETEQ);
+ dl, TLI.getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT),
+ ShiftOp, DAG.getConstant(countTrailingZeros(B.Mask), dl, VT),
+ ISD::SETEQ);
} else if (PopCount == BB.Range) {
// There is only one zero bit in the range, test for it directly.
Cmp = DAG.getSetCC(
- dl, TLI.getSetCCResultType(*DAG.getContext(), VT), ShiftOp,
- DAG.getConstant(countTrailingOnes(B.Mask), dl, VT), ISD::SETNE);
+ dl, TLI.getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT),
+ ShiftOp, DAG.getConstant(countTrailingOnes(B.Mask), dl, VT),
+ ISD::SETNE);
} else {
// Make desired shift
SDValue SwitchVal = DAG.getNode(ISD::SHL, dl, VT,
@@ -1924,8 +1933,9 @@ void SelectionDAGBuilder::visitBitTestCase(BitTestBlock &BB,
// Emit bit tests and jumps
SDValue AndOp = DAG.getNode(ISD::AND, dl,
VT, SwitchVal, DAG.getConstant(B.Mask, dl, VT));
- Cmp = DAG.getSetCC(dl, TLI.getSetCCResultType(*DAG.getContext(), VT), AndOp,
- DAG.getConstant(0, dl, VT), ISD::SETNE);
+ Cmp = DAG.getSetCC(
+ dl, TLI.getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT),
+ AndOp, DAG.getConstant(0, dl, VT), ISD::SETNE);
}
// The branch weight from SwitchBB to B.TargetBB is B.ExtraWeight.
@@ -2013,7 +2023,7 @@ void SelectionDAGBuilder::visitLandingPad(const LandingPadInst &LP) {
SmallVector<EVT, 2> ValueVTs;
SDLoc dl = getCurSDLoc();
- ComputeValueVTs(TLI, LP.getType(), ValueVTs);
+ ComputeValueVTs(TLI, DAG.getDataLayout(), LP.getType(), ValueVTs);
assert(ValueVTs.size() == 2 && "Only two-valued landingpads are supported");
// Get the two live-in registers as SDValues. The physregs have already been
@@ -2022,14 +2032,16 @@ void SelectionDAGBuilder::visitLandingPad(const LandingPadInst &LP) {
if (FuncInfo.ExceptionPointerVirtReg) {
Ops[0] = DAG.getZExtOrTrunc(
DAG.getCopyFromReg(DAG.getEntryNode(), dl,
- FuncInfo.ExceptionPointerVirtReg, TLI.getPointerTy()),
+ FuncInfo.ExceptionPointerVirtReg,
+ TLI.getPointerTy(DAG.getDataLayout())),
dl, ValueVTs[0]);
} else {
- Ops[0] = DAG.getConstant(0, dl, TLI.getPointerTy());
+ Ops[0] = DAG.getConstant(0, dl, TLI.getPointerTy(DAG.getDataLayout()));
}
Ops[1] = DAG.getZExtOrTrunc(
DAG.getCopyFromReg(DAG.getEntryNode(), dl,
- FuncInfo.ExceptionSelectorVirtReg, TLI.getPointerTy()),
+ FuncInfo.ExceptionSelectorVirtReg,
+ TLI.getPointerTy(DAG.getDataLayout())),
dl, ValueVTs[1]);
// Merge into one.
@@ -2038,28 +2050,6 @@ void SelectionDAGBuilder::visitLandingPad(const LandingPadInst &LP) {
setValue(&LP, Res);
}
-unsigned
-SelectionDAGBuilder::visitLandingPadClauseBB(GlobalValue *ClauseGV,
- MachineBasicBlock *LPadBB) {
- SDValue Chain = getControlRoot();
- SDLoc dl = getCurSDLoc();
-
- // Get the typeid that we will dispatch on later.
- const TargetLowering &TLI = DAG.getTargetLoweringInfo();
- const TargetRegisterClass *RC = TLI.getRegClassFor(TLI.getPointerTy());
- unsigned VReg = FuncInfo.MF->getRegInfo().createVirtualRegister(RC);
- unsigned TypeID = DAG.getMachineFunction().getMMI().getTypeIDFor(ClauseGV);
- SDValue Sel = DAG.getConstant(TypeID, dl, TLI.getPointerTy());
- Chain = DAG.getCopyToReg(Chain, dl, VReg, Sel);
-
- // Branch to the main landing pad block.
- MachineBasicBlock *ClauseMBB = FuncInfo.MBB;
- ClauseMBB->addSuccessor(LPadBB);
- DAG.setRoot(DAG.getNode(ISD::BR, dl, MVT::Other, Chain,
- DAG.getBasicBlock(LPadBB)));
- return VReg;
-}
-
void SelectionDAGBuilder::sortAndRangeify(CaseClusterVector &Clusters) {
#ifndef NDEBUG
for (const CaseCluster &CC : Clusters)
@@ -2186,8 +2176,8 @@ void SelectionDAGBuilder::visitShift(const User &I, unsigned Opcode) {
SDValue Op1 = getValue(I.getOperand(0));
SDValue Op2 = getValue(I.getOperand(1));
- EVT ShiftTy =
- DAG.getTargetLoweringInfo().getShiftAmountTy(Op2.getValueType());
+ EVT ShiftTy = DAG.getTargetLoweringInfo().getShiftAmountTy(
+ Op2.getValueType(), DAG.getDataLayout());
// Coerce the shift amount to the right type if we can.
if (!I.getType()->isVectorTy() && Op2.getValueType() != ShiftTy) {
@@ -2256,7 +2246,8 @@ void SelectionDAGBuilder::visitICmp(const User &I) {
SDValue Op2 = getValue(I.getOperand(1));
ISD::CondCode Opcode = getICmpCondCode(predicate);
- EVT DestVT = DAG.getTargetLoweringInfo().getValueType(I.getType());
+ EVT DestVT = DAG.getTargetLoweringInfo().getValueType(DAG.getDataLayout(),
+ I.getType());
setValue(&I, DAG.getSetCC(getCurSDLoc(), DestVT, Op1, Op2, Opcode));
}
@@ -2271,13 +2262,15 @@ void SelectionDAGBuilder::visitFCmp(const User &I) {
ISD::CondCode Condition = getFCmpCondCode(predicate);
if (TM.Options.NoNaNsFPMath)
Condition = getFCmpCodeWithoutNaN(Condition);
- EVT DestVT = DAG.getTargetLoweringInfo().getValueType(I.getType());
+ EVT DestVT = DAG.getTargetLoweringInfo().getValueType(DAG.getDataLayout(),
+ I.getType());
setValue(&I, DAG.getSetCC(getCurSDLoc(), DestVT, Op1, Op2, Condition));
}
void SelectionDAGBuilder::visitSelect(const User &I) {
SmallVector<EVT, 4> ValueVTs;
- ComputeValueVTs(DAG.getTargetLoweringInfo(), I.getType(), ValueVTs);
+ ComputeValueVTs(DAG.getTargetLoweringInfo(), DAG.getDataLayout(), I.getType(),
+ ValueVTs);
unsigned NumValues = ValueVTs.size();
if (NumValues == 0) return;
@@ -2336,7 +2329,8 @@ void SelectionDAGBuilder::visitSelect(const User &I) {
void SelectionDAGBuilder::visitTrunc(const User &I) {
// TruncInst cannot be a no-op cast because sizeof(src) > sizeof(dest).
SDValue N = getValue(I.getOperand(0));
- EVT DestVT = DAG.getTargetLoweringInfo().getValueType(I.getType());
+ EVT DestVT = DAG.getTargetLoweringInfo().getValueType(DAG.getDataLayout(),
+ I.getType());
setValue(&I, DAG.getNode(ISD::TRUNCATE, getCurSDLoc(), DestVT, N));
}
@@ -2344,7 +2338,8 @@ void SelectionDAGBuilder::visitZExt(const User &I) {
// ZExt cannot be a no-op cast because sizeof(src) < sizeof(dest).
// ZExt also can't be a cast to bool for same reason. So, nothing much to do
SDValue N = getValue(I.getOperand(0));
- EVT DestVT = DAG.getTargetLoweringInfo().getValueType(I.getType());
+ EVT DestVT = DAG.getTargetLoweringInfo().getValueType(DAG.getDataLayout(),
+ I.getType());
setValue(&I, DAG.getNode(ISD::ZERO_EXTEND, getCurSDLoc(), DestVT, N));
}
@@ -2352,7 +2347,8 @@ void SelectionDAGBuilder::visitSExt(const User &I) {
// SExt cannot be a no-op cast because sizeof(src) < sizeof(dest).
// SExt also can't be a cast to bool for same reason. So, nothing much to do
SDValue N = getValue(I.getOperand(0));
- EVT DestVT = DAG.getTargetLoweringInfo().getValueType(I.getType());
+ EVT DestVT = DAG.getTargetLoweringInfo().getValueType(DAG.getDataLayout(),
+ I.getType());
setValue(&I, DAG.getNode(ISD::SIGN_EXTEND, getCurSDLoc(), DestVT, N));
}
@@ -2361,43 +2357,49 @@ void SelectionDAGBuilder::visitFPTrunc(const User &I) {
SDValue N = getValue(I.getOperand(0));
SDLoc dl = getCurSDLoc();
const TargetLowering &TLI = DAG.getTargetLoweringInfo();
- EVT DestVT = TLI.getValueType(I.getType());
+ EVT DestVT = TLI.getValueType(DAG.getDataLayout(), I.getType());
setValue(&I, DAG.getNode(ISD::FP_ROUND, dl, DestVT, N,
- DAG.getTargetConstant(0, dl, TLI.getPointerTy())));
+ DAG.getTargetConstant(
+ 0, dl, TLI.getPointerTy(DAG.getDataLayout()))));
}
void SelectionDAGBuilder::visitFPExt(const User &I) {
// FPExt is never a no-op cast, no need to check
SDValue N = getValue(I.getOperand(0));
- EVT DestVT = DAG.getTargetLoweringInfo().getValueType(I.getType());
+ EVT DestVT = DAG.getTargetLoweringInfo().getValueType(DAG.getDataLayout(),
+ I.getType());
setValue(&I, DAG.getNode(ISD::FP_EXTEND, getCurSDLoc(), DestVT, N));
}
void SelectionDAGBuilder::visitFPToUI(const User &I) {
// FPToUI is never a no-op cast, no need to check
SDValue N = getValue(I.getOperand(0));
- EVT DestVT = DAG.getTargetLoweringInfo().getValueType(I.getType());
+ EVT DestVT = DAG.getTargetLoweringInfo().getValueType(DAG.getDataLayout(),
+ I.getType());
setValue(&I, DAG.getNode(ISD::FP_TO_UINT, getCurSDLoc(), DestVT, N));
}
void SelectionDAGBuilder::visitFPToSI(const User &I) {
// FPToSI is never a no-op cast, no need to check
SDValue N = getValue(I.getOperand(0));
- EVT DestVT = DAG.getTargetLoweringInfo().getValueType(I.getType());
+ EVT DestVT = DAG.getTargetLoweringInfo().getValueType(DAG.getDataLayout(),
+ I.getType());
setValue(&I, DAG.getNode(ISD::FP_TO_SINT, getCurSDLoc(), DestVT, N));
}
void SelectionDAGBuilder::visitUIToFP(const User &I) {
// UIToFP is never a no-op cast, no need to check
SDValue N = getValue(I.getOperand(0));
- EVT DestVT = DAG.getTargetLoweringInfo().getValueType(I.getType());
+ EVT DestVT = DAG.getTargetLoweringInfo().getValueType(DAG.getDataLayout(),
+ I.getType());
setValue(&I, DAG.getNode(ISD::UINT_TO_FP, getCurSDLoc(), DestVT, N));
}
void SelectionDAGBuilder::visitSIToFP(const User &I) {
// SIToFP is never a no-op cast, no need to check
SDValue N = getValue(I.getOperand(0));
- EVT DestVT = DAG.getTargetLoweringInfo().getValueType(I.getType());
+ EVT DestVT = DAG.getTargetLoweringInfo().getValueType(DAG.getDataLayout(),
+ I.getType());
setValue(&I, DAG.getNode(ISD::SINT_TO_FP, getCurSDLoc(), DestVT, N));
}
@@ -2405,7 +2407,8 @@ void SelectionDAGBuilder::visitPtrToInt(const User &I) {
// What to do depends on the size of the integer and the size of the pointer.
// We can either truncate, zero extend, or no-op, accordingly.
SDValue N = getValue(I.getOperand(0));
- EVT DestVT = DAG.getTargetLoweringInfo().getValueType(I.getType());
+ EVT DestVT = DAG.getTargetLoweringInfo().getValueType(DAG.getDataLayout(),
+ I.getType());
setValue(&I, DAG.getZExtOrTrunc(N, getCurSDLoc(), DestVT));
}
@@ -2413,14 +2416,16 @@ void SelectionDAGBuilder::visitIntToPtr(const User &I) {
// What to do depends on the size of the integer and the size of the pointer.
// We can either truncate, zero extend, or no-op, accordingly.
SDValue N = getValue(I.getOperand(0));
- EVT DestVT = DAG.getTargetLoweringInfo().getValueType(I.getType());
+ EVT DestVT = DAG.getTargetLoweringInfo().getValueType(DAG.getDataLayout(),
+ I.getType());
setValue(&I, DAG.getZExtOrTrunc(N, getCurSDLoc(), DestVT));
}
void SelectionDAGBuilder::visitBitCast(const User &I) {
SDValue N = getValue(I.getOperand(0));
SDLoc dl = getCurSDLoc();
- EVT DestVT = DAG.getTargetLoweringInfo().getValueType(I.getType());
+ EVT DestVT = DAG.getTargetLoweringInfo().getValueType(DAG.getDataLayout(),
+ I.getType());
// BitCast assures us that source and destination are the same size so this is
// either a BITCAST or a no-op.
@@ -2442,7 +2447,7 @@ void SelectionDAGBuilder::visitAddrSpaceCast(const User &I) {
const TargetLowering &TLI = DAG.getTargetLoweringInfo();
const Value *SV = I.getOperand(0);
SDValue N = getValue(SV);
- EVT DestVT = TLI.getValueType(I.getType());
+ EVT DestVT = TLI.getValueType(DAG.getDataLayout(), I.getType());
unsigned SrcAS = SV->getType()->getPointerAddressSpace();
unsigned DestAS = I.getType()->getPointerAddressSpace();
@@ -2457,19 +2462,21 @@ void SelectionDAGBuilder::visitInsertElement(const User &I) {
const TargetLowering &TLI = DAG.getTargetLoweringInfo();
SDValue InVec = getValue(I.getOperand(0));
SDValue InVal = getValue(I.getOperand(1));
- SDValue InIdx = DAG.getSExtOrTrunc(getValue(I.getOperand(2)),
- getCurSDLoc(), TLI.getVectorIdxTy());
+ SDValue InIdx = DAG.getSExtOrTrunc(getValue(I.getOperand(2)), getCurSDLoc(),
+ TLI.getVectorIdxTy(DAG.getDataLayout()));
setValue(&I, DAG.getNode(ISD::INSERT_VECTOR_ELT, getCurSDLoc(),
- TLI.getValueType(I.getType()), InVec, InVal, InIdx));
+ TLI.getValueType(DAG.getDataLayout(), I.getType()),
+ InVec, InVal, InIdx));
}
void SelectionDAGBuilder::visitExtractElement(const User &I) {
const TargetLowering &TLI = DAG.getTargetLoweringInfo();
SDValue InVec = getValue(I.getOperand(0));
- SDValue InIdx = DAG.getSExtOrTrunc(getValue(I.getOperand(1)),
- getCurSDLoc(), TLI.getVectorIdxTy());
+ SDValue InIdx = DAG.getSExtOrTrunc(getValue(I.getOperand(1)), getCurSDLoc(),
+ TLI.getVectorIdxTy(DAG.getDataLayout()));
setValue(&I, DAG.getNode(ISD::EXTRACT_VECTOR_ELT, getCurSDLoc(),
- TLI.getValueType(I.getType()), InVec, InIdx));
+ TLI.getValueType(DAG.getDataLayout(), I.getType()),
+ InVec, InIdx));
}
// Utility for visitShuffleVector - Return true if every element in Mask,
@@ -2492,7 +2499,7 @@ void SelectionDAGBuilder::visitShuffleVector(const User &I) {
unsigned MaskNumElts = Mask.size();
const TargetLowering &TLI = DAG.getTargetLoweringInfo();
- EVT VT = TLI.getValueType(I.getType());
+ EVT VT = TLI.getValueType(DAG.getDataLayout(), I.getType());
EVT SrcVT = Src1.getValueType();
unsigned SrcNumElts = SrcVT.getVectorNumElements();
@@ -2614,7 +2621,8 @@ void SelectionDAGBuilder::visitShuffleVector(const User &I) {
SDLoc dl = getCurSDLoc();
Src = DAG.getNode(
ISD::EXTRACT_SUBVECTOR, dl, VT, Src,
- DAG.getConstant(StartIdx[Input], dl, TLI.getVectorIdxTy()));
+ DAG.getConstant(StartIdx[Input], dl,
+ TLI.getVectorIdxTy(DAG.getDataLayout())));
}
}
@@ -2641,7 +2649,7 @@ void SelectionDAGBuilder::visitShuffleVector(const User &I) {
// replacing the shuffle with extract and build vector.
// to insert and build vector.
EVT EltVT = VT.getVectorElementType();
- EVT IdxVT = TLI.getVectorIdxTy();
+ EVT IdxVT = TLI.getVectorIdxTy(DAG.getDataLayout());
SDLoc dl = getCurSDLoc();
SmallVector<SDValue,8> Ops;
for (unsigned i = 0; i != MaskNumElts; ++i) {
@@ -2676,9 +2684,9 @@ void SelectionDAGBuilder::visitInsertValue(const InsertValueInst &I) {
const TargetLowering &TLI = DAG.getTargetLoweringInfo();
SmallVector<EVT, 4> AggValueVTs;
- ComputeValueVTs(TLI, AggTy, AggValueVTs);
+ ComputeValueVTs(TLI, DAG.getDataLayout(), AggTy, AggValueVTs);
SmallVector<EVT, 4> ValValueVTs;
- ComputeValueVTs(TLI, ValTy, ValValueVTs);
+ ComputeValueVTs(TLI, DAG.getDataLayout(), ValTy, ValValueVTs);
unsigned NumAggValues = AggValueVTs.size();
unsigned NumValValues = ValValueVTs.size();
@@ -2722,7 +2730,7 @@ void SelectionDAGBuilder::visitExtractValue(const ExtractValueInst &I) {
const TargetLowering &TLI = DAG.getTargetLoweringInfo();
SmallVector<EVT, 4> ValValueVTs;
- ComputeValueVTs(TLI, ValTy, ValValueVTs);
+ ComputeValueVTs(TLI, DAG.getDataLayout(), ValTy, ValValueVTs);
unsigned NumValValues = ValValueVTs.size();
@@ -2755,6 +2763,16 @@ void SelectionDAGBuilder::visitGetElementPtr(const User &I) {
SDValue N = getValue(Op0);
SDLoc dl = getCurSDLoc();
+ // Normalize Vector GEP - all scalar operands should be converted to the
+ // splat vector.
+ unsigned VectorWidth = I.getType()->isVectorTy() ?
+ cast<VectorType>(I.getType())->getVectorNumElements() : 0;
+
+ if (VectorWidth && !N.getValueType().isVector()) {
+ MVT VT = MVT::getVectorVT(N.getValueType().getSimpleVT(), VectorWidth);
+ SmallVector<SDValue, 16> Ops(VectorWidth, N);
+ N = DAG.getNode(ISD::BUILD_VECTOR, dl, VT, Ops);
+ }
for (GetElementPtrInst::const_op_iterator OI = I.op_begin()+1, E = I.op_end();
OI != E; ++OI) {
const Value *Idx = *OI;
@@ -2770,16 +2788,25 @@ void SelectionDAGBuilder::visitGetElementPtr(const User &I) {
Ty = StTy->getElementType(Field);
} else {
Ty = cast<SequentialType>(Ty)->getElementType();
- MVT PtrTy = DAG.getTargetLoweringInfo().getPointerTy(AS);
+ MVT PtrTy =
+ DAG.getTargetLoweringInfo().getPointerTy(DAG.getDataLayout(), AS);
unsigned PtrSize = PtrTy.getSizeInBits();
APInt ElementSize(PtrSize, DL->getTypeAllocSize(Ty));
- // If this is a constant subscript, handle it quickly.
- if (const auto *CI = dyn_cast<ConstantInt>(Idx)) {
+ // If this is a scalar constant or a splat vector of constants,
+ // handle it quickly.
+ const auto *CI = dyn_cast<ConstantInt>(Idx);
+ if (!CI && isa<ConstantDataVector>(Idx) &&
+ cast<ConstantDataVector>(Idx)->getSplatValue())
+ CI = cast<ConstantInt>(cast<ConstantDataVector>(Idx)->getSplatValue());
+
+ if (CI) {
if (CI->isZero())
continue;
APInt Offs = ElementSize * CI->getValue().sextOrTrunc(PtrSize);
- SDValue OffsVal = DAG.getConstant(Offs, dl, PtrTy);
+ SDValue OffsVal = VectorWidth ?
+ DAG.getConstant(Offs, dl, MVT::getVectorVT(PtrTy, VectorWidth)) :
+ DAG.getConstant(Offs, dl, PtrTy);
N = DAG.getNode(ISD::ADD, dl, N.getValueType(), N, OffsVal);
continue;
}
@@ -2787,6 +2814,11 @@ void SelectionDAGBuilder::visitGetElementPtr(const User &I) {
// N = N + Idx * ElementSize;
SDValue IdxN = getValue(Idx);
+ if (!IdxN.getValueType().isVector() && VectorWidth) {
+ MVT VT = MVT::getVectorVT(IdxN.getValueType().getSimpleVT(), VectorWidth);
+ SmallVector<SDValue, 16> Ops(VectorWidth, IdxN);
+ IdxN = DAG.getNode(ISD::BUILD_VECTOR, dl, VT, Ops);
+ }
// If the index is smaller or larger than intptr_t, truncate or extend
// it.
IdxN = DAG.getSExtOrTrunc(IdxN, dl, N.getValueType());
@@ -2823,14 +2855,14 @@ void SelectionDAGBuilder::visitAlloca(const AllocaInst &I) {
SDLoc dl = getCurSDLoc();
Type *Ty = I.getAllocatedType();
const TargetLowering &TLI = DAG.getTargetLoweringInfo();
- uint64_t TySize = TLI.getDataLayout()->getTypeAllocSize(Ty);
+ auto &DL = DAG.getDataLayout();
+ uint64_t TySize = DL.getTypeAllocSize(Ty);
unsigned Align =
- std::max((unsigned)TLI.getDataLayout()->getPrefTypeAlignment(Ty),
- I.getAlignment());
+ std::max((unsigned)DL.getPrefTypeAlignment(Ty), I.getAlignment());
SDValue AllocSize = getValue(I.getArraySize());
- EVT IntPtr = TLI.getPointerTy();
+ EVT IntPtr = TLI.getPointerTy(DAG.getDataLayout());
if (AllocSize.getValueType() != IntPtr)
AllocSize = DAG.getZExtOrTrunc(AllocSize, dl, IntPtr);
@@ -2898,7 +2930,7 @@ void SelectionDAGBuilder::visitLoad(const LoadInst &I) {
const TargetLowering &TLI = DAG.getTargetLoweringInfo();
SmallVector<EVT, 4> ValueVTs;
SmallVector<uint64_t, 4> Offsets;
- ComputeValueVTs(TLI, Ty, ValueVTs, &Offsets);
+ ComputeValueVTs(TLI, DAG.getDataLayout(), Ty, ValueVTs, &Offsets);
unsigned NumValues = ValueVTs.size();
if (NumValues == 0)
return;
@@ -2975,8 +3007,8 @@ void SelectionDAGBuilder::visitStore(const StoreInst &I) {
SmallVector<EVT, 4> ValueVTs;
SmallVector<uint64_t, 4> Offsets;
- ComputeValueVTs(DAG.getTargetLoweringInfo(), SrcV->getType(),
- ValueVTs, &Offsets);
+ ComputeValueVTs(DAG.getTargetLoweringInfo(), DAG.getDataLayout(),
+ SrcV->getType(), ValueVTs, &Offsets);
unsigned NumValues = ValueVTs.size();
if (NumValues == 0)
return;
@@ -3077,9 +3109,10 @@ static bool getUniformBase(Value *& Ptr, SDValue& Base, SDValue& Index,
else if (SDB->findValue(ShuffleInst)) {
SDValue ShuffleNode = SDB->getValue(ShuffleInst);
SDLoc sdl = ShuffleNode;
- Base = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, sdl,
- ShuffleNode.getValueType().getScalarType(), ShuffleNode,
- DAG.getConstant(0, sdl, TLI.getVectorIdxTy()));
+ Base = DAG.getNode(
+ ISD::EXTRACT_VECTOR_ELT, sdl,
+ ShuffleNode.getValueType().getScalarType(), ShuffleNode,
+ DAG.getConstant(0, sdl, TLI.getVectorIdxTy(DAG.getDataLayout())));
SDB->setValue(Ptr, Base);
}
else
@@ -3126,7 +3159,7 @@ void SelectionDAGBuilder::visitMaskedScatter(const CallInst &I) {
MachineMemOperand::MOStore, VT.getStoreSize(),
Alignment, AAInfo);
if (!UniformBase) {
- Base = DAG.getTargetConstant(0, sdl, TLI.getPointerTy());
+ Base = DAG.getTargetConstant(0, sdl, TLI.getPointerTy(DAG.getDataLayout()));
Index = getValue(Ptr);
}
SDValue Ops[] = { getRoot(), Src0, Mask, Base, Index };
@@ -3146,7 +3179,7 @@ void SelectionDAGBuilder::visitMaskedLoad(const CallInst &I) {
SDValue Mask = getValue(I.getArgOperand(2));
const TargetLowering &TLI = DAG.getTargetLoweringInfo();
- EVT VT = TLI.getValueType(I.getType());
+ EVT VT = TLI.getValueType(DAG.getDataLayout(), I.getType());
unsigned Alignment = (cast<ConstantInt>(I.getArgOperand(1)))->getZExtValue();
if (!Alignment)
Alignment = DAG.getEVTAlignment(VT);
@@ -3184,7 +3217,7 @@ void SelectionDAGBuilder::visitMaskedGather(const CallInst &I) {
SDValue Mask = getValue(I.getArgOperand(2));
const TargetLowering &TLI = DAG.getTargetLoweringInfo();
- EVT VT = TLI.getValueType(I.getType());
+ EVT VT = TLI.getValueType(DAG.getDataLayout(), I.getType());
unsigned Alignment = (cast<ConstantInt>(I.getArgOperand(1)))->getZExtValue();
if (!Alignment)
Alignment = DAG.getEVTAlignment(VT);
@@ -3214,7 +3247,7 @@ void SelectionDAGBuilder::visitMaskedGather(const CallInst &I) {
Alignment, AAInfo, Ranges);
if (!UniformBase) {
- Base = DAG.getTargetConstant(0, sdl, TLI.getPointerTy());
+ Base = DAG.getTargetConstant(0, sdl, TLI.getPointerTy(DAG.getDataLayout()));
Index = getValue(Ptr);
}
SDValue Ops[] = { Root, Src0, Mask, Base, Index };
@@ -3291,8 +3324,10 @@ void SelectionDAGBuilder::visitFence(const FenceInst &I) {
const TargetLowering &TLI = DAG.getTargetLoweringInfo();
SDValue Ops[3];
Ops[0] = getRoot();
- Ops[1] = DAG.getConstant(I.getOrdering(), dl, TLI.getPointerTy());
- Ops[2] = DAG.getConstant(I.getSynchScope(), dl, TLI.getPointerTy());
+ Ops[1] = DAG.getConstant(I.getOrdering(), dl,
+ TLI.getPointerTy(DAG.getDataLayout()));
+ Ops[2] = DAG.getConstant(I.getSynchScope(), dl,
+ TLI.getPointerTy(DAG.getDataLayout()));
DAG.setRoot(DAG.getNode(ISD::ATOMIC_FENCE, dl, MVT::Other, Ops));
}
@@ -3304,7 +3339,7 @@ void SelectionDAGBuilder::visitAtomicLoad(const LoadInst &I) {
SDValue InChain = getRoot();
const TargetLowering &TLI = DAG.getTargetLoweringInfo();
- EVT VT = TLI.getValueType(I.getType());
+ EVT VT = TLI.getValueType(DAG.getDataLayout(), I.getType());
if (I.getAlignment() < VT.getSizeInBits() / 8)
report_fatal_error("Cannot generate unaligned atomic load");
@@ -3339,7 +3374,8 @@ void SelectionDAGBuilder::visitAtomicStore(const StoreInst &I) {
SDValue InChain = getRoot();
const TargetLowering &TLI = DAG.getTargetLoweringInfo();
- EVT VT = TLI.getValueType(I.getValueOperand()->getType());
+ EVT VT =
+ TLI.getValueType(DAG.getDataLayout(), I.getValueOperand()->getType());
if (I.getAlignment() < VT.getSizeInBits() / 8)
report_fatal_error("Cannot generate unaligned atomic store");
@@ -3382,7 +3418,7 @@ void SelectionDAGBuilder::visitTargetIntrinsic(const CallInst &I,
if (!IsTgtIntrinsic || Info.opc == ISD::INTRINSIC_VOID ||
Info.opc == ISD::INTRINSIC_W_CHAIN)
Ops.push_back(DAG.getTargetConstant(Intrinsic, getCurSDLoc(),
- TLI.getPointerTy()));
+ TLI.getPointerTy(DAG.getDataLayout())));
// Add all operands of the call to the operand list.
for (unsigned i = 0, e = I.getNumArgOperands(); i != e; ++i) {
@@ -3391,7 +3427,7 @@ void SelectionDAGBuilder::visitTargetIntrinsic(const CallInst &I,
}
SmallVector<EVT, 4> ValueVTs;
- ComputeValueVTs(TLI, I.getType(), ValueVTs);
+ ComputeValueVTs(TLI, DAG.getDataLayout(), I.getType(), ValueVTs);
if (HasChain)
ValueVTs.push_back(MVT::Other);
@@ -3425,7 +3461,7 @@ void SelectionDAGBuilder::visitTargetIntrinsic(const CallInst &I,
if (!I.getType()->isVoidTy()) {
if (VectorType *PTy = dyn_cast<VectorType>(I.getType())) {
- EVT VT = TLI.getValueType(PTy);
+ EVT VT = TLI.getValueType(DAG.getDataLayout(), PTy);
Result = DAG.getNode(ISD::BITCAST, getCurSDLoc(), VT, Result);
}
@@ -3458,8 +3494,9 @@ GetExponent(SelectionDAG &DAG, SDValue Op, const TargetLowering &TLI,
SDLoc dl) {
SDValue t0 = DAG.getNode(ISD::AND, dl, MVT::i32, Op,
DAG.getConstant(0x7f800000, dl, MVT::i32));
- SDValue t1 = DAG.getNode(ISD::SRL, dl, MVT::i32, t0,
- DAG.getConstant(23, dl, TLI.getPointerTy()));
+ SDValue t1 = DAG.getNode(
+ ISD::SRL, dl, MVT::i32, t0,
+ DAG.getConstant(23, dl, TLI.getPointerTy(DAG.getDataLayout())));
SDValue t2 = DAG.getNode(ISD::SUB, dl, MVT::i32, t1,
DAG.getConstant(127, dl, MVT::i32));
return DAG.getNode(ISD::SINT_TO_FP, dl, MVT::f32, t2);
@@ -3484,7 +3521,8 @@ static SDValue getLimitedPrecisionExp2(SDValue t0, SDLoc dl,
// IntegerPartOfX <<= 23;
IntegerPartOfX = DAG.getNode(
ISD::SHL, dl, MVT::i32, IntegerPartOfX,
- DAG.getConstant(23, dl, DAG.getTargetLoweringInfo().getPointerTy()));
+ DAG.getConstant(23, dl, DAG.getTargetLoweringInfo().getPointerTy(
+ DAG.getDataLayout())));
SDValue TwoToFractionalPartOfX;
if (LimitFloatPrecision <= 6) {
@@ -4071,11 +4109,13 @@ SelectionDAGBuilder::visitIntrinsicCall(const CallInst &I, unsigned Intrinsic) {
case Intrinsic::vaend: visitVAEnd(I); return nullptr;
case Intrinsic::vacopy: visitVACopy(I); return nullptr;
case Intrinsic::returnaddress:
- setValue(&I, DAG.getNode(ISD::RETURNADDR, sdl, TLI.getPointerTy(),
+ setValue(&I, DAG.getNode(ISD::RETURNADDR, sdl,
+ TLI.getPointerTy(DAG.getDataLayout()),
getValue(I.getArgOperand(0))));
return nullptr;
case Intrinsic::frameaddress:
- setValue(&I, DAG.getNode(ISD::FRAMEADDR, sdl, TLI.getPointerTy(),
+ setValue(&I, DAG.getNode(ISD::FRAMEADDR, sdl,
+ TLI.getPointerTy(DAG.getDataLayout()),
getValue(I.getArgOperand(0))));
return nullptr;
case Intrinsic::read_register: {
@@ -4083,7 +4123,7 @@ SelectionDAGBuilder::visitIntrinsicCall(const CallInst &I, unsigned Intrinsic) {
SDValue Chain = getRoot();
SDValue RegName =
DAG.getMDNode(cast<MDNode>(cast<MetadataAsValue>(Reg)->getMetadata()));
- EVT VT = TLI.getValueType(I.getType());
+ EVT VT = TLI.getValueType(DAG.getDataLayout(), I.getType());
Res = DAG.getNode(ISD::READ_REGISTER, sdl,
DAG.getVTList(VT, MVT::Other), Chain, RegName);
setValue(&I, Res);
@@ -4335,14 +4375,15 @@ SelectionDAGBuilder::visitIntrinsicCall(const CallInst &I, unsigned Intrinsic) {
return nullptr;
case Intrinsic::eh_dwarf_cfa: {
SDValue CfaArg = DAG.getSExtOrTrunc(getValue(I.getArgOperand(0)), sdl,
- TLI.getPointerTy());
+ TLI.getPointerTy(DAG.getDataLayout()));
SDValue Offset = DAG.getNode(ISD::ADD, sdl,
CfaArg.getValueType(),
DAG.getNode(ISD::FRAME_TO_ARGS_OFFSET, sdl,
CfaArg.getValueType()),
CfaArg);
- SDValue FA = DAG.getNode(ISD::FRAMEADDR, sdl, TLI.getPointerTy(),
- DAG.getConstant(0, sdl, TLI.getPointerTy()));
+ SDValue FA = DAG.getNode(
+ ISD::FRAMEADDR, sdl, TLI.getPointerTy(DAG.getDataLayout()),
+ DAG.getConstant(0, sdl, TLI.getPointerTy(DAG.getDataLayout())));
setValue(&I, DAG.getNode(ISD::ADD, sdl, FA.getValueType(),
FA, Offset));
return nullptr;
@@ -4444,7 +4485,7 @@ SelectionDAGBuilder::visitIntrinsicCall(const CallInst &I, unsigned Intrinsic) {
ShOps[0] = ShAmt;
ShOps[1] = DAG.getConstant(0, sdl, MVT::i32);
ShAmt = DAG.getNode(ISD::BUILD_VECTOR, sdl, ShAmtVT, ShOps);
- EVT DestVT = TLI.getValueType(I.getType());
+ EVT DestVT = TLI.getValueType(DAG.getDataLayout(), I.getType());
ShAmt = DAG.getNode(ISD::BITCAST, sdl, DestVT, ShAmt);
Res = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, sdl, DestVT,
DAG.getConstant(NewIntrinsic, sdl, MVT::i32),
@@ -4474,7 +4515,7 @@ SelectionDAGBuilder::visitIntrinsicCall(const CallInst &I, unsigned Intrinsic) {
case Intrinsic::convertus: Code = ISD::CVT_US; break;
case Intrinsic::convertuu: Code = ISD::CVT_UU; break;
}
- EVT DestVT = TLI.getValueType(I.getType());
+ EVT DestVT = TLI.getValueType(DAG.getDataLayout(), I.getType());
const Value *Op1 = I.getArgOperand(0);
Res = DAG.getConvertRndSat(DestVT, sdl, getValue(Op1),
DAG.getValueType(DestVT),
@@ -4564,7 +4605,7 @@ SelectionDAGBuilder::visitIntrinsicCall(const CallInst &I, unsigned Intrinsic) {
getValue(I.getArgOperand(2))));
return nullptr;
case Intrinsic::fmuladd: {
- EVT VT = TLI.getValueType(I.getType());
+ EVT VT = TLI.getValueType(DAG.getDataLayout(), I.getType());
if (TM.Options.AllowFPOpFusion != FPOpFusion::Strict &&
TLI.isFMAFasterThanFMulAndFAdd(VT)) {
setValue(&I, DAG.getNode(ISD::FMA, sdl,
@@ -4593,10 +4634,10 @@ SelectionDAGBuilder::visitIntrinsicCall(const CallInst &I, unsigned Intrinsic) {
MVT::i32))));
return nullptr;
case Intrinsic::convert_from_fp16:
- setValue(&I,
- DAG.getNode(ISD::FP_EXTEND, sdl, TLI.getValueType(I.getType()),
- DAG.getNode(ISD::BITCAST, sdl, MVT::f16,
- getValue(I.getArgOperand(0)))));
+ setValue(&I, DAG.getNode(ISD::FP_EXTEND, sdl,
+ TLI.getValueType(DAG.getDataLayout(), I.getType()),
+ DAG.getNode(ISD::BITCAST, sdl, MVT::f16,
+ getValue(I.getArgOperand(0)))));
return nullptr;
case Intrinsic::pcmarker: {
SDValue Tmp = getValue(I.getArgOperand(0));
@@ -4640,8 +4681,9 @@ SelectionDAGBuilder::visitIntrinsicCall(const CallInst &I, unsigned Intrinsic) {
}
case Intrinsic::stacksave: {
SDValue Op = getRoot();
- Res = DAG.getNode(ISD::STACKSAVE, sdl,
- DAG.getVTList(TLI.getPointerTy(), MVT::Other), Op);
+ Res = DAG.getNode(
+ ISD::STACKSAVE, sdl,
+ DAG.getVTList(TLI.getPointerTy(DAG.getDataLayout()), MVT::Other), Op);
setValue(&I, Res);
DAG.setRoot(Res.getValue(1));
return nullptr;
@@ -4655,7 +4697,7 @@ SelectionDAGBuilder::visitIntrinsicCall(const CallInst &I, unsigned Intrinsic) {
// Emit code into the DAG to store the stack guard onto the stack.
MachineFunction &MF = DAG.getMachineFunction();
MachineFrameInfo *MFI = MF.getFrameInfo();
- EVT PtrTy = TLI.getPointerTy();
+ EVT PtrTy = TLI.getPointerTy(DAG.getDataLayout());
SDValue Src, Chain = getRoot();
const Value *Ptr = cast<LoadInst>(I.getArgOperand(0))->getPointerOperand();
const GlobalVariable *GV = dyn_cast<GlobalVariable>(Ptr);
@@ -4753,7 +4795,7 @@ SelectionDAGBuilder::visitIntrinsicCall(const CallInst &I, unsigned Intrinsic) {
}
case Intrinsic::adjust_trampoline: {
setValue(&I, DAG.getNode(ISD::ADJUST_TRAMPOLINE, sdl,
- TLI.getPointerTy(),
+ TLI.getPointerTy(DAG.getDataLayout()),
getValue(I.getArgOperand(0))));
return nullptr;
}
@@ -4794,10 +4836,11 @@ SelectionDAGBuilder::visitIntrinsicCall(const CallInst &I, unsigned Intrinsic) {
TargetLowering::ArgListTy Args;
TargetLowering::CallLoweringInfo CLI(DAG);
- CLI.setDebugLoc(sdl).setChain(getRoot())
- .setCallee(CallingConv::C, I.getType(),
- DAG.getExternalSymbol(TrapFuncName.data(), TLI.getPointerTy()),
- std::move(Args), 0);
+ CLI.setDebugLoc(sdl).setChain(getRoot()).setCallee(
+ CallingConv::C, I.getType(),
+ DAG.getExternalSymbol(TrapFuncName.data(),
+ TLI.getPointerTy(DAG.getDataLayout())),
+ std::move(Args), 0);
std::pair<SDValue, SDValue> Result = TLI.LowerCallTo(CLI);
DAG.setRoot(Result.second);
@@ -4873,7 +4916,8 @@ SelectionDAGBuilder::visitIntrinsicCall(const CallInst &I, unsigned Intrinsic) {
SDValue Ops[2];
Ops[0] = getRoot();
- Ops[1] = DAG.getFrameIndex(FI, TLI.getPointerTy(), true);
+ Ops[1] =
+ DAG.getFrameIndex(FI, TLI.getPointerTy(DAG.getDataLayout()), true);
unsigned Opcode = (IsStart ? ISD::LIFETIME_START : ISD::LIFETIME_END);
Res = DAG.getNode(Opcode, sdl, MVT::Other, Ops);
@@ -4883,7 +4927,7 @@ SelectionDAGBuilder::visitIntrinsicCall(const CallInst &I, unsigned Intrinsic) {
}
case Intrinsic::invariant_start:
// Discard region information.
- setValue(&I, DAG.getUNDEF(TLI.getPointerTy()));
+ setValue(&I, DAG.getUNDEF(TLI.getPointerTy(DAG.getDataLayout())));
return nullptr;
case Intrinsic::invariant_end:
// Discard region information.
@@ -4903,7 +4947,7 @@ SelectionDAGBuilder::visitIntrinsicCall(const CallInst &I, unsigned Intrinsic) {
case Intrinsic::clear_cache:
return TLI.getClearCacheBuiltinName();
case Intrinsic::eh_actions:
- setValue(&I, DAG.getUNDEF(TLI.getPointerTy()));
+ setValue(&I, DAG.getUNDEF(TLI.getPointerTy(DAG.getDataLayout())));
return nullptr;
case Intrinsic::donothing:
// ignore
@@ -4935,11 +4979,11 @@ SelectionDAGBuilder::visitIntrinsicCall(const CallInst &I, unsigned Intrinsic) {
case Intrinsic::instrprof_increment:
llvm_unreachable("instrprof failed to lower an increment");
- case Intrinsic::frameescape: {
+ case Intrinsic::localescape: {
MachineFunction &MF = DAG.getMachineFunction();
const TargetInstrInfo *TII = DAG.getSubtarget().getInstrInfo();
- // Directly emit some FRAME_ALLOC machine instrs. Label assignment emission
+ // Directly emit some LOCAL_ESCAPE machine instrs. Label assignment emission
// is the same on all targets.
for (unsigned Idx = 0, E = I.getNumArgOperands(); Idx < E; ++Idx) {
Value *Arg = I.getArgOperand(Idx)->stripPointerCasts();
@@ -4953,7 +4997,7 @@ SelectionDAGBuilder::visitIntrinsicCall(const CallInst &I, unsigned Intrinsic) {
MF.getMMI().getContext().getOrCreateFrameAllocSymbol(
GlobalValue::getRealLinkageName(MF.getName()), Idx);
BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, dl,
- TII->get(TargetOpcode::FRAME_ALLOC))
+ TII->get(TargetOpcode::LOCAL_ESCAPE))
.addSym(FrameAllocSym)
.addFrameIndex(FI);
}
@@ -4961,10 +5005,10 @@ SelectionDAGBuilder::visitIntrinsicCall(const CallInst &I, unsigned Intrinsic) {
return nullptr;
}
- case Intrinsic::framerecover: {
- // i8* @llvm.framerecover(i8* %fn, i8* %fp, i32 %idx)
+ case Intrinsic::localrecover: {
+ // i8* @llvm.localrecover(i8* %fn, i8* %fp, i32 %idx)
MachineFunction &MF = DAG.getMachineFunction();
- MVT PtrVT = TLI.getPointerTy(0);
+ MVT PtrVT = TLI.getPointerTy(DAG.getDataLayout(), 0);
// Get the symbol that defines the frame offset.
auto *Fn = cast<Function>(I.getArgOperand(0)->stripPointerCasts());
@@ -4978,7 +5022,7 @@ SelectionDAGBuilder::visitIntrinsicCall(const CallInst &I, unsigned Intrinsic) {
// that would make this PC relative.
SDValue OffsetSym = DAG.getMCSymbol(FrameAllocSym, PtrVT);
SDValue OffsetVal =
- DAG.getNode(ISD::FRAME_ALLOC_RECOVER, sdl, PtrVT, OffsetSym);
+ DAG.getNode(ISD::LOCAL_RECOVER, sdl, PtrVT, OffsetSym);
// Add the offset to the FP.
Value *FP = I.getArgOperand(1);
@@ -4994,7 +5038,7 @@ SelectionDAGBuilder::visitIntrinsicCall(const CallInst &I, unsigned Intrinsic) {
case Intrinsic::eh_exceptioncode: {
unsigned Reg = TLI.getExceptionPointerRegister();
assert(Reg && "cannot get exception code on this platform");
- MVT PtrVT = TLI.getPointerTy();
+ MVT PtrVT = TLI.getPointerTy(DAG.getDataLayout());
const TargetRegisterClass *PtrRC = TLI.getRegClassFor(PtrVT);
assert(FuncInfo.MBB->isLandingPad() && "eh.exceptioncode in non-lpad");
unsigned VReg = FuncInfo.MBB->addLiveIn(Reg, PtrRC);
@@ -5178,7 +5222,8 @@ static SDValue getMemCmpLoad(const Value *PtrVal, MVT LoadVT,
void SelectionDAGBuilder::processIntegerCallValue(const Instruction &I,
SDValue Value,
bool IsSigned) {
- EVT VT = DAG.getTargetLoweringInfo().getValueType(I.getType(), true);
+ EVT VT = DAG.getTargetLoweringInfo().getValueType(DAG.getDataLayout(),
+ I.getType(), true);
if (IsSigned)
Value = DAG.getSExtOrTrunc(Value, getCurSDLoc(), VT);
else
@@ -5203,7 +5248,8 @@ bool SelectionDAGBuilder::visitMemCmpCall(const CallInst &I) {
const Value *Size = I.getArgOperand(2);
const ConstantInt *CSize = dyn_cast<ConstantInt>(Size);
if (CSize && CSize->getZExtValue() == 0) {
- EVT CallVT = DAG.getTargetLoweringInfo().getValueType(I.getType(), true);
+ EVT CallVT = DAG.getTargetLoweringInfo().getValueType(DAG.getDataLayout(),
+ I.getType(), true);
setValue(&I, DAG.getConstant(0, getCurSDLoc(), CallVT));
return true;
}
@@ -5640,8 +5686,9 @@ void SelectionDAGBuilder::visitCall(const CallInst &I) {
if (!RenameFn)
Callee = getValue(I.getCalledValue());
else
- Callee = DAG.getExternalSymbol(RenameFn,
- DAG.getTargetLoweringInfo().getPointerTy());
+ Callee = DAG.getExternalSymbol(
+ RenameFn,
+ DAG.getTargetLoweringInfo().getPointerTy(DAG.getDataLayout()));
// Check if we can potentially perform a tail call. More detailed checking is
// be done within LowerCallTo, after more information about the call is known.
@@ -5670,13 +5717,12 @@ public:
/// getCallOperandValEVT - Return the EVT of the Value* that this operand
/// corresponds to. If there is no Value* for this operand, it returns
/// MVT::Other.
- EVT getCallOperandValEVT(LLVMContext &Context,
- const TargetLowering &TLI,
- const DataLayout *DL) const {
+ EVT getCallOperandValEVT(LLVMContext &Context, const TargetLowering &TLI,
+ const DataLayout &DL) const {
if (!CallOperandVal) return MVT::Other;
if (isa<BasicBlock>(CallOperandVal))
- return TLI.getPointerTy();
+ return TLI.getPointerTy(DL);
llvm::Type *OpTy = CallOperandVal->getType();
@@ -5698,7 +5744,7 @@ public:
// If OpTy is not a single value, it may be a struct/union that we
// can tile with integers.
if (!OpTy->isSingleValueType() && OpTy->isSized()) {
- unsigned BitSize = DL->getTypeSizeInBits(OpTy);
+ unsigned BitSize = DL.getTypeSizeInBits(OpTy);
switch (BitSize) {
default: break;
case 1:
@@ -5712,7 +5758,7 @@ public:
}
}
- return TLI.getValueType(OpTy, true);
+ return TLI.getValueType(DL, OpTy, true);
}
};
@@ -5838,8 +5884,8 @@ void SelectionDAGBuilder::visitInlineAsm(ImmutableCallSite CS) {
SDISelAsmOperandInfoVector ConstraintOperands;
const TargetLowering &TLI = DAG.getTargetLoweringInfo();
- TargetLowering::AsmOperandInfoVector TargetConstraints =
- TLI.ParseConstraints(DAG.getSubtarget().getRegisterInfo(), CS);
+ TargetLowering::AsmOperandInfoVector TargetConstraints = TLI.ParseConstraints(
+ DAG.getDataLayout(), DAG.getSubtarget().getRegisterInfo(), CS);
bool hasMemory = false;
@@ -5864,10 +5910,11 @@ void SelectionDAGBuilder::visitInlineAsm(ImmutableCallSite CS) {
// corresponding argument.
assert(!CS.getType()->isVoidTy() && "Bad inline asm!");
if (StructType *STy = dyn_cast<StructType>(CS.getType())) {
- OpVT = TLI.getSimpleValueType(STy->getElementType(ResNo));
+ OpVT = TLI.getSimpleValueType(DAG.getDataLayout(),
+ STy->getElementType(ResNo));
} else {
assert(ResNo == 0 && "Asm only has one result!");
- OpVT = TLI.getSimpleValueType(CS.getType());
+ OpVT = TLI.getSimpleValueType(DAG.getDataLayout(), CS.getType());
}
++ResNo;
break;
@@ -5888,8 +5935,8 @@ void SelectionDAGBuilder::visitInlineAsm(ImmutableCallSite CS) {
OpInfo.CallOperand = getValue(OpInfo.CallOperandVal);
}
- OpVT =
- OpInfo.getCallOperandValEVT(*DAG.getContext(), TLI, DL).getSimpleVT();
+ OpVT = OpInfo.getCallOperandValEVT(*DAG.getContext(), TLI,
+ DAG.getDataLayout()).getSimpleVT();
}
OpInfo.ConstraintVT = OpVT;
@@ -5977,17 +6024,19 @@ void SelectionDAGBuilder::visitInlineAsm(ImmutableCallSite CS) {
const Value *OpVal = OpInfo.CallOperandVal;
if (isa<ConstantFP>(OpVal) || isa<ConstantInt>(OpVal) ||
isa<ConstantVector>(OpVal) || isa<ConstantDataVector>(OpVal)) {
- OpInfo.CallOperand = DAG.getConstantPool(cast<Constant>(OpVal),
- TLI.getPointerTy());
+ OpInfo.CallOperand = DAG.getConstantPool(
+ cast<Constant>(OpVal), TLI.getPointerTy(DAG.getDataLayout()));
} else {
// Otherwise, create a stack slot and emit a store to it before the
// asm.
Type *Ty = OpVal->getType();
- uint64_t TySize = TLI.getDataLayout()->getTypeAllocSize(Ty);
- unsigned Align = TLI.getDataLayout()->getPrefTypeAlignment(Ty);
+ auto &DL = DAG.getDataLayout();
+ uint64_t TySize = DL.getTypeAllocSize(Ty);
+ unsigned Align = DL.getPrefTypeAlignment(Ty);
MachineFunction &MF = DAG.getMachineFunction();
int SSFI = MF.getFrameInfo()->CreateStackObject(TySize, Align, false);
- SDValue StackSlot = DAG.getFrameIndex(SSFI, TLI.getPointerTy());
+ SDValue StackSlot =
+ DAG.getFrameIndex(SSFI, TLI.getPointerTy(DAG.getDataLayout()));
Chain = DAG.getStore(Chain, getCurSDLoc(),
OpInfo.CallOperand, StackSlot,
MachinePointerInfo::getFixedStack(SSFI),
@@ -6022,9 +6071,8 @@ void SelectionDAGBuilder::visitInlineAsm(ImmutableCallSite CS) {
// AsmNodeOperands - The operands for the ISD::INLINEASM node.
std::vector<SDValue> AsmNodeOperands;
AsmNodeOperands.push_back(SDValue()); // reserve space for input chain
- AsmNodeOperands.push_back(
- DAG.getTargetExternalSymbol(IA->getAsmString().c_str(),
- TLI.getPointerTy()));
+ AsmNodeOperands.push_back(DAG.getTargetExternalSymbol(
+ IA->getAsmString().c_str(), TLI.getPointerTy(DAG.getDataLayout())));
// If we have a !srcloc metadata node associated with it, we want to attach
// this to the ultimately generated inline asm machineinstr. To do this, we
@@ -6064,8 +6112,8 @@ void SelectionDAGBuilder::visitInlineAsm(ImmutableCallSite CS) {
}
}
- AsmNodeOperands.push_back(DAG.getTargetConstant(ExtraInfo, getCurSDLoc(),
- TLI.getPointerTy()));
+ AsmNodeOperands.push_back(DAG.getTargetConstant(
+ ExtraInfo, getCurSDLoc(), TLI.getPointerTy(DAG.getDataLayout())));
// Loop over all of the inputs, copying the operand values into the
// appropriate registers and processing the output regs.
@@ -6201,8 +6249,8 @@ void SelectionDAGBuilder::visitInlineAsm(ImmutableCallSite CS) {
OpFlag = InlineAsm::convertMemFlagWordToMatchingFlagWord(OpFlag);
OpFlag = InlineAsm::getFlagWordForMatchingOp(OpFlag,
OpInfo.getMatchedOperand());
- AsmNodeOperands.push_back(DAG.getTargetConstant(OpFlag, getCurSDLoc(),
- TLI.getPointerTy()));
+ AsmNodeOperands.push_back(DAG.getTargetConstant(
+ OpFlag, getCurSDLoc(), TLI.getPointerTy(DAG.getDataLayout())));
AsmNodeOperands.push_back(AsmNodeOperands[CurOp+1]);
break;
}
@@ -6227,16 +6275,16 @@ void SelectionDAGBuilder::visitInlineAsm(ImmutableCallSite CS) {
// Add information to the INLINEASM node to know about this input.
unsigned ResOpType =
InlineAsm::getFlagWord(InlineAsm::Kind_Imm, Ops.size());
- AsmNodeOperands.push_back(DAG.getTargetConstant(ResOpType,
- getCurSDLoc(),
- TLI.getPointerTy()));
+ AsmNodeOperands.push_back(DAG.getTargetConstant(
+ ResOpType, getCurSDLoc(), TLI.getPointerTy(DAG.getDataLayout())));
AsmNodeOperands.insert(AsmNodeOperands.end(), Ops.begin(), Ops.end());
break;
}
if (OpInfo.ConstraintType == TargetLowering::C_Memory) {
assert(OpInfo.isIndirect && "Operand must be indirect to be a mem!");
- assert(InOperandVal.getValueType() == TLI.getPointerTy() &&
+ assert(InOperandVal.getValueType() ==
+ TLI.getPointerTy(DAG.getDataLayout()) &&
"Memory operands expect pointer values");
unsigned ConstraintID =
@@ -6314,7 +6362,7 @@ void SelectionDAGBuilder::visitInlineAsm(ImmutableCallSite CS) {
// FIXME: Why don't we do this for inline asms with MRVs?
if (CS.getType()->isSingleValueType() && CS.getType()->isSized()) {
- EVT ResultType = TLI.getValueType(CS.getType());
+ EVT ResultType = TLI.getValueType(DAG.getDataLayout(), CS.getType());
// If any of the results of the inline asm is a vector, it may have the
// wrong width/num elts. This can happen for register classes that can
@@ -6380,9 +6428,9 @@ void SelectionDAGBuilder::visitVAStart(const CallInst &I) {
void SelectionDAGBuilder::visitVAArg(const VAArgInst &I) {
const TargetLowering &TLI = DAG.getTargetLoweringInfo();
- const DataLayout &DL = *TLI.getDataLayout();
- SDValue V = DAG.getVAArg(TLI.getValueType(I.getType()), getCurSDLoc(),
- getRoot(), getValue(I.getOperand(0)),
+ const DataLayout &DL = DAG.getDataLayout();
+ SDValue V = DAG.getVAArg(TLI.getValueType(DAG.getDataLayout(), I.getType()),
+ getCurSDLoc(), getRoot(), getValue(I.getOperand(0)),
DAG.getSrcValue(I.getOperand(0)),
DL.getABITypeAlignment(I.getType()));
setValue(&I, V);
@@ -6473,8 +6521,8 @@ static void addStackMapLiveVars(ImmutableCallSite CS, unsigned StartIdx,
Builder.DAG.getTargetConstant(C->getSExtValue(), DL, MVT::i64));
} else if (FrameIndexSDNode *FI = dyn_cast<FrameIndexSDNode>(OpVal)) {
const TargetLowering &TLI = Builder.DAG.getTargetLoweringInfo();
- Ops.push_back(
- Builder.DAG.getTargetFrameIndex(FI->getIndex(), TLI.getPointerTy()));
+ Ops.push_back(Builder.DAG.getTargetFrameIndex(
+ FI->getIndex(), TLI.getPointerTy(Builder.DAG.getDataLayout())));
} else
Ops.push_back(OpVal);
}
@@ -6654,7 +6702,7 @@ void SelectionDAGBuilder::visitPatchpoint(ImmutableCallSite CS,
// Create the return types based on the intrinsic definition
const TargetLowering &TLI = DAG.getTargetLoweringInfo();
SmallVector<EVT, 3> ValueVTs;
- ComputeValueVTs(TLI, CS->getType(), ValueVTs);
+ ComputeValueVTs(TLI, DAG.getDataLayout(), CS->getType(), ValueVTs);
assert(ValueVTs.size() == 1 && "Expected only one return value type.");
// There is always a chain and a glue type at the end
@@ -6718,10 +6766,11 @@ TargetLowering::LowerCallTo(TargetLowering::CallLoweringInfo &CLI) const {
Type *OrigRetTy = CLI.RetTy;
SmallVector<EVT, 4> RetTys;
SmallVector<uint64_t, 4> Offsets;
- ComputeValueVTs(*this, CLI.RetTy, RetTys, &Offsets);
+ auto &DL = CLI.DAG.getDataLayout();
+ ComputeValueVTs(*this, DL, CLI.RetTy, RetTys, &Offsets);
SmallVector<ISD::OutputArg, 4> Outs;
- GetReturnInfo(CLI.RetTy, getReturnAttrs(CLI), Outs, *this);
+ GetReturnInfo(CLI.RetTy, getReturnAttrs(CLI), Outs, *this, DL);
bool CanLowerReturn =
this->CanLowerReturn(CLI.CallConv, CLI.DAG.getMachineFunction(),
@@ -6733,13 +6782,13 @@ TargetLowering::LowerCallTo(TargetLowering::CallLoweringInfo &CLI) const {
// FIXME: equivalent assert?
// assert(!CS.hasInAllocaArgument() &&
// "sret demotion is incompatible with inalloca");
- uint64_t TySize = getDataLayout()->getTypeAllocSize(CLI.RetTy);
- unsigned Align = getDataLayout()->getPrefTypeAlignment(CLI.RetTy);
+ uint64_t TySize = DL.getTypeAllocSize(CLI.RetTy);
+ unsigned Align = DL.getPrefTypeAlignment(CLI.RetTy);
MachineFunction &MF = CLI.DAG.getMachineFunction();
DemoteStackIdx = MF.getFrameInfo()->CreateStackObject(TySize, Align, false);
Type *StackSlotPtrType = PointerType::getUnqual(CLI.RetTy);
- DemoteStackSlot = CLI.DAG.getFrameIndex(DemoteStackIdx, getPointerTy());
+ DemoteStackSlot = CLI.DAG.getFrameIndex(DemoteStackIdx, getPointerTy(DL));
ArgListEntry Entry;
Entry.Node = DemoteStackSlot;
Entry.Ty = StackSlotPtrType;
@@ -6784,7 +6833,7 @@ TargetLowering::LowerCallTo(TargetLowering::CallLoweringInfo &CLI) const {
ArgListTy &Args = CLI.getArgs();
for (unsigned i = 0, e = Args.size(); i != e; ++i) {
SmallVector<EVT, 4> ValueVTs;
- ComputeValueVTs(*this, Args[i].Ty, ValueVTs);
+ ComputeValueVTs(*this, DL, Args[i].Ty, ValueVTs);
Type *FinalType = Args[i].Ty;
if (Args[i].isByVal)
FinalType = cast<PointerType>(Args[i].Ty)->getElementType();
@@ -6797,7 +6846,7 @@ TargetLowering::LowerCallTo(TargetLowering::CallLoweringInfo &CLI) const {
SDValue Op = SDValue(Args[i].Node.getNode(),
Args[i].Node.getResNo() + Value);
ISD::ArgFlagsTy Flags;
- unsigned OriginalAlignment = getDataLayout()->getABITypeAlignment(ArgTy);
+ unsigned OriginalAlignment = DL.getABITypeAlignment(ArgTy);
if (Args[i].isZExt)
Flags.setZExt();
@@ -6821,14 +6870,14 @@ TargetLowering::LowerCallTo(TargetLowering::CallLoweringInfo &CLI) const {
if (Args[i].isByVal || Args[i].isInAlloca) {
PointerType *Ty = cast<PointerType>(Args[i].Ty);
Type *ElementTy = Ty->getElementType();
- Flags.setByValSize(getDataLayout()->getTypeAllocSize(ElementTy));
+ Flags.setByValSize(DL.getTypeAllocSize(ElementTy));
// For ByVal, alignment should come from FE. BE will guess if this
// info is not there but there are cases it cannot get right.
unsigned FrameAlign;
if (Args[i].Alignment)
FrameAlign = Args[i].Alignment;
else
- FrameAlign = getByValTypeAlignment(ElementTy);
+ FrameAlign = getByValTypeAlignment(ElementTy, DL);
Flags.setByValAlign(FrameAlign);
}
if (Args[i].isNest)
@@ -6923,7 +6972,7 @@ TargetLowering::LowerCallTo(TargetLowering::CallLoweringInfo &CLI) const {
SmallVector<EVT, 1> PVTs;
Type *PtrRetTy = PointerType::getUnqual(OrigRetTy);
- ComputeValueVTs(*this, PtrRetTy, PVTs);
+ ComputeValueVTs(*this, DL, PtrRetTy, PVTs);
assert(PVTs.size() == 1 && "Pointers should fit in one register");
EVT PtrVT = PVTs[0];
@@ -6997,7 +7046,8 @@ SelectionDAGBuilder::CopyValueToVirtualRegister(const Value *V, unsigned Reg) {
assert(!TargetRegisterInfo::isPhysicalRegister(Reg) && "Is a physreg");
const TargetLowering &TLI = DAG.getTargetLoweringInfo();
- RegsForValue RFV(V->getContext(), TLI, Reg, V->getType());
+ RegsForValue RFV(V->getContext(), TLI, DAG.getDataLayout(), Reg,
+ V->getType());
SDValue Chain = DAG.getEntryNode();
ISD::NodeType ExtendType = (FuncInfo.PreferredExtendType.find(V) ==
@@ -7030,13 +7080,14 @@ static bool isOnlyUsedInEntryBlock(const Argument *A, bool FastISel) {
void SelectionDAGISel::LowerArguments(const Function &F) {
SelectionDAG &DAG = SDB->DAG;
SDLoc dl = SDB->getCurSDLoc();
- const DataLayout *DL = TLI->getDataLayout();
+ const DataLayout &DL = DAG.getDataLayout();
SmallVector<ISD::InputArg, 16> Ins;
if (!FuncInfo->CanLowerReturn) {
// Put in an sret pointer parameter before all the other parameters.
SmallVector<EVT, 1> ValueVTs;
- ComputeValueVTs(*TLI, PointerType::getUnqual(F.getReturnType()), ValueVTs);
+ ComputeValueVTs(*TLI, DAG.getDataLayout(),
+ PointerType::getUnqual(F.getReturnType()), ValueVTs);
// NOTE: Assuming that a pointer will never break down to more than one VT
// or one register.
@@ -7053,7 +7104,7 @@ void SelectionDAGISel::LowerArguments(const Function &F) {
for (Function::const_arg_iterator I = F.arg_begin(), E = F.arg_end();
I != E; ++I, ++Idx) {
SmallVector<EVT, 4> ValueVTs;
- ComputeValueVTs(*TLI, I->getType(), ValueVTs);
+ ComputeValueVTs(*TLI, DAG.getDataLayout(), I->getType(), ValueVTs);
bool isArgValueUsed = !I->use_empty();
unsigned PartBase = 0;
Type *FinalType = I->getType();
@@ -7066,7 +7117,7 @@ void SelectionDAGISel::LowerArguments(const Function &F) {
EVT VT = ValueVTs[Value];
Type *ArgTy = VT.getTypeForEVT(*DAG.getContext());
ISD::ArgFlagsTy Flags;
- unsigned OriginalAlignment = DL->getABITypeAlignment(ArgTy);
+ unsigned OriginalAlignment = DL.getABITypeAlignment(ArgTy);
if (F.getAttributes().hasAttribute(Idx, Attribute::ZExt))
Flags.setZExt();
@@ -7090,14 +7141,14 @@ void SelectionDAGISel::LowerArguments(const Function &F) {
if (Flags.isByVal() || Flags.isInAlloca()) {
PointerType *Ty = cast<PointerType>(I->getType());
Type *ElementTy = Ty->getElementType();
- Flags.setByValSize(DL->getTypeAllocSize(ElementTy));
+ Flags.setByValSize(DL.getTypeAllocSize(ElementTy));
// For ByVal, alignment should be passed from FE. BE will guess if
// this info is not there but there are cases it cannot get right.
unsigned FrameAlign;
if (F.getParamAlignment(Idx))
FrameAlign = F.getParamAlignment(Idx);
else
- FrameAlign = TLI->getByValTypeAlignment(ElementTy);
+ FrameAlign = TLI->getByValTypeAlignment(ElementTy, DL);
Flags.setByValAlign(FrameAlign);
}
if (F.getAttributes().hasAttribute(Idx, Attribute::Nest))
@@ -7153,7 +7204,8 @@ void SelectionDAGISel::LowerArguments(const Function &F) {
// Create a virtual register for the sret pointer, and put in a copy
// from the sret argument into it.
SmallVector<EVT, 1> ValueVTs;
- ComputeValueVTs(*TLI, PointerType::getUnqual(F.getReturnType()), ValueVTs);
+ ComputeValueVTs(*TLI, DAG.getDataLayout(),
+ PointerType::getUnqual(F.getReturnType()), ValueVTs);
MVT VT = ValueVTs[0].getSimpleVT();
MVT RegVT = TLI->getRegisterType(*CurDAG->getContext(), VT);
ISD::NodeType AssertOp = ISD::DELETED_NODE;
@@ -7177,7 +7229,7 @@ void SelectionDAGISel::LowerArguments(const Function &F) {
++I, ++Idx) {
SmallVector<SDValue, 4> ArgValues;
SmallVector<EVT, 4> ValueVTs;
- ComputeValueVTs(*TLI, I->getType(), ValueVTs);
+ ComputeValueVTs(*TLI, DAG.getDataLayout(), I->getType(), ValueVTs);
unsigned NumValues = ValueVTs.size();
// If this argument is unused then remember its value. It is used to generate
@@ -7324,7 +7376,7 @@ SelectionDAGBuilder::HandlePHINodesInSuccessorBlocks(const BasicBlock *LLVMBB) {
// the input for this MBB.
SmallVector<EVT, 4> ValueVTs;
const TargetLowering &TLI = DAG.getTargetLoweringInfo();
- ComputeValueVTs(TLI, PN->getType(), ValueVTs);
+ ComputeValueVTs(TLI, DAG.getDataLayout(), PN->getType(), ValueVTs);
for (unsigned vti = 0, vte = ValueVTs.size(); vti != vte; ++vti) {
EVT VT = ValueVTs[vti];
unsigned NumRegisters = TLI.getNumRegisters(*DAG.getContext(), VT);
@@ -7595,7 +7647,7 @@ void SelectionDAGBuilder::findJumpTables(CaseClusterVector &Clusters,
bool SelectionDAGBuilder::rangeFitsInWord(const APInt &Low, const APInt &High) {
// FIXME: Using the pointer type doesn't seem ideal.
- uint64_t BW = DAG.getTargetLoweringInfo().getPointerTy().getSizeInBits();
+ uint64_t BW = DAG.getDataLayout().getPointerSizeInBits();
uint64_t Range = (High - Low).getLimitedValue(UINT64_MAX - 1) + 1;
return Range <= BW;
}
@@ -7650,8 +7702,9 @@ bool SelectionDAGBuilder::buildBitTests(CaseClusterVector &Clusters,
APInt LowBound;
APInt CmpRange;
- const int BitWidth =
- DAG.getTargetLoweringInfo().getPointerTy().getSizeInBits();
+ const int BitWidth = DAG.getTargetLoweringInfo()
+ .getPointerTy(DAG.getDataLayout())
+ .getSizeInBits();
assert(rangeFitsInWord(Low, High) && "Case range must fit in bit mask!");
if (Low.isNonNegative() && High.slt(BitWidth)) {
@@ -7731,7 +7784,7 @@ void SelectionDAGBuilder::findBitTestClusters(CaseClusterVector &Clusters,
// If target does not have legal shift left, do not emit bit tests at all.
const TargetLowering &TLI = DAG.getTargetLoweringInfo();
- EVT PTy = TLI.getPointerTy();
+ EVT PTy = TLI.getPointerTy(DAG.getDataLayout());
if (!TLI.isOperationLegal(ISD::SHL, PTy))
return;
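
The SelectionDAGBuilder.cpp hunks above are all instances of one mechanical change: instead of going through TargetLowering's cached layout (the old TLI.getDataLayout() / getDataLayout() calls), type queries such as getValueType, getPointerTy, getVectorIdxTy, getShiftAmountTy, getSimpleValueType and ComputeValueVTs now receive the module's DataLayout explicitly, normally as DAG.getDataLayout(). A minimal sketch of the new call shape, assuming only the signatures visible in the hunks; the helper name and include list are illustrative, not part of the patch:

// Illustrative sketch, not part of the patch: the post-change call pattern.
#include "llvm/ADT/SmallVector.h"
#include "llvm/CodeGen/Analysis.h"          // ComputeValueVTs
#include "llvm/CodeGen/SelectionDAG.h"
#include "llvm/IR/Instruction.h"
#include "llvm/Target/TargetLowering.h"

using namespace llvm;

// Hypothetical helper mirroring what visitTrunc/visitZExt/... now do.
static EVT exampleDestVT(SelectionDAG &DAG, const Instruction &I) {
  const TargetLowering &TLI = DAG.getTargetLoweringInfo();
  const DataLayout &DL = DAG.getDataLayout();       // the DAG supplies the layout

  MVT PtrTy = TLI.getPointerTy(DL);                 // was TLI.getPointerTy()
  (void)PtrTy;

  SmallVector<EVT, 4> ValueVTs;
  ComputeValueVTs(TLI, DL, I.getType(), ValueVTs);  // DL threaded explicitly

  return TLI.getValueType(DL, I.getType());         // was getValueType(I.getType())
}
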
diff --git a/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.h b/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.h
index f225d54d189d..700675453fe7 100644
--- a/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.h
+++ b/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.h
@@ -755,8 +755,6 @@ public:
void visitJumpTable(JumpTable &JT);
void visitJumpTableHeader(JumpTable &JT, JumpTableHeader &JTH,
MachineBasicBlock *SwitchBB);
- unsigned visitLandingPadClauseBB(GlobalValue *ClauseGV,
- MachineBasicBlock *LPadMBB);
private:
// These all get lowered before this pass.
@@ -915,8 +913,8 @@ struct RegsForValue {
RegsForValue(const SmallVector<unsigned, 4> &regs, MVT regvt, EVT valuevt);
- RegsForValue(LLVMContext &Context, const TargetLowering &tli, unsigned Reg,
- Type *Ty);
+ RegsForValue(LLVMContext &Context, const TargetLowering &TLI,
+ const DataLayout &DL, unsigned Reg, Type *Ty);
/// append - Add the specified values to this one.
void append(const RegsForValue &RHS) {
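
The SelectionDAGBuilder.h hunk above is the matching declaration change: RegsForValue now takes the DataLayout next to the TargetLowering, and both users in this patch (CopyValueToVirtualRegister earlier and the statepoint lowering further down) pass DAG.getDataLayout(). A hedged sketch of the updated construction; the wrapper function and parameter names are placeholders, and since RegsForValue lives in this private header the snippet would only build inside lib/CodeGen/SelectionDAG:

// Sketch only: the DataLayout argument is the new part of the constructor.
#include "SelectionDAGBuilder.h"
#include "llvm/CodeGen/SelectionDAG.h"
#include "llvm/IR/Value.h"

using namespace llvm;

static void copyToVRegExample(SelectionDAG &DAG, const Value *V, unsigned Reg) {
  RegsForValue RFV(*DAG.getContext(), DAG.getTargetLoweringInfo(),
                   DAG.getDataLayout(), Reg, V->getType());
  (void)RFV;
}
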
diff --git a/lib/CodeGen/SelectionDAG/SelectionDAGDumper.cpp b/lib/CodeGen/SelectionDAG/SelectionDAGDumper.cpp
index ef468a2b1c54..5b9b18286fae 100644
--- a/lib/CodeGen/SelectionDAG/SelectionDAGDumper.cpp
+++ b/lib/CodeGen/SelectionDAG/SelectionDAGDumper.cpp
@@ -95,7 +95,7 @@ std::string SDNode::getOperationName(const SelectionDAG *G) const {
case ISD::GLOBAL_OFFSET_TABLE: return "GLOBAL_OFFSET_TABLE";
case ISD::RETURNADDR: return "RETURNADDR";
case ISD::FRAMEADDR: return "FRAMEADDR";
- case ISD::FRAME_ALLOC_RECOVER: return "FRAME_ALLOC_RECOVER";
+ case ISD::LOCAL_RECOVER: return "LOCAL_RECOVER";
case ISD::READ_REGISTER: return "READ_REGISTER";
case ISD::WRITE_REGISTER: return "WRITE_REGISTER";
case ISD::FRAME_TO_ARGS_OFFSET: return "FRAME_TO_ARGS_OFFSET";
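
The dumper change above is the user-visible end of the second theme in this patch: the frame-escape machinery is renamed, so the llvm.frameescape / llvm.framerecover intrinsics become llvm.localescape / llvm.localrecover, the TargetOpcode::FRAME_ALLOC pseudo becomes TargetOpcode::LOCAL_ESCAPE, and the node printed here becomes ISD::LOCAL_RECOVER. A tiny sketch written against the renamed node opcode; the predicate itself is illustrative, not part of the patch:

// Illustrative only: before this patch the opcode was ISD::FRAME_ALLOC_RECOVER.
#include "llvm/CodeGen/SelectionDAGNodes.h"

static bool isLocalRecover(const llvm::SDNode &N) {
  return N.getOpcode() == llvm::ISD::LOCAL_RECOVER;
}
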
diff --git a/lib/CodeGen/SelectionDAG/SelectionDAGISel.cpp b/lib/CodeGen/SelectionDAG/SelectionDAGISel.cpp
index 31f8210f40f0..97ece8b9248a 100644
--- a/lib/CodeGen/SelectionDAG/SelectionDAGISel.cpp
+++ b/lib/CodeGen/SelectionDAG/SelectionDAGISel.cpp
@@ -921,7 +921,8 @@ void SelectionDAGISel::DoInstructionSelection() {
bool SelectionDAGISel::PrepareEHLandingPad() {
MachineBasicBlock *MBB = FuncInfo->MBB;
- const TargetRegisterClass *PtrRC = TLI->getRegClassFor(TLI->getPointerTy());
+ const TargetRegisterClass *PtrRC =
+ TLI->getRegClassFor(TLI->getPointerTy(CurDAG->getDataLayout()));
// Add a label to mark the beginning of the landing pad. Deletion of the
// landing pad can thus be detected via the MachineModuleInfo.
@@ -1931,7 +1932,8 @@ SDNode
MDNodeSDNode *MD = dyn_cast<MDNodeSDNode>(Op->getOperand(1));
const MDString *RegStr = dyn_cast<MDString>(MD->getMD()->getOperand(0));
unsigned Reg =
- TLI->getRegisterByName(RegStr->getString().data(), Op->getValueType(0));
+ TLI->getRegisterByName(RegStr->getString().data(), Op->getValueType(0),
+ *CurDAG);
SDValue New = CurDAG->getCopyFromReg(
Op->getOperand(0), dl, Reg, Op->getValueType(0));
New->setNodeId(-1);
@@ -1944,7 +1946,8 @@ SDNode
MDNodeSDNode *MD = dyn_cast<MDNodeSDNode>(Op->getOperand(1));
const MDString *RegStr = dyn_cast<MDString>(MD->getMD()->getOperand(0));
unsigned Reg = TLI->getRegisterByName(RegStr->getString().data(),
- Op->getOperand(2).getValueType());
+ Op->getOperand(2).getValueType(),
+ *CurDAG);
SDValue New = CurDAG->getCopyToReg(
Op->getOperand(0), dl, Reg, Op->getOperand(2));
New->setNodeId(-1);
@@ -2329,21 +2332,23 @@ CheckOpcode(const unsigned char *MatcherTable, unsigned &MatcherIndex,
}
LLVM_ATTRIBUTE_ALWAYS_INLINE static bool
-CheckType(const unsigned char *MatcherTable, unsigned &MatcherIndex,
- SDValue N, const TargetLowering *TLI) {
+CheckType(const unsigned char *MatcherTable, unsigned &MatcherIndex, SDValue N,
+ const TargetLowering *TLI, const DataLayout &DL) {
MVT::SimpleValueType VT = (MVT::SimpleValueType)MatcherTable[MatcherIndex++];
if (N.getValueType() == VT) return true;
// Handle the case when VT is iPTR.
- return VT == MVT::iPTR && N.getValueType() == TLI->getPointerTy();
+ return VT == MVT::iPTR && N.getValueType() == TLI->getPointerTy(DL);
}
LLVM_ATTRIBUTE_ALWAYS_INLINE static bool
CheckChildType(const unsigned char *MatcherTable, unsigned &MatcherIndex,
- SDValue N, const TargetLowering *TLI, unsigned ChildNo) {
+ SDValue N, const TargetLowering *TLI, const DataLayout &DL,
+ unsigned ChildNo) {
if (ChildNo >= N.getNumOperands())
return false; // Match fails if out of range child #.
- return ::CheckType(MatcherTable, MatcherIndex, N.getOperand(ChildNo), TLI);
+ return ::CheckType(MatcherTable, MatcherIndex, N.getOperand(ChildNo), TLI,
+ DL);
}
LLVM_ATTRIBUTE_ALWAYS_INLINE static bool
@@ -2355,13 +2360,13 @@ CheckCondCode(const unsigned char *MatcherTable, unsigned &MatcherIndex,
LLVM_ATTRIBUTE_ALWAYS_INLINE static bool
CheckValueType(const unsigned char *MatcherTable, unsigned &MatcherIndex,
- SDValue N, const TargetLowering *TLI) {
+ SDValue N, const TargetLowering *TLI, const DataLayout &DL) {
MVT::SimpleValueType VT = (MVT::SimpleValueType)MatcherTable[MatcherIndex++];
if (cast<VTSDNode>(N)->getVT() == VT)
return true;
// Handle the case when VT is iPTR.
- return VT == MVT::iPTR && cast<VTSDNode>(N)->getVT() == TLI->getPointerTy();
+ return VT == MVT::iPTR && cast<VTSDNode>(N)->getVT() == TLI->getPointerTy(DL);
}
LLVM_ATTRIBUTE_ALWAYS_INLINE static bool
@@ -2444,7 +2449,8 @@ static unsigned IsPredicateKnownToFail(const unsigned char *Table,
Result = !::CheckOpcode(Table, Index, N.getNode());
return Index;
case SelectionDAGISel::OPC_CheckType:
- Result = !::CheckType(Table, Index, N, SDISel.TLI);
+ Result = !::CheckType(Table, Index, N, SDISel.TLI,
+ SDISel.CurDAG->getDataLayout());
return Index;
case SelectionDAGISel::OPC_CheckChild0Type:
case SelectionDAGISel::OPC_CheckChild1Type:
@@ -2454,15 +2460,16 @@ static unsigned IsPredicateKnownToFail(const unsigned char *Table,
case SelectionDAGISel::OPC_CheckChild5Type:
case SelectionDAGISel::OPC_CheckChild6Type:
case SelectionDAGISel::OPC_CheckChild7Type:
- Result = !::CheckChildType(Table, Index, N, SDISel.TLI,
- Table[Index - 1] -
- SelectionDAGISel::OPC_CheckChild0Type);
+ Result = !::CheckChildType(
+ Table, Index, N, SDISel.TLI, SDISel.CurDAG->getDataLayout(),
+ Table[Index - 1] - SelectionDAGISel::OPC_CheckChild0Type);
return Index;
case SelectionDAGISel::OPC_CheckCondCode:
Result = !::CheckCondCode(Table, Index, N);
return Index;
case SelectionDAGISel::OPC_CheckValueType:
- Result = !::CheckValueType(Table, Index, N, SDISel.TLI);
+ Result = !::CheckValueType(Table, Index, N, SDISel.TLI,
+ SDISel.CurDAG->getDataLayout());
return Index;
case SelectionDAGISel::OPC_CheckInteger:
Result = !::CheckInteger(Table, Index, N);
@@ -2816,7 +2823,8 @@ SelectCodeCommon(SDNode *NodeToMatch, const unsigned char *MatcherTable,
continue;
case OPC_CheckType:
- if (!::CheckType(MatcherTable, MatcherIndex, N, TLI))
+ if (!::CheckType(MatcherTable, MatcherIndex, N, TLI,
+ CurDAG->getDataLayout()))
break;
continue;
@@ -2864,7 +2872,7 @@ SelectCodeCommon(SDNode *NodeToMatch, const unsigned char *MatcherTable,
MVT CaseVT = (MVT::SimpleValueType)MatcherTable[MatcherIndex++];
if (CaseVT == MVT::iPTR)
- CaseVT = TLI->getPointerTy();
+ CaseVT = TLI->getPointerTy(CurDAG->getDataLayout());
// If the VT matches, then we will execute this case.
if (CurNodeVT == CaseVT)
@@ -2887,14 +2895,16 @@ SelectCodeCommon(SDNode *NodeToMatch, const unsigned char *MatcherTable,
case OPC_CheckChild4Type: case OPC_CheckChild5Type:
case OPC_CheckChild6Type: case OPC_CheckChild7Type:
if (!::CheckChildType(MatcherTable, MatcherIndex, N, TLI,
- Opcode-OPC_CheckChild0Type))
+ CurDAG->getDataLayout(),
+ Opcode - OPC_CheckChild0Type))
break;
continue;
case OPC_CheckCondCode:
if (!::CheckCondCode(MatcherTable, MatcherIndex, N)) break;
continue;
case OPC_CheckValueType:
- if (!::CheckValueType(MatcherTable, MatcherIndex, N, TLI))
+ if (!::CheckValueType(MatcherTable, MatcherIndex, N, TLI,
+ CurDAG->getDataLayout()))
break;
continue;
case OPC_CheckInteger:
@@ -3097,7 +3107,7 @@ SelectCodeCommon(SDNode *NodeToMatch, const unsigned char *MatcherTable,
MVT::SimpleValueType VT =
(MVT::SimpleValueType)MatcherTable[MatcherIndex++];
if (VT == MVT::iPTR)
- VT = TLI->getPointerTy().SimpleTy;
+ VT = TLI->getPointerTy(CurDAG->getDataLayout()).SimpleTy;
VTs.push_back(VT);
}
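
In SelectionDAGISel.cpp the same layout threading reaches the table-driven matcher: CheckType, CheckChildType and CheckValueType grow a const DataLayout & parameter so that the MVT::iPTR wildcard can be resolved to the target's pointer type, and getRegisterByName is handed the SelectionDAG for the same reason. A hedged sketch of the iPTR handling these helpers perform, assuming only the signatures shown in the hunks; the free-standing matchesVT function is illustrative, not part of the patch:

// Illustrative sketch of the iPTR resolution done by the updated CheckType.
#include "llvm/CodeGen/SelectionDAG.h"
#include "llvm/Target/TargetLowering.h"

using namespace llvm;

static bool matchesVT(SDValue N, MVT::SimpleValueType VT,
                      const TargetLowering &TLI, const SelectionDAG &DAG) {
  if (N.getValueType() == VT)
    return true;
  // iPTR in the matcher table stands for the target pointer type, which now
  // comes from the DAG's DataLayout rather than a cached copy in TLI.
  return VT == MVT::iPTR &&
         N.getValueType() == TLI.getPointerTy(DAG.getDataLayout());
}
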
diff --git a/lib/CodeGen/SelectionDAG/StatepointLowering.cpp b/lib/CodeGen/SelectionDAG/StatepointLowering.cpp
index bd40cac95543..34688df4765b 100644
--- a/lib/CodeGen/SelectionDAG/StatepointLowering.cpp
+++ b/lib/CodeGen/SelectionDAG/StatepointLowering.cpp
@@ -337,9 +337,9 @@ lowerCallFromStatepoint(ImmutableStatepoint ISP, MachineBasicBlock *LandingPad,
// TODO: To eliminate this problem we can remove gc.result intrinsics
// completelly and make statepoint call to return a tuple.
unsigned Reg = Builder.FuncInfo.CreateRegs(ISP.getActualReturnType());
- RegsForValue RFV(*Builder.DAG.getContext(),
- Builder.DAG.getTargetLoweringInfo(), Reg,
- ISP.getActualReturnType());
+ RegsForValue RFV(
+ *Builder.DAG.getContext(), Builder.DAG.getTargetLoweringInfo(),
+ Builder.DAG.getDataLayout(), Reg, ISP.getActualReturnType());
SDValue Chain = Builder.DAG.getEntryNode();
RFV.getCopyToRegs(ReturnValue, Builder.DAG, Builder.getCurSDLoc(), Chain,
diff --git a/lib/CodeGen/SelectionDAG/TargetLowering.cpp b/lib/CodeGen/SelectionDAG/TargetLowering.cpp
index e7722b392a81..fbf651277c7f 100644
--- a/lib/CodeGen/SelectionDAG/TargetLowering.cpp
+++ b/lib/CodeGen/SelectionDAG/TargetLowering.cpp
@@ -102,7 +102,8 @@ TargetLowering::makeLibCall(SelectionDAG &DAG,
}
if (LC == RTLIB::UNKNOWN_LIBCALL)
report_fatal_error("Unsupported library call operation!");
- SDValue Callee = DAG.getExternalSymbol(getLibcallName(LC), getPointerTy());
+ SDValue Callee = DAG.getExternalSymbol(getLibcallName(LC),
+ getPointerTy(DAG.getDataLayout()));
Type *RetTy = RetVT.getTypeForEVT(*DAG.getContext());
TargetLowering::CallLoweringInfo CLI(DAG);
@@ -206,14 +207,16 @@ void TargetLowering::softenSetCCOperands(SelectionDAG &DAG, EVT VT,
NewRHS = DAG.getConstant(0, dl, RetVT);
CCCode = getCmpLibcallCC(LC1);
if (LC2 != RTLIB::UNKNOWN_LIBCALL) {
- SDValue Tmp = DAG.getNode(ISD::SETCC, dl,
- getSetCCResultType(*DAG.getContext(), RetVT),
- NewLHS, NewRHS, DAG.getCondCode(CCCode));
+ SDValue Tmp = DAG.getNode(
+ ISD::SETCC, dl,
+ getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), RetVT),
+ NewLHS, NewRHS, DAG.getCondCode(CCCode));
NewLHS = makeLibCall(DAG, LC2, RetVT, Ops, 2, false/*sign irrelevant*/,
dl).first;
- NewLHS = DAG.getNode(ISD::SETCC, dl,
- getSetCCResultType(*DAG.getContext(), RetVT), NewLHS,
- NewRHS, DAG.getCondCode(getCmpLibcallCC(LC2)));
+ NewLHS = DAG.getNode(
+ ISD::SETCC, dl,
+ getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), RetVT),
+ NewLHS, NewRHS, DAG.getCondCode(getCmpLibcallCC(LC2)));
NewLHS = DAG.getNode(ISD::OR, dl, Tmp.getValueType(), Tmp, NewLHS);
NewRHS = SDValue();
}
@@ -242,7 +245,7 @@ SDValue TargetLowering::getPICJumpTableRelocBase(SDValue Table,
if ((JTEncoding == MachineJumpTableInfo::EK_GPRel64BlockAddress) ||
(JTEncoding == MachineJumpTableInfo::EK_GPRel32BlockAddress))
- return DAG.getGLOBAL_OFFSET_TABLE(getPointerTy(0));
+ return DAG.getGLOBAL_OFFSET_TABLE(getPointerTy(DAG.getDataLayout()));
return Table;
}
@@ -265,9 +268,7 @@ TargetLowering::isOffsetFoldingLegal(const GlobalAddressSDNode *GA) const {
// In dynamic-no-pic mode, assume that known defined values are safe.
if (getTargetMachine().getRelocationModel() == Reloc::DynamicNoPIC &&
- GA &&
- !GA->getGlobal()->isDeclaration() &&
- !GA->getGlobal()->isWeakForLinker())
+ GA && GA->getGlobal()->isStrongDefinitionForLinker())
return true;
// Otherwise assume nothing is safe.
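The fold-safety test above is collapsed into GlobalValue::isStrongDefinitionForLinker(). As spelled by the condition it replaces, the predicate is assumed to be equivalent to the following sketch (not the library's actual implementation):

    // Assumed equivalence, mirroring the removed two-part check.
    static bool isStrongDefinitionForLinkerSketch(const llvm::GlobalValue &GV) {
      // A strong definition is one that is actually defined in this module
      // and cannot be replaced or dropped by the linker.
      return !GV.isDeclaration() && !GV.isWeakForLinker();
    }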
@@ -383,6 +384,7 @@ bool TargetLowering::SimplifyDemandedBits(SDValue Op,
"Mask size mismatches value type size!");
APInt NewMask = DemandedMask;
SDLoc dl(Op);
+ auto &DL = TLO.DAG.getDataLayout();
// Don't know anything.
KnownZero = KnownOne = APInt(BitWidth, 0);
@@ -645,7 +647,7 @@ bool TargetLowering::SimplifyDemandedBits(SDValue Op,
unsigned InnerBits = InnerVT.getSizeInBits();
if (ShAmt < InnerBits && NewMask.lshr(InnerBits) == 0 &&
isTypeDesirableForOp(ISD::SHL, InnerVT)) {
- EVT ShTy = getShiftAmountTy(InnerVT);
+ EVT ShTy = getShiftAmountTy(InnerVT, DL);
if (!APInt(BitWidth, ShAmt).isIntN(ShTy.getSizeInBits()))
ShTy = InnerVT;
SDValue NarrowShl =
@@ -824,7 +826,7 @@ bool TargetLowering::SimplifyDemandedBits(SDValue Op,
// for scalar types after legalization.
EVT ShiftAmtTy = Op.getValueType();
if (TLO.LegalTypes() && !ShiftAmtTy.isVector())
- ShiftAmtTy = getShiftAmountTy(ShiftAmtTy);
+ ShiftAmtTy = getShiftAmountTy(ShiftAmtTy, DL);
SDValue ShiftAmt = TLO.DAG.getConstant(BitWidth - ShAmt, dl,
ShiftAmtTy);
@@ -1009,8 +1011,8 @@ bool TargetLowering::SimplifyDemandedBits(SDValue Op,
SDValue Shift = In.getOperand(1);
if (TLO.LegalTypes()) {
uint64_t ShVal = ShAmt->getZExtValue();
- Shift =
- TLO.DAG.getConstant(ShVal, dl, getShiftAmountTy(Op.getValueType()));
+ Shift = TLO.DAG.getConstant(ShVal, dl,
+ getShiftAmountTy(Op.getValueType(), DL));
}
APInt HighBits = APInt::getHighBitsSet(OperandBitWidth,
@@ -1400,7 +1402,7 @@ TargetLowering::SimplifySetCC(EVT VT, SDValue N0, SDValue N1,
APInt newMask = APInt::getLowBitsSet(maskWidth, width);
for (unsigned offset=0; offset<origWidth/width; offset++) {
if ((newMask & Mask) == Mask) {
- if (!getDataLayout()->isLittleEndian())
+ if (!DAG.getDataLayout().isLittleEndian())
bestOffset = (origWidth/width - offset - 1) * (width/8);
else
bestOffset = (uint64_t)offset * (width/8);
@@ -1473,7 +1475,8 @@ TargetLowering::SimplifySetCC(EVT VT, SDValue N0, SDValue N1,
if (DCI.isBeforeLegalizeOps() ||
(isOperationLegal(ISD::SETCC, newVT) &&
getCondCodeAction(Cond, newVT.getSimpleVT()) == Legal)) {
- EVT NewSetCCVT = getSetCCResultType(*DAG.getContext(), newVT);
+ EVT NewSetCCVT =
+ getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), newVT);
SDValue NewConst = DAG.getConstant(C1.trunc(InSize), dl, newVT);
SDValue NewSetCC = DAG.getSetCC(dl, NewSetCCVT, N0.getOperand(0),
@@ -1692,11 +1695,13 @@ TargetLowering::SimplifySetCC(EVT VT, SDValue N0, SDValue N1,
if ((Cond == ISD::SETEQ || Cond == ISD::SETNE) &&
(VT == N0.getValueType() ||
(isTypeLegal(VT) && VT.bitsLE(N0.getValueType()))) &&
- N0.getOpcode() == ISD::AND)
+ N0.getOpcode() == ISD::AND) {
+ auto &DL = DAG.getDataLayout();
if (ConstantSDNode *AndRHS =
dyn_cast<ConstantSDNode>(N0.getOperand(1))) {
- EVT ShiftTy = DCI.isBeforeLegalize() ?
- getPointerTy() : getShiftAmountTy(N0.getValueType());
+ EVT ShiftTy = DCI.isBeforeLegalize()
+ ? getPointerTy(DL)
+ : getShiftAmountTy(N0.getValueType(), DL);
if (Cond == ISD::SETNE && C1 == 0) {// (X & 8) != 0 --> (X & 8) >> 3
// Perform the xform if the AND RHS is a single bit.
if (AndRHS->getAPIntValue().isPowerOf2()) {
@@ -1716,6 +1721,7 @@ TargetLowering::SimplifySetCC(EVT VT, SDValue N0, SDValue N1,
}
}
}
+ }
if (C1.getMinSignedBits() <= 64 &&
!isLegalICmpImmediate(C1.getSExtValue())) {
@@ -1727,8 +1733,10 @@ TargetLowering::SimplifySetCC(EVT VT, SDValue N0, SDValue N1,
const APInt &AndRHSC = AndRHS->getAPIntValue();
if ((-AndRHSC).isPowerOf2() && (AndRHSC & C1) == C1) {
unsigned ShiftBits = AndRHSC.countTrailingZeros();
- EVT ShiftTy = DCI.isBeforeLegalize() ?
- getPointerTy() : getShiftAmountTy(N0.getValueType());
+ auto &DL = DAG.getDataLayout();
+ EVT ShiftTy = DCI.isBeforeLegalize()
+ ? getPointerTy(DL)
+ : getShiftAmountTy(N0.getValueType(), DL);
EVT CmpTy = N0.getValueType();
SDValue Shift = DAG.getNode(ISD::SRL, dl, CmpTy, N0.getOperand(0),
DAG.getConstant(ShiftBits, dl,
@@ -1757,8 +1765,10 @@ TargetLowering::SimplifySetCC(EVT VT, SDValue N0, SDValue N1,
NewC = NewC.lshr(ShiftBits);
if (ShiftBits && NewC.getMinSignedBits() <= 64 &&
isLegalICmpImmediate(NewC.getSExtValue())) {
- EVT ShiftTy = DCI.isBeforeLegalize() ?
- getPointerTy() : getShiftAmountTy(N0.getValueType());
+ auto &DL = DAG.getDataLayout();
+ EVT ShiftTy = DCI.isBeforeLegalize()
+ ? getPointerTy(DL)
+ : getShiftAmountTy(N0.getValueType(), DL);
EVT CmpTy = N0.getValueType();
SDValue Shift = DAG.getNode(ISD::SRL, dl, CmpTy, N0,
DAG.getConstant(ShiftBits, dl, ShiftTy));
@@ -1945,10 +1955,12 @@ TargetLowering::SimplifySetCC(EVT VT, SDValue N0, SDValue N1,
Cond);
if (N0.getNode()->hasOneUse()) {
assert(N0.getOpcode() == ISD::SUB && "Unexpected operation!");
+ auto &DL = DAG.getDataLayout();
// (Z-X) == X --> Z == X<<1
- SDValue SH = DAG.getNode(ISD::SHL, dl, N1.getValueType(), N1,
- DAG.getConstant(1, dl,
- getShiftAmountTy(N1.getValueType())));
+ SDValue SH = DAG.getNode(
+ ISD::SHL, dl, N1.getValueType(), N1,
+ DAG.getConstant(1, dl,
+ getShiftAmountTy(N1.getValueType(), DL)));
if (!DCI.isCalledByLegalizer())
DCI.AddToWorklist(SH.getNode());
return DAG.getSetCC(dl, VT, N0.getOperand(0), SH, Cond);
@@ -1969,10 +1981,11 @@ TargetLowering::SimplifySetCC(EVT VT, SDValue N0, SDValue N1,
DAG.getConstant(0, dl, N1.getValueType()), Cond);
if (N1.getNode()->hasOneUse()) {
assert(N1.getOpcode() == ISD::SUB && "Unexpected operation!");
+ auto &DL = DAG.getDataLayout();
// X == (Z-X) --> X<<1 == Z
- SDValue SH = DAG.getNode(ISD::SHL, dl, N1.getValueType(), N0,
- DAG.getConstant(1, dl,
- getShiftAmountTy(N0.getValueType())));
+ SDValue SH = DAG.getNode(
+ ISD::SHL, dl, N1.getValueType(), N0,
+ DAG.getConstant(1, dl, getShiftAmountTy(N0.getValueType(), DL)));
if (!DCI.isCalledByLegalizer())
DCI.AddToWorklist(SH.getNode());
return DAG.getSetCC(dl, VT, SH, N1.getOperand(0), Cond);
@@ -2105,9 +2118,8 @@ PerformDAGCombine(SDNode *N, DAGCombinerInfo &DCI) const {
// Inline Assembler Implementation Methods
//===----------------------------------------------------------------------===//
-
TargetLowering::ConstraintType
-TargetLowering::getConstraintType(const std::string &Constraint) const {
+TargetLowering::getConstraintType(StringRef Constraint) const {
unsigned S = Constraint.size();
if (S == 1) {
@@ -2140,7 +2152,7 @@ TargetLowering::getConstraintType(const std::string &Constraint) const {
}
if (S > 1 && Constraint[0] == '{' && Constraint[S-1] == '}') {
- if (S == 8 && !Constraint.compare(1, 6, "memory", 6)) // "{memory}"
+ if (S == 8 && Constraint.substr(1, 6) == "memory") // "{memory}"
return C_Memory;
return C_Register;
}
@@ -2206,8 +2218,8 @@ void TargetLowering::LowerAsmOperandForConstraint(SDValue Op,
Ops.push_back(DAG.getTargetGlobalAddress(GA->getGlobal(),
C ? SDLoc(C) : SDLoc(),
Op.getValueType(), Offs));
- return;
}
+ return;
}
if (C) { // just C, no GV.
// Simple constants are not allowed for 's'.
@@ -2217,8 +2229,8 @@ void TargetLowering::LowerAsmOperandForConstraint(SDValue Op,
// ScheduleDAGSDNodes::EmitNode, which is very generic.
Ops.push_back(DAG.getTargetConstant(C->getAPIntValue().getSExtValue(),
SDLoc(C), MVT::i64));
- return;
}
+ return;
}
break;
}
@@ -2227,7 +2239,7 @@ void TargetLowering::LowerAsmOperandForConstraint(SDValue Op,
std::pair<unsigned, const TargetRegisterClass *>
TargetLowering::getRegForInlineAsmConstraint(const TargetRegisterInfo *RI,
- const std::string &Constraint,
+ StringRef Constraint,
MVT VT) const {
if (Constraint.empty() || Constraint[0] != '{')
return std::make_pair(0u, static_cast<TargetRegisterClass*>(nullptr));
@@ -2293,7 +2305,8 @@ unsigned TargetLowering::AsmOperandInfo::getMatchedOperand() const {
/// If this returns an empty vector, and if the constraint string itself
/// isn't empty, there was an error parsing.
TargetLowering::AsmOperandInfoVector
-TargetLowering::ParseConstraints(const TargetRegisterInfo *TRI,
+TargetLowering::ParseConstraints(const DataLayout &DL,
+ const TargetRegisterInfo *TRI,
ImmutableCallSite CS) const {
/// ConstraintOperands - Information about all of the constraints.
AsmOperandInfoVector ConstraintOperands;
@@ -2329,10 +2342,11 @@ TargetLowering::ParseConstraints(const TargetRegisterInfo *TRI,
assert(!CS.getType()->isVoidTy() &&
"Bad inline asm!");
if (StructType *STy = dyn_cast<StructType>(CS.getType())) {
- OpInfo.ConstraintVT = getSimpleValueType(STy->getElementType(ResNo));
+ OpInfo.ConstraintVT =
+ getSimpleValueType(DL, STy->getElementType(ResNo));
} else {
assert(ResNo == 0 && "Asm only has one result!");
- OpInfo.ConstraintVT = getSimpleValueType(CS.getType());
+ OpInfo.ConstraintVT = getSimpleValueType(DL, CS.getType());
}
++ResNo;
break;
@@ -2361,7 +2375,7 @@ TargetLowering::ParseConstraints(const TargetRegisterInfo *TRI,
// If OpTy is not a single value, it may be a struct/union that we
// can tile with integers.
if (!OpTy->isSingleValueType() && OpTy->isSized()) {
- unsigned BitSize = getDataLayout()->getTypeSizeInBits(OpTy);
+ unsigned BitSize = DL.getTypeSizeInBits(OpTy);
switch (BitSize) {
default: break;
case 1:
@@ -2375,8 +2389,7 @@ TargetLowering::ParseConstraints(const TargetRegisterInfo *TRI,
break;
}
} else if (PointerType *PT = dyn_cast<PointerType>(OpTy)) {
- unsigned PtrSize
- = getDataLayout()->getPointerSizeInBits(PT->getAddressSpace());
+ unsigned PtrSize = DL.getPointerSizeInBits(PT->getAddressSpace());
OpInfo.ConstraintVT = MVT::getIntegerVT(PtrSize);
} else {
OpInfo.ConstraintVT = MVT::getVT(OpTy, true);
@@ -2684,7 +2697,8 @@ static SDValue BuildExactSDIV(const TargetLowering &TLI, SDValue Op1, APInt d,
if (ShAmt) {
// TODO: For UDIV use SRL instead of SRA.
SDValue Amt =
- DAG.getConstant(ShAmt, dl, TLI.getShiftAmountTy(Op1.getValueType()));
+ DAG.getConstant(ShAmt, dl, TLI.getShiftAmountTy(Op1.getValueType(),
+ DAG.getDataLayout()));
SDNodeFlags Flags;
Flags.setExact(true);
Op1 = DAG.getNode(ISD::SRA, dl, Op1.getValueType(), Op1, Amt, &Flags);
@@ -2750,17 +2764,19 @@ SDValue TargetLowering::BuildSDIV(SDNode *N, const APInt &Divisor,
Q = DAG.getNode(ISD::SUB, dl, VT, Q, N->getOperand(0));
Created->push_back(Q.getNode());
}
+ auto &DL = DAG.getDataLayout();
// Shift right algebraic if shift value is nonzero
if (magics.s > 0) {
- Q = DAG.getNode(ISD::SRA, dl, VT, Q,
- DAG.getConstant(magics.s, dl,
- getShiftAmountTy(Q.getValueType())));
+ Q = DAG.getNode(
+ ISD::SRA, dl, VT, Q,
+ DAG.getConstant(magics.s, dl, getShiftAmountTy(Q.getValueType(), DL)));
Created->push_back(Q.getNode());
}
// Extract the sign bit and add it to the quotient
- SDValue T = DAG.getNode(ISD::SRL, dl, VT, Q,
- DAG.getConstant(VT.getScalarSizeInBits() - 1, dl,
- getShiftAmountTy(Q.getValueType())));
+ SDValue T =
+ DAG.getNode(ISD::SRL, dl, VT, Q,
+ DAG.getConstant(VT.getScalarSizeInBits() - 1, dl,
+ getShiftAmountTy(Q.getValueType(), DL)));
Created->push_back(T.getNode());
return DAG.getNode(ISD::ADD, dl, VT, Q, T);
}
@@ -2776,6 +2792,7 @@ SDValue TargetLowering::BuildUDIV(SDNode *N, const APInt &Divisor,
EVT VT = N->getValueType(0);
SDLoc dl(N);
+ auto &DL = DAG.getDataLayout();
// Check to see if we can do this.
// FIXME: We should be more aggressive here.
@@ -2792,9 +2809,9 @@ SDValue TargetLowering::BuildUDIV(SDNode *N, const APInt &Divisor,
// the divided value upfront.
if (magics.a != 0 && !Divisor[0]) {
unsigned Shift = Divisor.countTrailingZeros();
- Q = DAG.getNode(ISD::SRL, dl, VT, Q,
- DAG.getConstant(Shift, dl,
- getShiftAmountTy(Q.getValueType())));
+ Q = DAG.getNode(
+ ISD::SRL, dl, VT, Q,
+ DAG.getConstant(Shift, dl, getShiftAmountTy(Q.getValueType(), DL)));
Created->push_back(Q.getNode());
// Get magic number for the shifted divisor.
@@ -2819,21 +2836,22 @@ SDValue TargetLowering::BuildUDIV(SDNode *N, const APInt &Divisor,
if (magics.a == 0) {
assert(magics.s < Divisor.getBitWidth() &&
"We shouldn't generate an undefined shift!");
- return DAG.getNode(ISD::SRL, dl, VT, Q,
- DAG.getConstant(magics.s, dl,
- getShiftAmountTy(Q.getValueType())));
+ return DAG.getNode(
+ ISD::SRL, dl, VT, Q,
+ DAG.getConstant(magics.s, dl, getShiftAmountTy(Q.getValueType(), DL)));
} else {
SDValue NPQ = DAG.getNode(ISD::SUB, dl, VT, N->getOperand(0), Q);
Created->push_back(NPQ.getNode());
- NPQ = DAG.getNode(ISD::SRL, dl, VT, NPQ,
- DAG.getConstant(1, dl,
- getShiftAmountTy(NPQ.getValueType())));
+ NPQ = DAG.getNode(
+ ISD::SRL, dl, VT, NPQ,
+ DAG.getConstant(1, dl, getShiftAmountTy(NPQ.getValueType(), DL)));
Created->push_back(NPQ.getNode());
NPQ = DAG.getNode(ISD::ADD, dl, VT, NPQ, Q);
Created->push_back(NPQ.getNode());
- return DAG.getNode(ISD::SRL, dl, VT, NPQ,
- DAG.getConstant(magics.s - 1, dl,
- getShiftAmountTy(NPQ.getValueType())));
+ return DAG.getNode(
+ ISD::SRL, dl, VT, NPQ,
+ DAG.getConstant(magics.s - 1, dl,
+ getShiftAmountTy(NPQ.getValueType(), DL)));
}
}
@@ -2919,8 +2937,9 @@ bool TargetLowering::expandMUL(SDNode *N, SDValue &Lo, SDValue &Hi, EVT HiLoVT,
if (!LH.getNode() && !RH.getNode() &&
isOperationLegalOrCustom(ISD::SRL, VT) &&
isOperationLegalOrCustom(ISD::TRUNCATE, HiLoVT)) {
+ auto &DL = DAG.getDataLayout();
unsigned ShiftAmt = VT.getSizeInBits() - HiLoVT.getSizeInBits();
- SDValue Shift = DAG.getConstant(ShiftAmt, dl, getShiftAmountTy(VT));
+ SDValue Shift = DAG.getConstant(ShiftAmt, dl, getShiftAmountTy(VT, DL));
LH = DAG.getNode(ISD::SRL, dl, VT, N->getOperand(0), Shift);
LH = DAG.getNode(ISD::TRUNCATE, dl, HiLoVT, LH);
RH = DAG.getNode(ISD::SRL, dl, VT, N->getOperand(1), Shift);
@@ -2980,14 +2999,15 @@ bool TargetLowering::expandFP_TO_SINT(SDNode *Node, SDValue &Result,
SDValue Bits = DAG.getNode(ISD::BITCAST, dl, IntVT, Node->getOperand(0));
- SDValue ExponentBits = DAG.getNode(ISD::SRL, dl, IntVT,
- DAG.getNode(ISD::AND, dl, IntVT, Bits, ExponentMask),
- DAG.getZExtOrTrunc(ExponentLoBit, dl, getShiftAmountTy(IntVT)));
+ auto &DL = DAG.getDataLayout();
+ SDValue ExponentBits = DAG.getNode(
+ ISD::SRL, dl, IntVT, DAG.getNode(ISD::AND, dl, IntVT, Bits, ExponentMask),
+ DAG.getZExtOrTrunc(ExponentLoBit, dl, getShiftAmountTy(IntVT, DL)));
SDValue Exponent = DAG.getNode(ISD::SUB, dl, IntVT, ExponentBits, Bias);
- SDValue Sign = DAG.getNode(ISD::SRA, dl, IntVT,
- DAG.getNode(ISD::AND, dl, IntVT, Bits, SignMask),
- DAG.getZExtOrTrunc(SignLowBit, dl, getShiftAmountTy(IntVT)));
+ SDValue Sign = DAG.getNode(
+ ISD::SRA, dl, IntVT, DAG.getNode(ISD::AND, dl, IntVT, Bits, SignMask),
+ DAG.getZExtOrTrunc(SignLowBit, dl, getShiftAmountTy(IntVT, DL)));
Sign = DAG.getSExtOrTrunc(Sign, dl, NVT);
SDValue R = DAG.getNode(ISD::OR, dl, IntVT,
@@ -2996,17 +3016,17 @@ bool TargetLowering::expandFP_TO_SINT(SDNode *Node, SDValue &Result,
R = DAG.getZExtOrTrunc(R, dl, NVT);
-
- R = DAG.getSelectCC(dl, Exponent, ExponentLoBit,
- DAG.getNode(ISD::SHL, dl, NVT, R,
- DAG.getZExtOrTrunc(
- DAG.getNode(ISD::SUB, dl, IntVT, Exponent, ExponentLoBit),
- dl, getShiftAmountTy(IntVT))),
- DAG.getNode(ISD::SRL, dl, NVT, R,
- DAG.getZExtOrTrunc(
- DAG.getNode(ISD::SUB, dl, IntVT, ExponentLoBit, Exponent),
- dl, getShiftAmountTy(IntVT))),
- ISD::SETGT);
+ R = DAG.getSelectCC(
+ dl, Exponent, ExponentLoBit,
+ DAG.getNode(ISD::SHL, dl, NVT, R,
+ DAG.getZExtOrTrunc(
+ DAG.getNode(ISD::SUB, dl, IntVT, Exponent, ExponentLoBit),
+ dl, getShiftAmountTy(IntVT, DL))),
+ DAG.getNode(ISD::SRL, dl, NVT, R,
+ DAG.getZExtOrTrunc(
+ DAG.getNode(ISD::SUB, dl, IntVT, ExponentLoBit, Exponent),
+ dl, getShiftAmountTy(IntVT, DL))),
+ ISD::SETGT);
SDValue Ret = DAG.getNode(ISD::SUB, dl, NVT,
DAG.getNode(ISD::XOR, dl, NVT, R, Sign),
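The BuildSDIV/BuildUDIV hunks above only thread the DataLayout into getShiftAmountTy; the transform itself is the usual divide-by-constant rewrite via a multiply-high, a shift, and a sign-bit correction. A self-contained illustration for signed division by 7 (the magic constant and shift are the values APInt::magic() would produce for this divisor; assumes arithmetic right shifts of negative values, which the ISD::SRA nodes make explicit in the real lowering):

    #include <cassert>
    #include <cstdint>
    #include <initializer_list>

    // Signed 32-bit division by the constant 7, written the way BuildSDIV
    // lowers it: MULHS, an ADD because the magic value is "negative", an SRA
    // by magics.s, then add the sign bit of the quotient.
    static int32_t sdivBy7(int32_t N) {
      const int64_t Magic = -1840700269;                  // 0x92492493 as signed i32
      int32_t Q = (int32_t)(((int64_t)N * Magic) >> 32);  // high half (MULHS)
      Q += N;                                             // correction for negative magic
      Q >>= 2;                                            // magics.s == 2 (SRA)
      Q += (uint32_t)Q >> 31;                             // add the sign bit of Q
      return Q;
    }

    int main() {
      for (int32_t N : {-100, -8, -7, -1, 0, 1, 6, 7, 8, 100})
        assert(sdivBy7(N) == N / 7);
      return 0;
    }

The real code differs only in that the shift amounts are materialized with getShiftAmountTy(VT, DL), which is what this hunk changes.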
diff --git a/lib/CodeGen/SelectionDAG/TargetSelectionDAGInfo.cpp b/lib/CodeGen/SelectionDAG/TargetSelectionDAGInfo.cpp
index 0e89bad5f26f..00db94256844 100644
--- a/lib/CodeGen/SelectionDAG/TargetSelectionDAGInfo.cpp
+++ b/lib/CodeGen/SelectionDAG/TargetSelectionDAGInfo.cpp
@@ -15,9 +15,5 @@
#include "llvm/Target/TargetMachine.h"
using namespace llvm;
-TargetSelectionDAGInfo::TargetSelectionDAGInfo(const DataLayout *DL)
- : DL(DL) {
-}
-
TargetSelectionDAGInfo::~TargetSelectionDAGInfo() {
}
diff --git a/lib/CodeGen/SjLjEHPrepare.cpp b/lib/CodeGen/SjLjEHPrepare.cpp
index 116fd5be0337..d236e1f5ab6f 100644
--- a/lib/CodeGen/SjLjEHPrepare.cpp
+++ b/lib/CodeGen/SjLjEHPrepare.cpp
@@ -45,7 +45,6 @@ STATISTIC(NumSpilled, "Number of registers live across unwind edges");
namespace {
class SjLjEHPrepare : public FunctionPass {
- const TargetMachine *TM;
Type *doubleUnderDataTy;
Type *doubleUnderJBufTy;
Type *FunctionContextTy;
@@ -63,7 +62,7 @@ class SjLjEHPrepare : public FunctionPass {
public:
static char ID; // Pass identification, replacement for typeid
- explicit SjLjEHPrepare(const TargetMachine *TM) : FunctionPass(ID), TM(TM) {}
+ explicit SjLjEHPrepare() : FunctionPass(ID) {}
bool doInitialization(Module &M) override;
bool runOnFunction(Function &F) override;
@@ -83,11 +82,11 @@ private:
} // end anonymous namespace
char SjLjEHPrepare::ID = 0;
+INITIALIZE_PASS(SjLjEHPrepare, "sjljehprepare", "Prepare SjLj exceptions",
+ false, false)
// Public Interface To the SjLjEHPrepare pass.
-FunctionPass *llvm::createSjLjEHPreparePass(const TargetMachine *TM) {
- return new SjLjEHPrepare(TM);
-}
+FunctionPass *llvm::createSjLjEHPreparePass() { return new SjLjEHPrepare(); }
// doInitialization - Set up declarations and types needed to process
// exceptions.
bool SjLjEHPrepare::doInitialization(Module &M) {
@@ -196,9 +195,8 @@ Value *SjLjEHPrepare::setupFunctionContext(Function &F,
// Create an alloca for the incoming jump buffer ptr and the new jump buffer
// that needs to be restored on all exits from the function. This is an alloca
// because the value needs to be added to the global context list.
- const TargetLowering *TLI = TM->getSubtargetImpl(F)->getTargetLowering();
- unsigned Align =
- TLI->getDataLayout()->getPrefTypeAlignment(FunctionContextTy);
+ auto &DL = F.getParent()->getDataLayout();
+ unsigned Align = DL.getPrefTypeAlignment(FunctionContextTy);
FuncCtx = new AllocaInst(FunctionContextTy, nullptr, Align, "fn_context",
EntryBB->begin());
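SjLjEHPrepare no longer needs a TargetMachine: the only thing it used it for was the preferred alignment of the function context, which is now read from the module's DataLayout. A minimal sketch of the new pattern, using only the post-change IR API:

    #include "llvm/IR/DataLayout.h"
    #include "llvm/IR/Function.h"
    #include "llvm/IR/Module.h"
    using namespace llvm;

    // Preferred alignment for a type, taken from the DataLayout of the module
    // that owns the function (no TargetLowering involved).
    static unsigned prefTypeAlign(const Function &F, Type *Ty) {
      const DataLayout &DL = F.getParent()->getDataLayout();
      return DL.getPrefTypeAlignment(Ty);
    }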
diff --git a/lib/CodeGen/StackMapLivenessAnalysis.cpp b/lib/CodeGen/StackMapLivenessAnalysis.cpp
index d88be575d56c..855058358fe4 100644
--- a/lib/CodeGen/StackMapLivenessAnalysis.cpp
+++ b/lib/CodeGen/StackMapLivenessAnalysis.cpp
@@ -49,7 +49,6 @@ namespace {
/// information provided by this pass is optional and not required by the
/// aforementioned intrinsic to function.
class StackMapLiveness : public MachineFunctionPass {
- MachineFunction *MF;
const TargetRegisterInfo *TRI;
LivePhysRegs LiveRegs;
@@ -68,14 +67,14 @@ public:
private:
/// \brief Performs the actual liveness calculation for the function.
- bool calculateLiveness();
+ bool calculateLiveness(MachineFunction &MF);
/// \brief Add the current register live set to the instruction.
- void addLiveOutSetToMI(MachineInstr &MI);
+ void addLiveOutSetToMI(MachineFunction &MF, MachineInstr &MI);
/// \brief Create a register mask and initialize it with the registers from
/// the register live set.
- uint32_t *createRegisterMask() const;
+ uint32_t *createRegisterMask(MachineFunction &MF) const;
};
} // namespace
@@ -95,8 +94,7 @@ void StackMapLiveness::getAnalysisUsage(AnalysisUsage &AU) const {
// We preserve all information.
AU.setPreservesAll();
AU.setPreservesCFG();
- // Default dependencie for all MachineFunction passes.
- AU.addRequired<MachineFunctionAnalysis>();
+ MachineFunctionPass::getAnalysisUsage(AU);
}
/// Calculate the liveness information for the given machine function.
@@ -106,7 +104,6 @@ bool StackMapLiveness::runOnMachineFunction(MachineFunction &MF) {
DEBUG(dbgs() << "********** COMPUTING STACKMAP LIVENESS: " << MF.getName()
<< " **********\n");
- this->MF = &MF;
TRI = MF.getSubtarget().getRegisterInfo();
++NumStackMapFuncVisited;
@@ -115,25 +112,23 @@ bool StackMapLiveness::runOnMachineFunction(MachineFunction &MF) {
++NumStackMapFuncSkipped;
return false;
}
- return calculateLiveness();
+ return calculateLiveness(MF);
}
/// Performs the actual liveness calculation for the function.
-bool StackMapLiveness::calculateLiveness() {
+bool StackMapLiveness::calculateLiveness(MachineFunction &MF) {
bool HasChanged = false;
// For all basic blocks in the function.
- for (MachineFunction::iterator MBBI = MF->begin(), MBBE = MF->end();
- MBBI != MBBE; ++MBBI) {
- DEBUG(dbgs() << "****** BB " << MBBI->getName() << " ******\n");
+ for (auto &MBB : MF) {
+ DEBUG(dbgs() << "****** BB " << MBB.getName() << " ******\n");
LiveRegs.init(TRI);
- LiveRegs.addLiveOuts(MBBI);
+ LiveRegs.addLiveOuts(&MBB);
bool HasStackMap = false;
// Reverse iterate over all instructions and add the current live register
// set to an instruction if we encounter a patchpoint instruction.
- for (MachineBasicBlock::reverse_iterator I = MBBI->rbegin(),
- E = MBBI->rend(); I != E; ++I) {
+ for (auto I = MBB.rbegin(), E = MBB.rend(); I != E; ++I) {
if (I->getOpcode() == TargetOpcode::PATCHPOINT) {
- addLiveOutSetToMI(*I);
+ addLiveOutSetToMI(MF, *I);
HasChanged = true;
HasStackMap = true;
++NumStackMaps;
@@ -149,21 +144,23 @@ bool StackMapLiveness::calculateLiveness() {
}
/// Add the current register live set to the instruction.
-void StackMapLiveness::addLiveOutSetToMI(MachineInstr &MI) {
- uint32_t *Mask = createRegisterMask();
+void StackMapLiveness::addLiveOutSetToMI(MachineFunction &MF,
+ MachineInstr &MI) {
+ uint32_t *Mask = createRegisterMask(MF);
MachineOperand MO = MachineOperand::CreateRegLiveOut(Mask);
- MI.addOperand(*MF, MO);
+ MI.addOperand(MF, MO);
}
/// Create a register mask and initialize it with the registers from the
/// register live set.
-uint32_t *StackMapLiveness::createRegisterMask() const {
+uint32_t *StackMapLiveness::createRegisterMask(MachineFunction &MF) const {
// The mask is owned and cleaned up by the Machine Function.
- uint32_t *Mask = MF->allocateRegisterMask(TRI->getNumRegs());
- for (LivePhysRegs::const_iterator RI = LiveRegs.begin(), RE = LiveRegs.end();
- RI != RE; ++RI)
- Mask[*RI / 32] |= 1U << (*RI % 32);
+ uint32_t *Mask = MF.allocateRegisterMask(TRI->getNumRegs());
+ for (auto Reg : LiveRegs)
+ Mask[Reg / 32] |= 1U << (Reg % 32);
+ // Give the target a chance to adjust the mask.
TRI->adjustStackMapLiveOutMask(Mask);
+
return Mask;
}
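The refactor above passes the MachineFunction explicitly instead of stashing it in a member; the mask itself remains a plain bit set with one bit per physical register. A self-contained illustration of the packing used by createRegisterMask:

    #include <cstdint>
    #include <vector>

    // One bit per physical register, 32 registers per uint32_t word, the same
    // layout as the register masks attached to PATCHPOINT instructions.
    static std::vector<uint32_t> buildRegMask(const std::vector<unsigned> &LiveRegs,
                                              unsigned NumRegs) {
      std::vector<uint32_t> Mask((NumRegs + 31) / 32, 0);
      for (unsigned Reg : LiveRegs)
        Mask[Reg / 32] |= 1u << (Reg % 32);
      return Mask;
    }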
diff --git a/lib/CodeGen/StackMaps.cpp b/lib/CodeGen/StackMaps.cpp
index 1e8e03f9a7df..116eef66c580 100644
--- a/lib/CodeGen/StackMaps.cpp
+++ b/lib/CodeGen/StackMaps.cpp
@@ -29,17 +29,17 @@ using namespace llvm;
#define DEBUG_TYPE "stackmaps"
-static cl::opt<int> StackMapVersion("stackmap-version", cl::init(1),
- cl::desc("Specify the stackmap encoding version (default = 1)"));
+static cl::opt<int> StackMapVersion(
+ "stackmap-version", cl::init(1),
+ cl::desc("Specify the stackmap encoding version (default = 1)"));
const char *StackMaps::WSMP = "Stack Maps: ";
PatchPointOpers::PatchPointOpers(const MachineInstr *MI)
- : MI(MI),
- HasDef(MI->getOperand(0).isReg() && MI->getOperand(0).isDef() &&
- !MI->getOperand(0).isImplicit()),
- IsAnyReg(MI->getOperand(getMetaIdx(CCPos)).getImm() == CallingConv::AnyReg)
-{
+ : MI(MI), HasDef(MI->getOperand(0).isReg() && MI->getOperand(0).isDef() &&
+ !MI->getOperand(0).isImplicit()),
+ IsAnyReg(MI->getOperand(getMetaIdx(CCPos)).getImm() ==
+ CallingConv::AnyReg) {
#ifndef NDEBUG
unsigned CheckStartIdx = 0, e = MI->getNumOperands();
while (CheckStartIdx < e && MI->getOperand(CheckStartIdx).isReg() &&
@@ -76,30 +76,31 @@ StackMaps::StackMaps(AsmPrinter &AP) : AP(AP) {
/// Go up the super-register chain until we hit a valid dwarf register number.
static unsigned getDwarfRegNum(unsigned Reg, const TargetRegisterInfo *TRI) {
- int RegNo = TRI->getDwarfRegNum(Reg, false);
- for (MCSuperRegIterator SR(Reg, TRI); SR.isValid() && RegNo < 0; ++SR)
- RegNo = TRI->getDwarfRegNum(*SR, false);
+ int RegNum = TRI->getDwarfRegNum(Reg, false);
+ for (MCSuperRegIterator SR(Reg, TRI); SR.isValid() && RegNum < 0; ++SR)
+ RegNum = TRI->getDwarfRegNum(*SR, false);
- assert(RegNo >= 0 && "Invalid Dwarf register number.");
- return (unsigned) RegNo;
+ assert(RegNum >= 0 && "Invalid Dwarf register number.");
+ return (unsigned)RegNum;
}
MachineInstr::const_mop_iterator
StackMaps::parseOperand(MachineInstr::const_mop_iterator MOI,
- MachineInstr::const_mop_iterator MOE,
- LocationVec &Locs, LiveOutVec &LiveOuts) const {
+ MachineInstr::const_mop_iterator MOE, LocationVec &Locs,
+ LiveOutVec &LiveOuts) const {
const TargetRegisterInfo *TRI = AP.MF->getSubtarget().getRegisterInfo();
if (MOI->isImm()) {
switch (MOI->getImm()) {
- default: llvm_unreachable("Unrecognized operand type.");
+ default:
+ llvm_unreachable("Unrecognized operand type.");
case StackMaps::DirectMemRefOp: {
unsigned Size = AP.TM.getDataLayout()->getPointerSizeInBits();
assert((Size % 8) == 0 && "Need pointer size in bytes.");
Size /= 8;
unsigned Reg = (++MOI)->getReg();
int64_t Imm = (++MOI)->getImm();
- Locs.push_back(Location(StackMaps::Location::Direct, Size,
- getDwarfRegNum(Reg, TRI), Imm));
+ Locs.emplace_back(StackMaps::Location::Direct, Size,
+ getDwarfRegNum(Reg, TRI), Imm);
break;
}
case StackMaps::IndirectMemRefOp: {
@@ -107,15 +108,15 @@ StackMaps::parseOperand(MachineInstr::const_mop_iterator MOI,
assert(Size > 0 && "Need a valid size for indirect memory locations.");
unsigned Reg = (++MOI)->getReg();
int64_t Imm = (++MOI)->getImm();
- Locs.push_back(Location(StackMaps::Location::Indirect, Size,
- getDwarfRegNum(Reg, TRI), Imm));
+ Locs.emplace_back(StackMaps::Location::Indirect, Size,
+ getDwarfRegNum(Reg, TRI), Imm);
break;
}
case StackMaps::ConstantOp: {
++MOI;
assert(MOI->isImm() && "Expected constant operand.");
int64_t Imm = MOI->getImm();
- Locs.push_back(Location(Location::Constant, sizeof(int64_t), 0, Imm));
+ Locs.emplace_back(Location::Constant, sizeof(int64_t), 0, Imm);
break;
}
}
@@ -137,14 +138,13 @@ StackMaps::parseOperand(MachineInstr::const_mop_iterator MOI,
assert(!MOI->getSubReg() && "Physical subreg still around.");
unsigned Offset = 0;
- unsigned RegNo = getDwarfRegNum(MOI->getReg(), TRI);
- unsigned LLVMRegNo = TRI->getLLVMRegNum(RegNo, false);
- unsigned SubRegIdx = TRI->getSubRegIndex(LLVMRegNo, MOI->getReg());
+ unsigned DwarfRegNum = getDwarfRegNum(MOI->getReg(), TRI);
+ unsigned LLVMRegNum = TRI->getLLVMRegNum(DwarfRegNum, false);
+ unsigned SubRegIdx = TRI->getSubRegIndex(LLVMRegNum, MOI->getReg());
if (SubRegIdx)
Offset = TRI->getSubRegIdxOffset(SubRegIdx);
- Locs.push_back(
- Location(Location::Register, RC->getSize(), RegNo, Offset));
+ Locs.emplace_back(Location::Register, RC->getSize(), DwarfRegNum, Offset);
return ++MOI;
}
@@ -165,19 +165,19 @@ void StackMaps::print(raw_ostream &OS) {
OS << WSMP << "callsite " << CSI.ID << "\n";
OS << WSMP << " has " << CSLocs.size() << " locations\n";
- unsigned OperIdx = 0;
+ unsigned Idx = 0;
for (const auto &Loc : CSLocs) {
- OS << WSMP << " Loc " << OperIdx << ": ";
- switch (Loc.LocType) {
+ OS << WSMP << "\t\tLoc " << Idx << ": ";
+ switch (Loc.Type) {
case Location::Unprocessed:
OS << "<Unprocessed operand>";
break;
case Location::Register:
OS << "Register ";
- if (TRI)
- OS << TRI->getName(Loc.Reg);
- else
- OS << Loc.Reg;
+ if (TRI)
+ OS << TRI->getName(Loc.Reg);
+ else
+ OS << Loc.Reg;
break;
case Location::Direct:
OS << "Direct ";
@@ -203,23 +203,23 @@ void StackMaps::print(raw_ostream &OS) {
OS << "Constant Index " << Loc.Offset;
break;
}
- OS << " [encoding: .byte " << Loc.LocType << ", .byte " << Loc.Size
+ OS << "\t[encoding: .byte " << Loc.Type << ", .byte " << Loc.Size
<< ", .short " << Loc.Reg << ", .int " << Loc.Offset << "]\n";
- OperIdx++;
+ Idx++;
}
- OS << WSMP << " has " << LiveOuts.size() << " live-out registers\n";
+ OS << WSMP << "\thas " << LiveOuts.size() << " live-out registers\n";
- OperIdx = 0;
+ Idx = 0;
for (const auto &LO : LiveOuts) {
- OS << WSMP << " LO " << OperIdx << ": ";
+ OS << WSMP << "\t\tLO " << Idx << ": ";
if (TRI)
OS << TRI->getName(LO.Reg);
else
OS << LO.Reg;
- OS << " [encoding: .short " << LO.RegNo << ", .byte 0, .byte "
+ OS << "\t[encoding: .short " << LO.DwarfRegNum << ", .byte 0, .byte "
<< LO.Size << "]\n";
- OperIdx++;
+ Idx++;
}
}
}
@@ -227,9 +227,9 @@ void StackMaps::print(raw_ostream &OS) {
/// Create a live-out register record for the given register Reg.
StackMaps::LiveOutReg
StackMaps::createLiveOutReg(unsigned Reg, const TargetRegisterInfo *TRI) const {
- unsigned RegNo = getDwarfRegNum(Reg, TRI);
+ unsigned DwarfRegNum = getDwarfRegNum(Reg, TRI);
unsigned Size = TRI->getMinimalPhysRegClass(Reg)->getSize();
- return LiveOutReg(Reg, RegNo, Size);
+ return LiveOutReg(Reg, DwarfRegNum, Size);
}
/// Parse the register live-out mask and return a vector of live-out registers
@@ -248,11 +248,16 @@ StackMaps::parseRegisterLiveOutMask(const uint32_t *Mask) const {
// We don't need to keep track of a register if its super-register is already
// in the list. Merge entries that refer to the same dwarf register and use
// the maximum size that needs to be spilled.
- std::sort(LiveOuts.begin(), LiveOuts.end());
- for (LiveOutVec::iterator I = LiveOuts.begin(), E = LiveOuts.end();
- I != E; ++I) {
- for (LiveOutVec::iterator II = std::next(I); II != E; ++II) {
- if (I->RegNo != II->RegNo) {
+
+ std::sort(LiveOuts.begin(), LiveOuts.end(),
+ [](const LiveOutReg &LHS, const LiveOutReg &RHS) {
+ // Only sort by the dwarf register number.
+ return LHS.DwarfRegNum < RHS.DwarfRegNum;
+ });
+
+ for (auto I = LiveOuts.begin(), E = LiveOuts.end(); I != E; ++I) {
+ for (auto II = std::next(I); II != E; ++II) {
+ if (I->DwarfRegNum != II->DwarfRegNum) {
// Skip all the now invalid entries.
I = --II;
break;
@@ -260,11 +265,15 @@ StackMaps::parseRegisterLiveOutMask(const uint32_t *Mask) const {
I->Size = std::max(I->Size, II->Size);
if (TRI->isSuperRegister(I->Reg, II->Reg))
I->Reg = II->Reg;
- II->MarkInvalid();
+ II->Reg = 0; // mark for deletion.
}
}
- LiveOuts.erase(std::remove_if(LiveOuts.begin(), LiveOuts.end(),
- LiveOutReg::IsInvalid), LiveOuts.end());
+
+ LiveOuts.erase(
+ std::remove_if(LiveOuts.begin(), LiveOuts.end(),
+ [](const LiveOutReg &LO) { return LO.Reg == 0; }),
+ LiveOuts.end());
+
return LiveOuts;
}
@@ -282,8 +291,8 @@ void StackMaps::recordStackMapOpers(const MachineInstr &MI, uint64_t ID,
if (recordResult) {
assert(PatchPointOpers(&MI).hasDef() && "Stackmap has no return value.");
- parseOperand(MI.operands_begin(), std::next(MI.operands_begin()),
- Locations, LiveOuts);
+ parseOperand(MI.operands_begin(), std::next(MI.operands_begin()), Locations,
+ LiveOuts);
}
// Parse operands.
@@ -292,33 +301,31 @@ void StackMaps::recordStackMapOpers(const MachineInstr &MI, uint64_t ID,
}
// Move large constants into the constant pool.
- for (LocationVec::iterator I = Locations.begin(), E = Locations.end();
- I != E; ++I) {
+ for (auto &Loc : Locations) {
// Constants are encoded as sign-extended integers.
// -1 is directly encoded as .long 0xFFFFFFFF with no constant pool.
- if (I->LocType == Location::Constant && !isInt<32>(I->Offset)) {
- I->LocType = Location::ConstantIndex;
+ if (Loc.Type == Location::Constant && !isInt<32>(Loc.Offset)) {
+ Loc.Type = Location::ConstantIndex;
// ConstPool is intentionally a MapVector of 'uint64_t's (as
// opposed to 'int64_t's). We should never be in a situation
// where we have to insert either the tombstone or the empty
// keys into a map, and for a DenseMap<uint64_t, T> these are
// (uint64_t)0 and (uint64_t)-1. They can be and are
// represented using 32 bit integers.
-
- assert((uint64_t)I->Offset != DenseMapInfo<uint64_t>::getEmptyKey() &&
- (uint64_t)I->Offset != DenseMapInfo<uint64_t>::getTombstoneKey() &&
+ assert((uint64_t)Loc.Offset != DenseMapInfo<uint64_t>::getEmptyKey() &&
+ (uint64_t)Loc.Offset !=
+ DenseMapInfo<uint64_t>::getTombstoneKey() &&
"empty and tombstone keys should fit in 32 bits!");
- auto Result = ConstPool.insert(std::make_pair(I->Offset, I->Offset));
- I->Offset = Result.first - ConstPool.begin();
+ auto Result = ConstPool.insert(std::make_pair(Loc.Offset, Loc.Offset));
+ Loc.Offset = Result.first - ConstPool.begin();
}
}
// Create an expression to calculate the offset of the callsite from function
// entry.
const MCExpr *CSOffsetExpr = MCBinaryExpr::createSub(
- MCSymbolRefExpr::create(MILabel, OutContext),
- MCSymbolRefExpr::create(AP.CurrentFnSymForSize, OutContext),
- OutContext);
+ MCSymbolRefExpr::create(MILabel, OutContext),
+ MCSymbolRefExpr::create(AP.CurrentFnSymForSize, OutContext), OutContext);
CSInfos.emplace_back(CSOffsetExpr, ID, std::move(Locations),
std::move(LiveOuts));
@@ -326,10 +333,10 @@ void StackMaps::recordStackMapOpers(const MachineInstr &MI, uint64_t ID,
// Record the stack size of the current function.
const MachineFrameInfo *MFI = AP.MF->getFrameInfo();
const TargetRegisterInfo *RegInfo = AP.MF->getSubtarget().getRegisterInfo();
- const bool DynamicFrameSize = MFI->hasVarSizedObjects() ||
- RegInfo->needsStackRealignment(*(AP.MF));
+ bool HasDynamicFrameSize =
+ MFI->hasVarSizedObjects() || RegInfo->needsStackRealignment(*(AP.MF));
FnStackSize[AP.CurrentFnSym] =
- DynamicFrameSize ? UINT64_MAX : MFI->getStackSize();
+ HasDynamicFrameSize ? UINT64_MAX : MFI->getStackSize();
}
void StackMaps::recordStackMap(const MachineInstr &MI) {
@@ -346,25 +353,23 @@ void StackMaps::recordPatchPoint(const MachineInstr &MI) {
PatchPointOpers opers(&MI);
int64_t ID = opers.getMetaOper(PatchPointOpers::IDPos).getImm();
- MachineInstr::const_mop_iterator MOI =
- std::next(MI.operands_begin(), opers.getStackMapStartIdx());
+ auto MOI = std::next(MI.operands_begin(), opers.getStackMapStartIdx());
recordStackMapOpers(MI, ID, MOI, MI.operands_end(),
opers.isAnyReg() && opers.hasDef());
#ifndef NDEBUG
// verify anyregcc
- LocationVec &Locations = CSInfos.back().Locations;
+ auto &Locations = CSInfos.back().Locations;
if (opers.isAnyReg()) {
unsigned NArgs = opers.getMetaOper(PatchPointOpers::NArgPos).getImm();
- for (unsigned i = 0, e = (opers.hasDef() ? NArgs+1 : NArgs); i != e; ++i)
- assert(Locations[i].LocType == Location::Register &&
+ for (unsigned i = 0, e = (opers.hasDef() ? NArgs + 1 : NArgs); i != e; ++i)
+ assert(Locations[i].Type == Location::Register &&
"anyreg arg must be in reg.");
}
#endif
}
void StackMaps::recordStatepoint(const MachineInstr &MI) {
- assert(MI.getOpcode() == TargetOpcode::STATEPOINT &&
- "expected statepoint");
+ assert(MI.getOpcode() == TargetOpcode::STATEPOINT && "expected statepoint");
StatepointOpers opers(&MI);
// Record all the deopt and gc operands (they're contiguous and run from the
@@ -387,8 +392,8 @@ void StackMaps::recordStatepoint(const MachineInstr &MI) {
void StackMaps::emitStackmapHeader(MCStreamer &OS) {
// Header.
OS.EmitIntValue(StackMapVersion, 1); // Version.
- OS.EmitIntValue(0, 1); // Reserved.
- OS.EmitIntValue(0, 2); // Reserved.
+ OS.EmitIntValue(0, 1); // Reserved.
+ OS.EmitIntValue(0, 2); // Reserved.
// Num functions.
DEBUG(dbgs() << WSMP << "#functions = " << FnStackSize.size() << '\n');
@@ -412,7 +417,7 @@ void StackMaps::emitFunctionFrameRecords(MCStreamer &OS) {
DEBUG(dbgs() << WSMP << "functions:\n");
for (auto const &FR : FnStackSize) {
DEBUG(dbgs() << WSMP << "function addr: " << FR.first
- << " frame size: " << FR.second);
+ << " frame size: " << FR.second);
OS.EmitSymbolValue(FR.first, 8);
OS.EmitIntValue(FR.second, 8);
}
@@ -424,7 +429,7 @@ void StackMaps::emitFunctionFrameRecords(MCStreamer &OS) {
void StackMaps::emitConstantPoolEntries(MCStreamer &OS) {
// Constant pool entries.
DEBUG(dbgs() << WSMP << "constants:\n");
- for (auto ConstEntry : ConstPool) {
+ for (const auto &ConstEntry : ConstPool) {
DEBUG(dbgs() << WSMP << ConstEntry.second << '\n');
OS.EmitIntValue(ConstEntry.second, 8);
}
@@ -489,7 +494,7 @@ void StackMaps::emitCallsiteEntries(MCStreamer &OS) {
OS.EmitIntValue(CSLocs.size(), 2);
for (const auto &Loc : CSLocs) {
- OS.EmitIntValue(Loc.LocType, 1);
+ OS.EmitIntValue(Loc.Type, 1);
OS.EmitIntValue(Loc.Size, 1);
OS.EmitIntValue(Loc.Reg, 2);
OS.EmitIntValue(Loc.Offset, 4);
@@ -500,7 +505,7 @@ void StackMaps::emitCallsiteEntries(MCStreamer &OS) {
OS.EmitIntValue(LiveOuts.size(), 2);
for (const auto &LO : LiveOuts) {
- OS.EmitIntValue(LO.RegNo, 2);
+ OS.EmitIntValue(LO.DwarfRegNum, 2);
OS.EmitIntValue(0, 1);
OS.EmitIntValue(LO.Size, 1);
}
@@ -511,7 +516,7 @@ void StackMaps::emitCallsiteEntries(MCStreamer &OS) {
/// Serialize the stackmap data.
void StackMaps::serializeToStackMapSection() {
- (void) WSMP;
+ (void)WSMP;
// Bail out if there's no stack map data.
assert((!CSInfos.empty() || (CSInfos.empty() && ConstPool.empty())) &&
"Expected empty constant pool too!");
diff --git a/lib/CodeGen/StackProtector.cpp b/lib/CodeGen/StackProtector.cpp
index 0824d6f91db0..bcea37a3aafa 100644
--- a/lib/CodeGen/StackProtector.cpp
+++ b/lib/CodeGen/StackProtector.cpp
@@ -122,7 +122,7 @@ bool StackProtector::ContainsProtectableArray(Type *Ty, bool &IsLarge,
// If an array has more than SSPBufferSize bytes of allocated space, then we
// emit stack protectors.
- if (SSPBufferSize <= TLI->getDataLayout()->getTypeAllocSize(AT)) {
+ if (SSPBufferSize <= M->getDataLayout().getTypeAllocSize(AT)) {
IsLarge = true;
return true;
}
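The stack-protector heuristic now reads the type size from the module's DataLayout rather than through TargetLowering. A small sketch of the check as it reads after the change (names and threshold follow the surrounding code; this is an illustration, not the pass itself):

    #include "llvm/IR/DataLayout.h"
    #include "llvm/IR/DerivedTypes.h"
    #include "llvm/IR/Module.h"
    using namespace llvm;

    // True when the array's in-memory footprint reaches the
    // -stack-protector-buffer-size threshold, so the containing frame
    // gets a stack protector.
    static bool isLargeProtectableArray(const Module &M, ArrayType *AT,
                                        unsigned SSPBufferSize) {
      return M.getDataLayout().getTypeAllocSize(AT) >= SSPBufferSize;
    }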
diff --git a/lib/CodeGen/TargetFrameLoweringImpl.cpp b/lib/CodeGen/TargetFrameLoweringImpl.cpp
index 56383247eadb..f3cccd82a5c5 100644
--- a/lib/CodeGen/TargetFrameLoweringImpl.cpp
+++ b/lib/CodeGen/TargetFrameLoweringImpl.cpp
@@ -11,9 +11,12 @@
//
//===----------------------------------------------------------------------===//
+#include "llvm/ADT/BitVector.h"
#include "llvm/Target/TargetFrameLowering.h"
#include "llvm/CodeGen/MachineFrameInfo.h"
#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineModuleInfo.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/IR/Function.h"
#include "llvm/Target/TargetRegisterInfo.h"
#include "llvm/Target/TargetSubtargetInfo.h"
@@ -54,3 +57,30 @@ bool TargetFrameLowering::needsFrameIndexResolution(
const MachineFunction &MF) const {
return MF.getFrameInfo()->hasStackObjects();
}
+
+void TargetFrameLowering::determineCalleeSaves(MachineFunction &MF,
+ BitVector &SavedRegs,
+ RegScavenger *RS) const {
+ // Get the callee saved register list...
+ const TargetRegisterInfo &TRI = *MF.getSubtarget().getRegisterInfo();
+ const MCPhysReg *CSRegs = TRI.getCalleeSavedRegs(&MF);
+
+ // Early exit if there are no callee saved registers.
+ if (!CSRegs || CSRegs[0] == 0)
+ return;
+
+ SavedRegs.resize(TRI.getNumRegs());
+
+ // In Naked functions we aren't going to save any registers.
+ if (MF.getFunction()->hasFnAttribute(Attribute::Naked))
+ return;
+
+ // Functions which call __builtin_unwind_init get all their registers saved.
+ bool CallsUnwindInit = MF.getMMI().callsUnwindInit();
+ const MachineRegisterInfo &MRI = MF.getRegInfo();
+ for (unsigned i = 0; CSRegs[i]; ++i) {
+ unsigned Reg = CSRegs[i];
+ if (CallsUnwindInit || MRI.isPhysRegModified(Reg))
+ SavedRegs.set(Reg);
+ }
+}
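determineCalleeSaves gains a default implementation here: resize the BitVector, bail out for naked functions, and mark every callee-saved register that is modified (or all of them when __builtin_unwind_init is called). Targets are expected to override it and then extend the set; a hedged sketch of that pattern, with an entirely hypothetical target:

    // XYZFrameLowering, XYZ::FramePtr and hasFP() are placeholders, not real
    // LLVM entities; the shape of the override is the point.
    void XYZFrameLowering::determineCalleeSaves(MachineFunction &MF,
                                                BitVector &SavedRegs,
                                                RegScavenger *RS) const {
      // Start from the generic "modified callee-saved registers" answer.
      TargetFrameLowering::determineCalleeSaves(MF, SavedRegs, RS);
      // Then force-save anything the target additionally needs, e.g. the
      // frame pointer when one is required.
      if (hasFP(MF))
        SavedRegs.set(XYZ::FramePtr);
    }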
diff --git a/lib/CodeGen/TargetLoweringBase.cpp b/lib/CodeGen/TargetLoweringBase.cpp
index 78492a6e8818..ecfd65931574 100644
--- a/lib/CodeGen/TargetLoweringBase.cpp
+++ b/lib/CodeGen/TargetLoweringBase.cpp
@@ -750,7 +750,6 @@ TargetLoweringBase::TargetLoweringBase(const TargetMachine &tm) : TM(tm) {
initActions();
// Perform these initializations only once.
- IsLittleEndian = getDataLayout()->isLittleEndian();
MaxStoresPerMemset = MaxStoresPerMemcpy = MaxStoresPerMemmove = 8;
MaxStoresPerMemsetOptSize = MaxStoresPerMemcpyOptSize
= MaxStoresPerMemmoveOptSize = 4;
@@ -879,28 +878,17 @@ void TargetLoweringBase::initActions() {
setOperationAction(ISD::DEBUGTRAP, MVT::Other, Expand);
}
-MVT TargetLoweringBase::getPointerTy(uint32_t AS) const {
- return MVT::getIntegerVT(getPointerSizeInBits(AS));
+MVT TargetLoweringBase::getScalarShiftAmountTy(const DataLayout &DL,
+ EVT) const {
+ return MVT::getIntegerVT(8 * DL.getPointerSize(0));
}
-unsigned TargetLoweringBase::getPointerSizeInBits(uint32_t AS) const {
- return getDataLayout()->getPointerSizeInBits(AS);
-}
-
-unsigned TargetLoweringBase::getPointerTypeSizeInBits(Type *Ty) const {
- assert(Ty->isPointerTy());
- return getPointerSizeInBits(Ty->getPointerAddressSpace());
-}
-
-MVT TargetLoweringBase::getScalarShiftAmountTy(EVT LHSTy) const {
- return MVT::getIntegerVT(8 * getDataLayout()->getPointerSize(0));
-}
-
-EVT TargetLoweringBase::getShiftAmountTy(EVT LHSTy) const {
+EVT TargetLoweringBase::getShiftAmountTy(EVT LHSTy,
+ const DataLayout &DL) const {
assert(LHSTy.isInteger() && "Shift amount is not an integer type!");
if (LHSTy.isVector())
return LHSTy;
- return getScalarShiftAmountTy(LHSTy);
+ return getScalarShiftAmountTy(DL, LHSTy);
}
/// canOpTrap - Returns true if the operation can trap for the value type.
@@ -1398,9 +1386,10 @@ void TargetLoweringBase::computeRegisterProperties(
}
}
-EVT TargetLoweringBase::getSetCCResultType(LLVMContext &, EVT VT) const {
+EVT TargetLoweringBase::getSetCCResultType(const DataLayout &DL, LLVMContext &,
+ EVT VT) const {
assert(!VT.isVector() && "No default SetCC type for vectors!");
- return getPointerTy(0).SimpleTy;
+ return getPointerTy(DL).SimpleTy;
}
MVT::SimpleValueType TargetLoweringBase::getCmpLibcallReturnType() const {
@@ -1485,11 +1474,11 @@ unsigned TargetLoweringBase::getVectorTypeBreakdown(LLVMContext &Context, EVT VT
/// type of the given function. This does not require a DAG or a return value,
/// and is suitable for use before any DAGs for the function are constructed.
/// TODO: Move this out of TargetLowering.cpp.
-void llvm::GetReturnInfo(Type* ReturnType, AttributeSet attr,
+void llvm::GetReturnInfo(Type *ReturnType, AttributeSet attr,
SmallVectorImpl<ISD::OutputArg> &Outs,
- const TargetLowering &TLI) {
+ const TargetLowering &TLI, const DataLayout &DL) {
SmallVector<EVT, 4> ValueVTs;
- ComputeValueVTs(TLI, ReturnType, ValueVTs);
+ ComputeValueVTs(TLI, DL, ReturnType, ValueVTs);
unsigned NumValues = ValueVTs.size();
if (NumValues == 0) return;
@@ -1534,8 +1523,9 @@ void llvm::GetReturnInfo(Type* ReturnType, AttributeSet attr,
/// getByValTypeAlignment - Return the desired alignment for ByVal aggregate
/// function arguments in the caller parameter area. This is the actual
/// alignment, not its logarithm.
-unsigned TargetLoweringBase::getByValTypeAlignment(Type *Ty) const {
- return getDataLayout()->getABITypeAlignment(Ty);
+unsigned TargetLoweringBase::getByValTypeAlignment(Type *Ty,
+ const DataLayout &DL) const {
+ return DL.getABITypeAlignment(Ty);
}
//===----------------------------------------------------------------------===//
@@ -1614,9 +1604,10 @@ int TargetLoweringBase::InstructionOpcodeToISD(unsigned Opcode) const {
}
std::pair<unsigned, MVT>
-TargetLoweringBase::getTypeLegalizationCost(Type *Ty) const {
+TargetLoweringBase::getTypeLegalizationCost(const DataLayout &DL,
+ Type *Ty) const {
LLVMContext &C = Ty->getContext();
- EVT MTy = getValueType(Ty);
+ EVT MTy = getValueType(DL, Ty);
unsigned Cost = 1;
// We keep legalizing the type until we find a legal kind. We assume that
@@ -1642,8 +1633,8 @@ TargetLoweringBase::getTypeLegalizationCost(Type *Ty) const {
/// isLegalAddressingMode - Return true if the addressing mode represented
/// by AM is legal for this target, for a load/store of the specified type.
-bool TargetLoweringBase::isLegalAddressingMode(const AddrMode &AM,
- Type *Ty,
+bool TargetLoweringBase::isLegalAddressingMode(const DataLayout &DL,
+ const AddrMode &AM, Type *Ty,
unsigned AS) const {
// The default implementation of this implements a conservative RISCy, r+r and
// r+i addr mode.
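The TargetLoweringBase hunks remove the cached DataLayout accessors (getPointerTy(AS), getPointerSizeInBits, and friends) and instead take a DataLayout parameter in every hook that needs one. For an out-of-tree target the mechanical consequence is a signature update like the following sketch (XYZTargetLowering is a placeholder):

    // A target whose shift amounts are always i32; the DataLayout parameter
    // is accepted to match the new hook signature even though it is unused.
    MVT XYZTargetLowering::getScalarShiftAmountTy(const DataLayout &,
                                                  EVT) const {
      return MVT::i32;
    }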
diff --git a/lib/CodeGen/TwoAddressInstructionPass.cpp b/lib/CodeGen/TwoAddressInstructionPass.cpp
index e84bea63995e..1e30821dc741 100644
--- a/lib/CodeGen/TwoAddressInstructionPass.cpp
+++ b/lib/CodeGen/TwoAddressInstructionPass.cpp
@@ -1215,11 +1215,11 @@ tryInstructionTransform(MachineBasicBlock::iterator &mi,
// addl %esi, %edi
// movl %edi, %eax
// ret
- bool commuted = false;
+ bool Commuted = false;
// If it's profitable to commute, try to do so.
if (TryCommute && commuteInstruction(mi, regB, regC, Dist)) {
- commuted = true;
+ Commuted = true;
++NumCommuted;
if (AggressiveCommute)
++NumAggrCommuted;
@@ -1232,7 +1232,7 @@ tryInstructionTransform(MachineBasicBlock::iterator &mi,
// If there is one more use of regB later in the same MBB, consider
// re-schedule this MI below it.
- if (!commuted && EnableRescheduling && rescheduleMIBelowKill(mi, nmi, regB)) {
+ if (!Commuted && EnableRescheduling && rescheduleMIBelowKill(mi, nmi, regB)) {
++NumReSchedDowns;
return true;
}
@@ -1250,7 +1250,7 @@ tryInstructionTransform(MachineBasicBlock::iterator &mi,
}
// Return if it is commuted but 3 addr conversion is failed.
- if (commuted)
+ if (Commuted)
return false;
// If there is one more use of regB later in the same MBB, consider
diff --git a/lib/CodeGen/VirtRegMap.cpp b/lib/CodeGen/VirtRegMap.cpp
index 2912bdd63426..02341b4d66b8 100644
--- a/lib/CodeGen/VirtRegMap.cpp
+++ b/lib/CodeGen/VirtRegMap.cpp
@@ -163,7 +163,6 @@ class VirtRegRewriter : public MachineFunctionPass {
SlotIndexes *Indexes;
LiveIntervals *LIS;
VirtRegMap *VRM;
- SparseSet<unsigned> PhysRegs;
void rewrite();
void addMBBLiveIns();
@@ -319,54 +318,15 @@ void VirtRegRewriter::rewrite() {
SmallVector<unsigned, 8> SuperDeads;
SmallVector<unsigned, 8> SuperDefs;
SmallVector<unsigned, 8> SuperKills;
- SmallPtrSet<const MachineInstr *, 4> NoReturnInsts;
-
- // Here we have a SparseSet to hold which PhysRegs are actually encountered
- // in the MF we are about to iterate over so that later when we call
- // setPhysRegUsed, we are only doing it for physRegs that were actually found
- // in the program and not for all of the possible physRegs for the given
- // target architecture. If the target has a lot of physRegs, then for a small
- // program there will be a significant compile time reduction here.
- PhysRegs.clear();
- PhysRegs.setUniverse(TRI->getNumRegs());
-
- // The function with uwtable should guarantee that the stack unwinder
- // can unwind the stack to the previous frame. Thus, we can't apply the
- // noreturn optimization if the caller function has uwtable attribute.
- bool HasUWTable = MF->getFunction()->hasFnAttribute(Attribute::UWTable);
for (MachineFunction::iterator MBBI = MF->begin(), MBBE = MF->end();
MBBI != MBBE; ++MBBI) {
DEBUG(MBBI->print(dbgs(), Indexes));
- bool IsExitBB = MBBI->succ_empty();
for (MachineBasicBlock::instr_iterator
MII = MBBI->instr_begin(), MIE = MBBI->instr_end(); MII != MIE;) {
MachineInstr *MI = MII;
++MII;
- // Check if this instruction is a call to a noreturn function. If this
- // is a call to noreturn function and we don't need the stack unwinding
- // functionality (i.e. this function does not have uwtable attribute and
- // the callee function has the nounwind attribute), then we can ignore
- // the definitions set by this instruction.
- if (!HasUWTable && IsExitBB && MI->isCall()) {
- for (MachineInstr::mop_iterator MOI = MI->operands_begin(),
- MOE = MI->operands_end(); MOI != MOE; ++MOI) {
- MachineOperand &MO = *MOI;
- if (!MO.isGlobal())
- continue;
- const Function *Func = dyn_cast<Function>(MO.getGlobal());
- if (!Func || !Func->hasFnAttribute(Attribute::NoReturn) ||
- // We need to keep correct unwind information
- // even if the function will not return, since the
- // runtime may need it.
- !Func->hasFnAttribute(Attribute::NoUnwind))
- continue;
- NoReturnInsts.insert(MI);
- break;
- }
- }
-
for (MachineInstr::mop_iterator MOI = MI->operands_begin(),
MOE = MI->operands_end(); MOI != MOE; ++MOI) {
MachineOperand &MO = *MOI;
@@ -375,15 +335,6 @@ void VirtRegRewriter::rewrite() {
if (MO.isRegMask())
MRI->addPhysRegsUsedFromRegMask(MO.getRegMask());
- // If we encounter a VirtReg or PhysReg then get at the PhysReg and add
- // it to the physreg bitset. Later we use only the PhysRegs that were
- // actually encountered in the MF to populate the MRI's used physregs.
- if (MO.isReg() && MO.getReg())
- PhysRegs.insert(
- TargetRegisterInfo::isVirtualRegister(MO.getReg()) ?
- VRM->getPhys(MO.getReg()) :
- MO.getReg());
-
if (!MO.isReg() || !TargetRegisterInfo::isVirtualRegister(MO.getReg()))
continue;
unsigned VirtReg = MO.getReg();
@@ -470,29 +421,5 @@ void VirtRegRewriter::rewrite() {
}
}
}
-
- // Tell MRI about physical registers in use.
- if (NoReturnInsts.empty()) {
- for (SparseSet<unsigned>::iterator
- RegI = PhysRegs.begin(), E = PhysRegs.end(); RegI != E; ++RegI)
- if (!MRI->reg_nodbg_empty(*RegI))
- MRI->setPhysRegUsed(*RegI);
- } else {
- for (SparseSet<unsigned>::iterator
- I = PhysRegs.begin(), E = PhysRegs.end(); I != E; ++I) {
- unsigned Reg = *I;
- if (MRI->reg_nodbg_empty(Reg))
- continue;
- // Check if this register has a use that will impact the rest of the
- // code. Uses in debug and noreturn instructions do not impact the
- // generated code.
- for (MachineInstr &It : MRI->reg_nodbg_instructions(Reg)) {
- if (!NoReturnInsts.count(&It)) {
- MRI->setPhysRegUsed(Reg);
- break;
- }
- }
- }
- }
}
diff --git a/lib/CodeGen/WinEHPrepare.cpp b/lib/CodeGen/WinEHPrepare.cpp
index dbc0d91a01e2..0d26ed333ca7 100644
--- a/lib/CodeGen/WinEHPrepare.cpp
+++ b/lib/CodeGen/WinEHPrepare.cpp
@@ -155,7 +155,7 @@ private:
// outlined but before the outlined code is pruned from the parent function.
DenseMap<const BasicBlock *, BasicBlock *> LPadTargetBlocks;
- // Map from outlined handler to call to llvm.frameaddress(1). Only used for
+ // Map from outlined handler to call to parent local address. Only used for
// 32-bit EH.
DenseMap<Function *, Value *> HandlerToParentFP;
@@ -533,9 +533,9 @@ void WinEHPrepare::findSEHEHReturnPoints(
BasicBlock *NextBB;
Constant *Selector;
if (isSelectorDispatch(BB, CatchHandler, Selector, NextBB)) {
- // Split the edge if there is a phi node. Returning from EH to a phi node
- // is just as impossible as having a phi after an indirectbr.
- if (isa<PHINode>(CatchHandler->begin())) {
+ // Split the edge if there are multiple predecessors. This creates a place
+ // where we can insert EH recovery code.
+ if (!CatchHandler->getSinglePredecessor()) {
DEBUG(dbgs() << "splitting EH return edge from " << BB->getName()
<< " to " << CatchHandler->getName() << '\n');
BBI = CatchHandler = SplitCriticalEdge(
@@ -616,6 +616,26 @@ void WinEHPrepare::demoteValuesLiveAcrossHandlers(
// identifyEHBlocks() should have been called before this function.
assert(!NormalBlocks.empty());
+ // Try to avoid demoting EH pointer and selector values. They get in the way
+ // of our pattern matching.
+ SmallPtrSet<Instruction *, 10> EHVals;
+ for (BasicBlock &BB : F) {
+ LandingPadInst *LP = BB.getLandingPadInst();
+ if (!LP)
+ continue;
+ EHVals.insert(LP);
+ for (User *U : LP->users()) {
+ auto *EI = dyn_cast<ExtractValueInst>(U);
+ if (!EI)
+ continue;
+ EHVals.insert(EI);
+ for (User *U2 : EI->users()) {
+ if (auto *PN = dyn_cast<PHINode>(U2))
+ EHVals.insert(PN);
+ }
+ }
+ }
+
SetVector<Argument *> ArgsToDemote;
SetVector<Instruction *> InstrsToDemote;
for (BasicBlock &BB : F) {
@@ -641,7 +661,11 @@ void WinEHPrepare::demoteValuesLiveAcrossHandlers(
continue;
}
+ // Don't demote EH values.
auto *OpI = cast<Instruction>(Op);
+ if (EHVals.count(OpI))
+ continue;
+
BasicBlock *OpBB = OpI->getParent();
// If a value is produced and consumed in the same BB, we don't need to
// demote it.
@@ -822,7 +846,8 @@ bool WinEHPrepare::prepareExceptionHandlers(
LPad->replaceAllUsesWith(UndefValue::get(LPad->getType()));
// Rewrite uses of the exception pointer to loads of an alloca.
- for (Instruction *E : SEHCodeUses) {
+ while (!SEHCodeUses.empty()) {
+ Instruction *E = SEHCodeUses.pop_back_val();
SmallVector<Use *, 4> Uses;
for (Use &U : E->uses())
Uses.push_back(&U);
@@ -830,13 +855,10 @@ bool WinEHPrepare::prepareExceptionHandlers(
auto *I = cast<Instruction>(U->getUser());
if (isa<ResumeInst>(I))
continue;
- LoadInst *LI;
if (auto *Phi = dyn_cast<PHINode>(I))
- LI = new LoadInst(SEHExceptionCodeSlot, "sehcode", false,
- Phi->getIncomingBlock(*U));
+ SEHCodeUses.push_back(Phi);
else
- LI = new LoadInst(SEHExceptionCodeSlot, "sehcode", false, I);
- U->set(LI);
+ U->set(new LoadInst(SEHExceptionCodeSlot, "sehcode", false, I));
}
E->replaceAllUsesWith(UndefValue::get(E->getType()));
E->eraseFromParent();
@@ -953,16 +975,16 @@ bool WinEHPrepare::prepareExceptionHandlers(
Builder.SetInsertPoint(Entry->getFirstInsertionPt());
Function *FrameEscapeFn =
- Intrinsic::getDeclaration(M, Intrinsic::frameescape);
+ Intrinsic::getDeclaration(M, Intrinsic::localescape);
Function *RecoverFrameFn =
- Intrinsic::getDeclaration(M, Intrinsic::framerecover);
+ Intrinsic::getDeclaration(M, Intrinsic::localrecover);
SmallVector<Value *, 8> AllocasToEscape;
- // Scan the entry block for an existing call to llvm.frameescape. We need to
+ // Scan the entry block for an existing call to llvm.localescape. We need to
// keep escaping those objects.
for (Instruction &I : F.front()) {
auto *II = dyn_cast<IntrinsicInst>(&I);
- if (II && II->getIntrinsicID() == Intrinsic::frameescape) {
+ if (II && II->getIntrinsicID() == Intrinsic::localescape) {
auto Args = II->arg_operands();
AllocasToEscape.append(Args.begin(), Args.end());
II->eraseFromParent();
@@ -971,7 +993,7 @@ bool WinEHPrepare::prepareExceptionHandlers(
}
// Finally, replace all of the temporary allocas for frame variables used in
- // the outlined handlers with calls to llvm.framerecover.
+ // the outlined handlers with calls to llvm.localrecover.
for (auto &VarInfoEntry : FrameVarInfo) {
Value *ParentVal = VarInfoEntry.first;
TinyPtrVector<AllocaInst *> &Allocas = VarInfoEntry.second;
@@ -992,7 +1014,7 @@ bool WinEHPrepare::prepareExceptionHandlers(
llvm::Value *FP = HandlerToParentFP[HandlerFn];
assert(FP);
- // FIXME: Sink this framerecover into the blocks where it is used.
+ // FIXME: Sink this localrecover into the blocks where it is used.
Builder.SetInsertPoint(TempAlloca);
Builder.SetCurrentDebugLocation(TempAlloca->getDebugLoc());
Value *RecoverArgs[] = {
@@ -1014,7 +1036,7 @@ bool WinEHPrepare::prepareExceptionHandlers(
}
} // End for each FrameVarInfo entry.
- // Insert 'call void (...)* @llvm.frameescape(...)' at the end of the entry
+ // Insert 'call void (...)* @llvm.localescape(...)' at the end of the entry
// block.
Builder.SetInsertPoint(&F.getEntryBlock().back());
Builder.CreateCall(FrameEscapeFn, AllocasToEscape);
@@ -1595,9 +1617,8 @@ void LandingPadMap::remapEHValues(ValueToValueMapTy &VMap, Value *EHPtrValue,
VMap[Extract] = SelectorValue;
}
-static bool isFrameAddressCall(const Value *V) {
- return match(const_cast<Value *>(V),
- m_Intrinsic<Intrinsic::frameaddress>(m_SpecificInt(0)));
+static bool isLocalAddressCall(const Value *V) {
+ return match(const_cast<Value *>(V), m_Intrinsic<Intrinsic::localaddress>());
}
CloningDirector::CloningAction WinEHCloningDirectorBase::handleInstruction(
@@ -1639,9 +1660,9 @@ CloningDirector::CloningAction WinEHCloningDirectorBase::handleInstruction(
if (match(Inst, m_Intrinsic<Intrinsic::eh_typeid_for>()))
return handleTypeIdFor(VMap, Inst, NewBB);
- // When outlining llvm.frameaddress(i32 0), remap that to the second argument,
+ // When outlining llvm.localaddress(), remap that to the second argument,
// which is the FP of the parent.
- if (isFrameAddressCall(Inst)) {
+ if (isLocalAddressCall(Inst)) {
VMap[Inst] = ParentFP;
return CloningDirector::SkipInstruction;
}
@@ -1961,7 +1982,7 @@ Value *WinEHFrameVariableMaterializer::materializeValueFor(Value *V) {
// If we're asked to materialize a static alloca, we temporarily create an
// alloca in the outlined function and add this to the FrameVarInfo map. When
// all the outlining is complete, we'll replace these temporary allocas with
- // calls to llvm.framerecover.
+ // calls to llvm.localrecover.
if (auto *AV = dyn_cast<AllocaInst>(V)) {
assert(AV->isStaticAlloca() &&
"cannot materialize un-demoted dynamic alloca");
@@ -1991,7 +2012,7 @@ void WinEHFrameVariableMaterializer::escapeCatchObject(Value *V) {
// of a catch parameter, add a sentinel to the multimap to indicate that it's
// used from another handler. This will prevent us from trying to sink the
// alloca into the handler and ensure that the catch parameter is present in
- // the call to llvm.frameescape.
+ // the call to llvm.localescape.
FrameVarInfo[V].push_back(getCatchObjectSentinel());
}
@@ -2233,16 +2254,16 @@ static void createCleanupHandler(LandingPadActions &Actions,
static CallSite matchOutlinedFinallyCall(BasicBlock *BB,
Instruction *MaybeCall) {
// Look for finally blocks that Clang has already outlined for us.
- // %fp = call i8* @llvm.frameaddress(i32 0)
+ // %fp = call i8* @llvm.localaddress()
// call void @"fin$parent"(iN 1, i8* %fp)
- if (isFrameAddressCall(MaybeCall) && MaybeCall != BB->getTerminator())
+ if (isLocalAddressCall(MaybeCall) && MaybeCall != BB->getTerminator())
MaybeCall = MaybeCall->getNextNode();
CallSite FinallyCall(MaybeCall);
if (!FinallyCall || FinallyCall.arg_size() != 2)
return CallSite();
if (!match(FinallyCall.getArgument(0), m_SpecificInt(1)))
return CallSite();
- if (!isFrameAddressCall(FinallyCall.getArgument(1)))
+ if (!isLocalAddressCall(FinallyCall.getArgument(1)))
return CallSite();
return FinallyCall;
}
diff --git a/lib/DebugInfo/DWARF/DWARFContext.cpp b/lib/DebugInfo/DWARF/DWARFContext.cpp
index c25ddad33b76..96bcf15e0af0 100644
--- a/lib/DebugInfo/DWARF/DWARFContext.cpp
+++ b/lib/DebugInfo/DWARF/DWARFContext.cpp
@@ -677,7 +677,13 @@ DWARFContextInMemory::DWARFContextInMemory(const object::ObjectFile &Obj,
// First calculate the address of the symbol or section as it appears
// in the object file
if (Sym != Obj.symbol_end()) {
- Sym->getAddress(SymAddr);
+ ErrorOr<uint64_t> SymAddrOrErr = Sym->getAddress();
+ if (std::error_code EC = SymAddrOrErr.getError()) {
+ errs() << "error: failed to compute symbol address: "
+ << EC.message() << '\n';
+ continue;
+ }
+ SymAddr = *SymAddrOrErr;
// Also remember what section this symbol is in for later
Sym->getSection(RSec);
} else if (auto *MObj = dyn_cast<MachOObjectFile>(&Obj)) {
diff --git a/lib/ExecutionEngine/IntelJITEvents/CMakeLists.txt b/lib/ExecutionEngine/IntelJITEvents/CMakeLists.txt
index 348308897dc4..331d2141b0e2 100644
--- a/lib/ExecutionEngine/IntelJITEvents/CMakeLists.txt
+++ b/lib/ExecutionEngine/IntelJITEvents/CMakeLists.txt
@@ -3,4 +3,6 @@ include_directories( ${CMAKE_CURRENT_SOURCE_DIR}/.. )
add_llvm_library(LLVMIntelJITEvents
IntelJITEventListener.cpp
jitprofiling.c
- )
+
+ LINK_LIBS pthread ${CMAKE_DL_LIBS}
+)
diff --git a/lib/ExecutionEngine/IntelJITEvents/IntelJITEventListener.cpp b/lib/ExecutionEngine/IntelJITEvents/IntelJITEventListener.cpp
index 907144007fdd..a131763193c0 100644
--- a/lib/ExecutionEngine/IntelJITEvents/IntelJITEventListener.cpp
+++ b/lib/ExecutionEngine/IntelJITEvents/IntelJITEventListener.cpp
@@ -113,63 +113,59 @@ void IntelJITEventListener::NotifyObjectEmitted(
std::vector<LineNumberInfo> LineInfo;
std::string SourceFileName;
- if (Sym.getType() == SymbolRef::ST_Function) {
- ErrorOr<StringRef> Name = Sym.getName();
- if (!Name)
- continue;
-
- uint64_t Addr;
- if (Sym.getAddress(Addr))
- continue;
- uint64_t Size = P.second;
-
- // Record this address in a local vector
- Functions.push_back((void*)Addr);
-
- // Build the function loaded notification message
- iJIT_Method_Load FunctionMessage =
- FunctionDescToIntelJITFormat(*Wrapper, Name->data(), Addr, Size);
- if (Context) {
- DILineInfoTable Lines = Context->getLineInfoForAddressRange(Addr, Size);
- DILineInfoTable::iterator Begin = Lines.begin();
- DILineInfoTable::iterator End = Lines.end();
- for (DILineInfoTable::iterator It = Begin; It != End; ++It) {
- LineInfo.push_back(DILineInfoToIntelJITFormat((uintptr_t)Addr,
- It->first,
- It->second));
- }
- if (LineInfo.size() == 0) {
- FunctionMessage.source_file_name = 0;
- FunctionMessage.line_number_size = 0;
- FunctionMessage.line_number_table = 0;
- } else {
- // Source line information for the address range is provided as
- // a code offset for the start of the corresponding sub-range and
- // a source line. JIT API treats offsets in LineNumberInfo structures
- // as the end of the corresponding code region. The start of the code
- // is taken from the previous element. Need to shift the elements.
-
- LineNumberInfo last = LineInfo.back();
- last.Offset = FunctionMessage.method_size;
- LineInfo.push_back(last);
- for (size_t i = LineInfo.size() - 2; i > 0; --i)
- LineInfo[i].LineNumber = LineInfo[i - 1].LineNumber;
-
- SourceFileName = Lines.front().second.FileName;
- FunctionMessage.source_file_name = const_cast<char *>(SourceFileName.c_str());
- FunctionMessage.line_number_size = LineInfo.size();
- FunctionMessage.line_number_table = &*LineInfo.begin();
- }
- } else {
- FunctionMessage.source_file_name = 0;
- FunctionMessage.line_number_size = 0;
- FunctionMessage.line_number_table = 0;
- }
-
- Wrapper->iJIT_NotifyEvent(iJVM_EVENT_TYPE_METHOD_LOAD_FINISHED,
- &FunctionMessage);
- MethodIDs[(void*)Addr] = FunctionMessage.method_id;
+ if (Sym.getType() != SymbolRef::ST_Function)
+ continue;
+
+ ErrorOr<StringRef> Name = Sym.getName();
+ if (!Name)
+ continue;
+
+ ErrorOr<uint64_t> AddrOrErr = Sym.getAddress();
+ if (AddrOrErr.getError())
+ continue;
+ uint64_t Addr = *AddrOrErr;
+ uint64_t Size = P.second;
+
+ // Record this address in a local vector
+ Functions.push_back((void*)Addr);
+
+ // Build the function loaded notification message
+ iJIT_Method_Load FunctionMessage =
+ FunctionDescToIntelJITFormat(*Wrapper, Name->data(), Addr, Size);
+ DILineInfoTable Lines = Context->getLineInfoForAddressRange(Addr, Size);
+ DILineInfoTable::iterator Begin = Lines.begin();
+ DILineInfoTable::iterator End = Lines.end();
+ for (DILineInfoTable::iterator It = Begin; It != End; ++It) {
+ LineInfo.push_back(
+ DILineInfoToIntelJITFormat((uintptr_t)Addr, It->first, It->second));
}
+ if (LineInfo.size() == 0) {
+ FunctionMessage.source_file_name = 0;
+ FunctionMessage.line_number_size = 0;
+ FunctionMessage.line_number_table = 0;
+ } else {
+ // Source line information for the address range is provided as
+ // a code offset for the start of the corresponding sub-range and
+ // a source line. JIT API treats offsets in LineNumberInfo structures
+ // as the end of the corresponding code region. The start of the code
+ // is taken from the previous element. Need to shift the elements.
+
+ LineNumberInfo last = LineInfo.back();
+ last.Offset = FunctionMessage.method_size;
+ LineInfo.push_back(last);
+ for (size_t i = LineInfo.size() - 2; i > 0; --i)
+ LineInfo[i].LineNumber = LineInfo[i - 1].LineNumber;
+
+ SourceFileName = Lines.front().second.FileName;
+ FunctionMessage.source_file_name =
+ const_cast<char *>(SourceFileName.c_str());
+ FunctionMessage.line_number_size = LineInfo.size();
+ FunctionMessage.line_number_table = &*LineInfo.begin();
+ }
+
+ Wrapper->iJIT_NotifyEvent(iJVM_EVENT_TYPE_METHOD_LOAD_FINISHED,
+ &FunctionMessage);
+ MethodIDs[(void*)Addr] = FunctionMessage.method_id;
}
// To support object unload notification, we need to keep a list of
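For reference, a minimal standalone sketch of the offset shift performed above; Entry and shiftForJITAPI are stand-ins for LineNumberInfo and the loop in NotifyObjectEmitted and are not part of this patch. The Intel JIT API treats each Offset as the end of its code range, so the table gains one extra entry ending at method_size and every LineNumber moves down by one slot.

#include <cstdint>
#include <vector>

struct Entry { uint64_t Offset; unsigned LineNumber; }; // stand-in for LineNumberInfo

static void shiftForJITAPI(std::vector<Entry> &LineInfo, uint64_t MethodSize) {
  if (LineInfo.empty())
    return;
  Entry Last = LineInfo.back();
  Last.Offset = MethodSize;           // close the last range at the method end
  LineInfo.push_back(Last);
  // Each Offset now marks the end of its range; the line numbers shift down.
  for (size_t I = LineInfo.size() - 2; I > 0; --I)
    LineInfo[I].LineNumber = LineInfo[I - 1].LineNumber;
}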
diff --git a/lib/ExecutionEngine/IntelJITEvents/LLVMBuild.txt b/lib/ExecutionEngine/IntelJITEvents/LLVMBuild.txt
index 1247cbd94930..afea3ecccda4 100644
--- a/lib/ExecutionEngine/IntelJITEvents/LLVMBuild.txt
+++ b/lib/ExecutionEngine/IntelJITEvents/LLVMBuild.txt
@@ -21,4 +21,4 @@
type = OptionalLibrary
name = IntelJITEvents
parent = ExecutionEngine
-required_libraries = Core DebugInfoDWARF Support
+required_libraries = Core DebugInfoDWARF Support Object ExecutionEngine
diff --git a/lib/ExecutionEngine/OProfileJIT/LLVMBuild.txt b/lib/ExecutionEngine/OProfileJIT/LLVMBuild.txt
index e30516eb3b01..7d5550046a56 100644
--- a/lib/ExecutionEngine/OProfileJIT/LLVMBuild.txt
+++ b/lib/ExecutionEngine/OProfileJIT/LLVMBuild.txt
@@ -21,3 +21,4 @@
type = OptionalLibrary
name = OProfileJIT
parent = ExecutionEngine
+required_libraries = Support Object ExecutionEngine
diff --git a/lib/ExecutionEngine/OProfileJIT/OProfileJITEventListener.cpp b/lib/ExecutionEngine/OProfileJIT/OProfileJITEventListener.cpp
index b72033805269..324d07118704 100644
--- a/lib/ExecutionEngine/OProfileJIT/OProfileJITEventListener.cpp
+++ b/lib/ExecutionEngine/OProfileJIT/OProfileJITEventListener.cpp
@@ -88,24 +88,27 @@ void OProfileJITEventListener::NotifyObjectEmitted(
// Use symbol info to iterate functions in the object.
for (const std::pair<SymbolRef, uint64_t> &P : computeSymbolSizes(DebugObj)) {
SymbolRef Sym = P.first;
- if (Sym.getType() == SymbolRef::ST_Function) {
- StringRef Name;
- uint64_t Addr;
- if (Sym.getName(Name))
- continue;
- if (Sym.getAddress(Addr))
- continue;
- uint64_t Size = P.second;
-
- if (Wrapper->op_write_native_code(Name.data(), Addr, (void*)Addr, Size)
- == -1) {
- DEBUG(dbgs() << "Failed to tell OProfile about native function "
- << Name << " at ["
- << (void*)Addr << "-" << ((char*)Addr + Size) << "]\n");
- continue;
- }
- // TODO: support line number info (similar to IntelJITEventListener.cpp)
+ if (Sym.getType() != SymbolRef::ST_Function)
+ continue;
+
+ ErrorOr<StringRef> NameOrErr = Sym.getName();
+ if (NameOrErr.getError())
+ continue;
+ StringRef Name = *NameOrErr;
+ ErrorOr<uint64_t> AddrOrErr = Sym.getAddress();
+ if (AddrOrErr.getError())
+ continue;
+ uint64_t Addr = *AddrOrErr;
+ uint64_t Size = P.second;
+
+ if (Wrapper->op_write_native_code(Name.data(), Addr, (void *)Addr, Size) ==
+ -1) {
+ DEBUG(dbgs() << "Failed to tell OProfile about native function " << Name
+ << " at [" << (void *)Addr << "-" << ((char *)Addr + Size)
+ << "]\n");
+ continue;
}
+ // TODO: support line number info (similar to IntelJITEventListener.cpp)
}
DebugObjects[Obj.getData().data()] = std::move(DebugObjOwner);
@@ -126,8 +129,10 @@ void OProfileJITEventListener::NotifyFreeingObject(const ObjectFile &Obj) {
E = DebugObj.symbol_end();
I != E; ++I) {
if (I->getType() == SymbolRef::ST_Function) {
- uint64_t Addr;
- if (I->getAddress(Addr)) continue;
+ ErrorOr<uint64_t> AddrOrErr = I->getAddress();
+ if (AddrOrErr.getError())
+ continue;
+ uint64_t Addr = *AddrOrErr;
if (Wrapper->op_unload_native_code(Addr) == -1) {
DEBUG(dbgs()
diff --git a/lib/ExecutionEngine/RuntimeDyld/RuntimeDyld.cpp b/lib/ExecutionEngine/RuntimeDyld/RuntimeDyld.cpp
index fa501824e04a..93287a3a4e71 100644
--- a/lib/ExecutionEngine/RuntimeDyld/RuntimeDyld.cpp
+++ b/lib/ExecutionEngine/RuntimeDyld/RuntimeDyld.cpp
@@ -113,28 +113,12 @@ void RuntimeDyldImpl::mapSectionAddress(const void *LocalAddress,
llvm_unreachable("Attempting to remap address of unknown section!");
}
-static std::error_code getOffset(const SymbolRef &Sym, uint64_t &Result) {
- uint64_t Address;
- if (std::error_code EC = Sym.getAddress(Address))
+static std::error_code getOffset(const SymbolRef &Sym, SectionRef Sec,
+ uint64_t &Result) {
+ ErrorOr<uint64_t> AddressOrErr = Sym.getAddress();
+ if (std::error_code EC = AddressOrErr.getError())
return EC;
-
- if (Address == UnknownAddress) {
- Result = UnknownAddress;
- return std::error_code();
- }
-
- const ObjectFile *Obj = Sym.getObject();
- section_iterator SecI(Obj->section_begin());
- if (std::error_code EC = Sym.getSection(SecI))
- return EC;
-
- if (SecI == Obj->section_end()) {
- Result = UnknownAddress;
- return std::error_code();
- }
-
- uint64_t SectionAddress = SecI->getAddress();
- Result = Address - SectionAddress;
+ Result = *AddressOrErr - Sec.getAddress();
return std::error_code();
}
@@ -184,12 +168,12 @@ RuntimeDyldImpl::loadObjectImpl(const object::ObjectFile &Obj) {
ErrorOr<StringRef> NameOrErr = I->getName();
Check(NameOrErr.getError());
StringRef Name = *NameOrErr;
- uint64_t SectOffset;
- Check(getOffset(*I, SectOffset));
section_iterator SI = Obj.section_end();
Check(I->getSection(SI));
if (SI == Obj.section_end())
continue;
+ uint64_t SectOffset;
+ Check(getOffset(*I, *SI, SectOffset));
StringRef SectionData;
Check(SI->getContents(SectionData));
bool IsCode = SI->isText();
@@ -814,12 +798,16 @@ void RuntimeDyldImpl::resolveExternalSymbols() {
report_fatal_error("Program used external function '" + Name +
"' which could not be resolved!");
- DEBUG(dbgs() << "Resolving relocations Name: " << Name << "\t"
- << format("0x%lx", Addr) << "\n");
- // This list may have been updated when we called getSymbolAddress, so
- // don't change this code to get the list earlier.
- RelocationList &Relocs = i->second;
- resolveRelocationList(Relocs, Addr);
+ // If Resolver returned UINT64_MAX, the client wants to handle this symbol
+ // manually and we shouldn't resolve its relocations.
+ if (Addr != UINT64_MAX) {
+ DEBUG(dbgs() << "Resolving relocations Name: " << Name << "\t"
+ << format("0x%lx", Addr) << "\n");
+ // This list may have been updated when we called getSymbolAddress, so
+ // don't change this code to get the list earlier.
+ RelocationList &Relocs = i->second;
+ resolveRelocationList(Relocs, Addr);
+ }
}
ExternalSymbolRelocations.erase(i);
diff --git a/lib/ExecutionEngine/RuntimeDyld/RuntimeDyldChecker.cpp b/lib/ExecutionEngine/RuntimeDyld/RuntimeDyldChecker.cpp
index 957571b092da..ae199b720223 100644
--- a/lib/ExecutionEngine/RuntimeDyld/RuntimeDyldChecker.cpp
+++ b/lib/ExecutionEngine/RuntimeDyld/RuntimeDyldChecker.cpp
@@ -727,7 +727,9 @@ bool RuntimeDyldCheckerImpl::checkAllRulesInBuffer(StringRef RulePrefix,
}
bool RuntimeDyldCheckerImpl::isSymbolValid(StringRef Symbol) const {
- return getRTDyld().getSymbolLocalAddress(Symbol) != nullptr;
+ if (getRTDyld().getSymbolLocalAddress(Symbol))
+ return true;
+ return !!getRTDyld().Resolver.findSymbol(Symbol);
}
uint64_t RuntimeDyldCheckerImpl::getSymbolLocalAddr(StringRef Symbol) const {
diff --git a/lib/ExecutionEngine/RuntimeDyld/RuntimeDyldELF.cpp b/lib/ExecutionEngine/RuntimeDyld/RuntimeDyldELF.cpp
index f5069c005857..3787950b3b08 100644
--- a/lib/ExecutionEngine/RuntimeDyld/RuntimeDyldELF.cpp
+++ b/lib/ExecutionEngine/RuntimeDyld/RuntimeDyldELF.cpp
@@ -511,11 +511,54 @@ void RuntimeDyldELF::resolveMIPSRelocation(const SectionEntry &Section,
Insn |= Value & 0xffff;
writeBytesUnaligned(Insn, TargetPtr, 4);
break;
- case ELF::R_MIPS_PC32:
+ case ELF::R_MIPS_PC32: {
+ uint32_t FinalAddress = (Section.LoadAddress + Offset);
+ writeBytesUnaligned(Value - FinalAddress, (uint8_t *)TargetPtr, 4);
+ break;
+ }
+ case ELF::R_MIPS_PC16: {
+ uint32_t FinalAddress = (Section.LoadAddress + Offset);
+ Insn &= 0xffff0000;
+ Insn |= ((Value - FinalAddress) >> 2) & 0xffff;
+ writeBytesUnaligned(Insn, TargetPtr, 4);
+ break;
+ }
+ case ELF::R_MIPS_PC19_S2: {
+ uint32_t FinalAddress = (Section.LoadAddress + Offset);
+ Insn &= 0xfff80000;
+ Insn |= ((Value - (FinalAddress & ~0x3)) >> 2) & 0x7ffff;
+ writeBytesUnaligned(Insn, TargetPtr, 4);
+ break;
+ }
+ case ELF::R_MIPS_PC21_S2: {
+ uint32_t FinalAddress = (Section.LoadAddress + Offset);
+ Insn &= 0xffe00000;
+ Insn |= ((Value - FinalAddress) >> 2) & 0x1fffff;
+ writeBytesUnaligned(Insn, TargetPtr, 4);
+ break;
+ }
+ case ELF::R_MIPS_PC26_S2: {
+ uint32_t FinalAddress = (Section.LoadAddress + Offset);
+ Insn &= 0xfc000000;
+ Insn |= ((Value - FinalAddress) >> 2) & 0x3ffffff;
+ writeBytesUnaligned(Insn, TargetPtr, 4);
+ break;
+ }
+ case ELF::R_MIPS_PCHI16: {
uint32_t FinalAddress = (Section.LoadAddress + Offset);
- writeBytesUnaligned(Value + Addend - FinalAddress, (uint8_t *)TargetPtr, 4);
+ Insn &= 0xffff0000;
+ Insn |= ((Value - FinalAddress + 0x8000) >> 16) & 0xffff;
+ writeBytesUnaligned(Insn, TargetPtr, 4);
break;
}
+ case ELF::R_MIPS_PCLO16: {
+ uint32_t FinalAddress = (Section.LoadAddress + Offset);
+ Insn &= 0xffff0000;
+ Insn |= (Value - FinalAddress) & 0xffff;
+ writeBytesUnaligned(Insn, TargetPtr, 4);
+ break;
+ }
+ }
}
void RuntimeDyldELF::setMipsABI(const ObjectFile &Obj) {
@@ -1263,12 +1306,24 @@ relocation_iterator RuntimeDyldELF::processRelocationRef(
Section.StubOffset += getMaxStubSize();
}
} else {
- if (RelType == ELF::R_MIPS_HI16)
+ // FIXME: Calculate correct addends for R_MIPS_HI16, R_MIPS_LO16,
+ // R_MIPS_PCHI16 and R_MIPS_PCLO16 relocations.
+ if (RelType == ELF::R_MIPS_HI16 || RelType == ELF::R_MIPS_PCHI16)
Value.Addend += (Opcode & 0x0000ffff) << 16;
else if (RelType == ELF::R_MIPS_LO16)
Value.Addend += (Opcode & 0x0000ffff);
else if (RelType == ELF::R_MIPS_32)
Value.Addend += Opcode;
+ else if (RelType == ELF::R_MIPS_PCLO16)
+ Value.Addend += SignExtend32<16>((Opcode & 0x0000ffff));
+ else if (RelType == ELF::R_MIPS_PC16)
+ Value.Addend += SignExtend32<18>((Opcode & 0x0000ffff) << 2);
+ else if (RelType == ELF::R_MIPS_PC19_S2)
+ Value.Addend += SignExtend32<21>((Opcode & 0x0007ffff) << 2);
+ else if (RelType == ELF::R_MIPS_PC21_S2)
+ Value.Addend += SignExtend32<23>((Opcode & 0x001fffff) << 2);
+ else if (RelType == ELF::R_MIPS_PC26_S2)
+ Value.Addend += SignExtend32<28>((Opcode & 0x03ffffff) << 2);
processSimpleRelocation(SectionID, Offset, RelType, Value);
}
} else if (IsMipsN64ABI) {
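A standalone sketch of the arithmetic behind the new PC-relative MIPS cases, using invented constants; signExtend32 mirrors llvm::SignExtend32 from Support/MathExtras.h and none of this code is taken from the patch itself.

#include <cassert>
#include <cstdint>

// Mirrors llvm::SignExtend32<B> from Support/MathExtras.h.
template <unsigned B> static int32_t signExtend32(uint32_t X) {
  return int32_t(X << (32 - B)) >> (32 - B);
}

int main() {
  // Addend extraction for R_MIPS_PC16 (processRelocationRef): the 16-bit
  // immediate is a word offset, shifted left by 2 and sign-extended from 18
  // bits. 0xfffe encodes -2 words, i.e. an addend of -8 bytes.
  uint32_t Opcode = 0x1040fffe; // invented branch encoding
  assert(signExtend32<18>((Opcode & 0x0000ffff) << 2) == -8);

  // Applying the relocation (resolveMIPSRelocation): byte distance from the
  // fixup to the target, converted to words and masked into the instruction.
  uint32_t Insn = Opcode & 0xffff0000;
  uint32_t Value = 0x1000, FinalAddress = 0x1020; // invented addresses
  Insn |= ((Value - FinalAddress) >> 2) & 0xffff;
  assert((Insn & 0xffff) == 0xfff8); // -0x20 bytes = -8 words
  return 0;
}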
diff --git a/lib/ExecutionEngine/RuntimeDyld/RuntimeDyldMachO.cpp b/lib/ExecutionEngine/RuntimeDyld/RuntimeDyldMachO.cpp
index 74b13d60a984..c0741141757c 100644
--- a/lib/ExecutionEngine/RuntimeDyld/RuntimeDyldMachO.cpp
+++ b/lib/ExecutionEngine/RuntimeDyld/RuntimeDyldMachO.cpp
@@ -89,19 +89,11 @@ RelocationValueRef RuntimeDyldMachO::getRelocationValueRef(
}
void RuntimeDyldMachO::makeValueAddendPCRel(RelocationValueRef &Value,
- const ObjectFile &BaseTObj,
const relocation_iterator &RI,
unsigned OffsetToNextPC) {
- const MachOObjectFile &Obj =
- static_cast<const MachOObjectFile &>(BaseTObj);
- MachO::any_relocation_info RelInfo =
- Obj.getRelocation(RI->getRawDataRefImpl());
-
- bool IsPCRel = Obj.getAnyRelocationPCRel(RelInfo);
- if (IsPCRel) {
- ErrorOr<uint64_t> RelocAddr = RI->getAddress();
- Value.Offset += *RelocAddr + OffsetToNextPC;
- }
+ auto &O = *cast<MachOObjectFile>(RI->getObject());
+ section_iterator SecI = O.getRelocationRelocatedSection(RI);
+ Value.Offset += RI->getOffset() + OffsetToNextPC + SecI->getAddress();
}
void RuntimeDyldMachO::dumpRelocationToResolve(const RelocationEntry &RE,
diff --git a/lib/ExecutionEngine/RuntimeDyld/RuntimeDyldMachO.h b/lib/ExecutionEngine/RuntimeDyld/RuntimeDyldMachO.h
index 36ba8d1b93e7..0d7364f78597 100644
--- a/lib/ExecutionEngine/RuntimeDyld/RuntimeDyldMachO.h
+++ b/lib/ExecutionEngine/RuntimeDyld/RuntimeDyldMachO.h
@@ -95,7 +95,6 @@ protected:
/// Make the RelocationValueRef addend PC-relative.
void makeValueAddendPCRel(RelocationValueRef &Value,
- const ObjectFile &BaseTObj,
const relocation_iterator &RI,
unsigned OffsetToNextPC);
diff --git a/lib/ExecutionEngine/RuntimeDyld/Targets/RuntimeDyldMachOAArch64.h b/lib/ExecutionEngine/RuntimeDyld/Targets/RuntimeDyldMachOAArch64.h
index 99fd6e333b47..7bf764114bae 100644
--- a/lib/ExecutionEngine/RuntimeDyld/Targets/RuntimeDyldMachOAArch64.h
+++ b/lib/ExecutionEngine/RuntimeDyld/Targets/RuntimeDyldMachOAArch64.h
@@ -284,7 +284,7 @@ public:
bool IsExtern = Obj.getPlainRelocationExternal(RelInfo);
if (!IsExtern && RE.IsPCRel)
- makeValueAddendPCRel(Value, Obj, RelI, 1 << RE.Size);
+ makeValueAddendPCRel(Value, RelI, 1 << RE.Size);
RE.Addend = Value.Offset;
diff --git a/lib/ExecutionEngine/RuntimeDyld/Targets/RuntimeDyldMachOARM.h b/lib/ExecutionEngine/RuntimeDyld/Targets/RuntimeDyldMachOARM.h
index 0d9445e84f09..0a24bb2f5eae 100644
--- a/lib/ExecutionEngine/RuntimeDyld/Targets/RuntimeDyldMachOARM.h
+++ b/lib/ExecutionEngine/RuntimeDyld/Targets/RuntimeDyldMachOARM.h
@@ -74,7 +74,7 @@ public:
getRelocationValueRef(Obj, RelI, RE, ObjSectionToID));
if (RE.IsPCRel)
- makeValueAddendPCRel(Value, Obj, RelI, 8);
+ makeValueAddendPCRel(Value, RelI, 8);
if ((RE.RelType & 0xf) == MachO::ARM_RELOC_BR24)
processBranchRelocation(RE, Value, Stubs);
diff --git a/lib/ExecutionEngine/RuntimeDyld/Targets/RuntimeDyldMachOI386.h b/lib/ExecutionEngine/RuntimeDyld/Targets/RuntimeDyldMachOI386.h
index aceb304abb1e..569a078d7f3d 100644
--- a/lib/ExecutionEngine/RuntimeDyld/Targets/RuntimeDyldMachOI386.h
+++ b/lib/ExecutionEngine/RuntimeDyld/Targets/RuntimeDyldMachOI386.h
@@ -68,7 +68,7 @@ public:
// Value.Addend += RelocAddr + 4;
// }
if (RE.IsPCRel)
- makeValueAddendPCRel(Value, Obj, RelI, 1 << RE.Size);
+ makeValueAddendPCRel(Value, RelI, 1 << RE.Size);
RE.Addend = Value.Offset;
diff --git a/lib/ExecutionEngine/RuntimeDyld/Targets/RuntimeDyldMachOX86_64.h b/lib/ExecutionEngine/RuntimeDyld/Targets/RuntimeDyldMachOX86_64.h
index 4b3b01ba3c96..dd56e72f9144 100644
--- a/lib/ExecutionEngine/RuntimeDyld/Targets/RuntimeDyldMachOX86_64.h
+++ b/lib/ExecutionEngine/RuntimeDyld/Targets/RuntimeDyldMachOX86_64.h
@@ -50,7 +50,7 @@ public:
bool IsExtern = Obj.getPlainRelocationExternal(RelInfo);
if (!IsExtern && RE.IsPCRel)
- makeValueAddendPCRel(Value, Obj, RelI, 1 << RE.Size);
+ makeValueAddendPCRel(Value, RelI, 1 << RE.Size);
if (RE.RelType == MachO::X86_64_RELOC_GOT ||
RE.RelType == MachO::X86_64_RELOC_GOT_LOAD)
diff --git a/lib/IR/Attributes.cpp b/lib/IR/Attributes.cpp
index c3032f4ffc79..546a98670a29 100644
--- a/lib/IR/Attributes.cpp
+++ b/lib/IR/Attributes.cpp
@@ -190,6 +190,8 @@ std::string Attribute::getAsString(bool InAttrGrp) const {
return "sanitize_address";
if (hasAttribute(Attribute::AlwaysInline))
return "alwaysinline";
+ if (hasAttribute(Attribute::ArgMemOnly))
+ return "argmemonly";
if (hasAttribute(Attribute::Builtin))
return "builtin";
if (hasAttribute(Attribute::ByVal))
@@ -447,6 +449,9 @@ uint64_t AttributeImpl::getAttrMask(Attribute::AttrKind Val) {
llvm_unreachable("dereferenceable_or_null attribute not supported in raw "
"format");
break;
+ case Attribute::ArgMemOnly:
+ llvm_unreachable("argmemonly attribute not supported in raw format");
+ break;
}
llvm_unreachable("Unsupported attribute type");
}
@@ -1356,7 +1361,8 @@ AttrBuilder &AttrBuilder::addRawValue(uint64_t Val) {
for (Attribute::AttrKind I = Attribute::None; I != Attribute::EndAttrKinds;
I = Attribute::AttrKind(I + 1)) {
if (I == Attribute::Dereferenceable ||
- I == Attribute::DereferenceableOrNull)
+ I == Attribute::DereferenceableOrNull ||
+ I == Attribute::ArgMemOnly)
continue;
if (uint64_t A = (Val & AttributeImpl::getAttrMask(I))) {
Attrs[I] = true;
diff --git a/lib/IR/AutoUpgrade.cpp b/lib/IR/AutoUpgrade.cpp
index 70a55186ea9a..f1c6ebd4846e 100644
--- a/lib/IR/AutoUpgrade.cpp
+++ b/lib/IR/AutoUpgrade.cpp
@@ -229,6 +229,7 @@ static bool UpgradeIntrinsicFunction1(Function *F, Function *&NewFn) {
bool llvm::UpgradeIntrinsicFunction(Function *F, Function *&NewFn) {
NewFn = nullptr;
bool Upgraded = UpgradeIntrinsicFunction1(F, NewFn);
+ assert(F != NewFn && "Intrinsic function upgraded to the same function");
// Upgrade intrinsic attributes. This does not change the function.
if (NewFn)
@@ -710,16 +711,14 @@ void llvm::UpgradeCallsToIntrinsic(Function* F) {
// Upgrade the function and check if it is a totally new function.
Function *NewFn;
if (UpgradeIntrinsicFunction(F, NewFn)) {
- if (NewFn != F) {
- // Replace all uses to the old function with the new one if necessary.
- for (Value::user_iterator UI = F->user_begin(), UE = F->user_end();
- UI != UE; ) {
- if (CallInst *CI = dyn_cast<CallInst>(*UI++))
- UpgradeIntrinsicCall(CI, NewFn);
- }
- // Remove old function, no longer used, from the module.
- F->eraseFromParent();
+ // Replace all uses of the old function with the new one if necessary.
+ for (Value::user_iterator UI = F->user_begin(), UE = F->user_end();
+ UI != UE;) {
+ if (CallInst *CI = dyn_cast<CallInst>(*UI++))
+ UpgradeIntrinsicCall(CI, NewFn);
}
+ // Remove old function, no longer used, from the module.
+ F->eraseFromParent();
}
}
diff --git a/lib/IR/BasicBlock.cpp b/lib/IR/BasicBlock.cpp
index 77cb10d5b6ba..0a0449434a7b 100644
--- a/lib/IR/BasicBlock.cpp
+++ b/lib/IR/BasicBlock.cpp
@@ -163,47 +163,40 @@ CallInst *BasicBlock::getTerminatingMustTailCall() {
}
Instruction* BasicBlock::getFirstNonPHI() {
- BasicBlock::iterator i = begin();
- // All valid basic blocks should have a terminator,
- // which is not a PHINode. If we have an invalid basic
- // block we'll get an assertion failure when dereferencing
- // a past-the-end iterator.
- while (isa<PHINode>(i)) ++i;
- return &*i;
+ for (Instruction &I : *this)
+ if (!isa<PHINode>(I))
+ return &I;
+ return nullptr;
}
Instruction* BasicBlock::getFirstNonPHIOrDbg() {
- BasicBlock::iterator i = begin();
- // All valid basic blocks should have a terminator,
- // which is not a PHINode. If we have an invalid basic
- // block we'll get an assertion failure when dereferencing
- // a past-the-end iterator.
- while (isa<PHINode>(i) || isa<DbgInfoIntrinsic>(i)) ++i;
- return &*i;
+ for (Instruction &I : *this)
+ if (!isa<PHINode>(I) && !isa<DbgInfoIntrinsic>(I))
+ return &I;
+ return nullptr;
}
Instruction* BasicBlock::getFirstNonPHIOrDbgOrLifetime() {
- // All valid basic blocks should have a terminator,
- // which is not a PHINode. If we have an invalid basic
- // block we'll get an assertion failure when dereferencing
- // a past-the-end iterator.
- BasicBlock::iterator i = begin();
- for (;; ++i) {
- if (isa<PHINode>(i) || isa<DbgInfoIntrinsic>(i))
+ for (Instruction &I : *this) {
+ if (isa<PHINode>(I) || isa<DbgInfoIntrinsic>(I))
continue;
- const IntrinsicInst *II = dyn_cast<IntrinsicInst>(i);
- if (!II)
- break;
- if (II->getIntrinsicID() != Intrinsic::lifetime_start &&
- II->getIntrinsicID() != Intrinsic::lifetime_end)
- break;
+ if (auto *II = dyn_cast<IntrinsicInst>(&I))
+ if (II->getIntrinsicID() == Intrinsic::lifetime_start ||
+ II->getIntrinsicID() == Intrinsic::lifetime_end)
+ continue;
+
+ return &I;
}
- return &*i;
+ return nullptr;
}
BasicBlock::iterator BasicBlock::getFirstInsertionPt() {
- iterator InsertPt = getFirstNonPHI();
+ Instruction *FirstNonPHI = getFirstNonPHI();
+ if (!FirstNonPHI)
+ return end();
+
+ iterator InsertPt = FirstNonPHI;
if (isa<LandingPadInst>(InsertPt)) ++InsertPt;
return InsertPt;
}
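A hedged caller-side sketch of what the new nullptr/end() results mean for users of these accessors; hasSafeInsertionPoint is a hypothetical helper, not part of this patch.

#include "llvm/IR/BasicBlock.h"
#include "llvm/IR/Instruction.h"

// A block that holds nothing but PHI nodes (e.g. one still under
// construction) now reports "no insertion point" instead of walking off the
// end of the instruction list.
static bool hasSafeInsertionPoint(llvm::BasicBlock &BB) {
  if (!BB.getFirstNonPHI())   // nullptr: only PHIs in the block
    return false;
  return BB.getFirstInsertionPt() != BB.end();
}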
diff --git a/lib/IR/Core.cpp b/lib/IR/Core.cpp
index 23e923d41126..e0e729d534bd 100644
--- a/lib/IR/Core.cpp
+++ b/lib/IR/Core.cpp
@@ -1691,6 +1691,14 @@ void LLVMDeleteFunction(LLVMValueRef Fn) {
unwrap<Function>(Fn)->eraseFromParent();
}
+LLVMValueRef LLVMGetPersonalityFn(LLVMValueRef Fn) {
+ return wrap(unwrap<Function>(Fn)->getPersonalityFn());
+}
+
+void LLVMSetPersonalityFn(LLVMValueRef Fn, LLVMValueRef PersonalityFn) {
+ unwrap<Function>(Fn)->setPersonalityFn(unwrap<Constant>(PersonalityFn));
+}
+
unsigned LLVMGetIntrinsicID(LLVMValueRef Fn) {
if (Function *F = dyn_cast<Function>(unwrap(Fn)))
return F->getIntrinsicID();
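A minimal usage sketch of the two new C API entry points, assuming only functions declared in llvm-c/Core.h; attachPersonality and the personality symbol name are illustrative, not part of this patch.

#include "llvm-c/Core.h"

// Attaches a personality to F and reads it back through the new wrappers.
static void attachPersonality(LLVMModuleRef M, LLVMValueRef F) {
  LLVMTypeRef PersTy =
      LLVMFunctionType(LLVMInt32Type(), /*ParamTypes=*/nullptr,
                       /*ParamCount=*/0, /*IsVarArg=*/1);
  LLVMValueRef Pers = LLVMAddFunction(M, "__gxx_personality_v0", PersTy);
  LLVMSetPersonalityFn(F, Pers);
  LLVMValueRef RoundTrip = LLVMGetPersonalityFn(F);
  (void)RoundTrip;
}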
diff --git a/lib/IR/DIBuilder.cpp b/lib/IR/DIBuilder.cpp
index 6a3ff0e8e457..2a90e70af1a3 100644
--- a/lib/IR/DIBuilder.cpp
+++ b/lib/IR/DIBuilder.cpp
@@ -73,37 +73,47 @@ void DIBuilder::trackIfUnresolved(MDNode *N) {
}
void DIBuilder::finalize() {
- if (CUNode) {
- CUNode->replaceEnumTypes(MDTuple::get(VMContext, AllEnumTypes));
-
- SmallVector<Metadata *, 16> RetainValues;
- // Declarations and definitions of the same type may be retained. Some
- // clients RAUW these pairs, leaving duplicates in the retained types
- // list. Use a set to remove the duplicates while we transform the
- // TrackingVHs back into Values.
- SmallPtrSet<Metadata *, 16> RetainSet;
- for (unsigned I = 0, E = AllRetainTypes.size(); I < E; I++)
- if (RetainSet.insert(AllRetainTypes[I]).second)
- RetainValues.push_back(AllRetainTypes[I]);
+ if (!CUNode) {
+ assert(!AllowUnresolvedNodes &&
+ "creating type nodes without a CU is not supported");
+ return;
+ }
+
+ CUNode->replaceEnumTypes(MDTuple::get(VMContext, AllEnumTypes));
+
+ SmallVector<Metadata *, 16> RetainValues;
+ // Declarations and definitions of the same type may be retained. Some
+ // clients RAUW these pairs, leaving duplicates in the retained types
+ // list. Use a set to remove the duplicates while we transform the
+ // TrackingVHs back into Values.
+ SmallPtrSet<Metadata *, 16> RetainSet;
+ for (unsigned I = 0, E = AllRetainTypes.size(); I < E; I++)
+ if (RetainSet.insert(AllRetainTypes[I]).second)
+ RetainValues.push_back(AllRetainTypes[I]);
+
+ if (!RetainValues.empty())
CUNode->replaceRetainedTypes(MDTuple::get(VMContext, RetainValues));
- DISubprogramArray SPs = MDTuple::get(VMContext, AllSubprograms);
+ DISubprogramArray SPs = MDTuple::get(VMContext, AllSubprograms);
+ if (!AllSubprograms.empty())
CUNode->replaceSubprograms(SPs.get());
- for (auto *SP : SPs) {
- if (MDTuple *Temp = SP->getVariables().get()) {
- const auto &PV = PreservedVariables.lookup(SP);
- SmallVector<Metadata *, 4> Variables(PV.begin(), PV.end());
- DINodeArray AV = getOrCreateArray(Variables);
- TempMDTuple(Temp)->replaceAllUsesWith(AV.get());
- }
+
+ for (auto *SP : SPs) {
+ if (MDTuple *Temp = SP->getVariables().get()) {
+ const auto &PV = PreservedVariables.lookup(SP);
+ SmallVector<Metadata *, 4> Variables(PV.begin(), PV.end());
+ DINodeArray AV = getOrCreateArray(Variables);
+ TempMDTuple(Temp)->replaceAllUsesWith(AV.get());
}
+ }
+ if (!AllGVs.empty())
CUNode->replaceGlobalVariables(MDTuple::get(VMContext, AllGVs));
+ if (!AllImportedModules.empty())
CUNode->replaceImportedEntities(MDTuple::get(
VMContext, SmallVector<Metadata *, 16>(AllImportedModules.begin(),
AllImportedModules.end())));
- }
// Now that all temp nodes have been replaced or deleted, resolve remaining
// cycles.
@@ -585,7 +595,7 @@ DILocalVariable *DIBuilder::createLocalVariable(
DIType *Ty, bool AlwaysPreserve, unsigned Flags, unsigned ArgNo) {
// FIXME: Why getNonCompileUnitScope()?
// FIXME: Why is "!Context" okay here?
- // FIXME: WHy doesn't this check for a subprogram or lexical block (AFAICT
+ // FIXME: Why doesn't this check for a subprogram or lexical block (AFAICT
// the only valid scopes)?
DIScope *Context = getNonCompileUnitScope(Scope);
@@ -593,7 +603,7 @@ DILocalVariable *DIBuilder::createLocalVariable(
VMContext, Tag, cast_or_null<DILocalScope>(Context), Name, File, LineNo,
DITypeRef::get(Ty), ArgNo, Flags);
if (AlwaysPreserve) {
- // The optimizer may remove local variable. If there is an interest
+ // The optimizer may remove local variables. If there is an interest
// to preserve variable info in such situation then stash it in a
// named mdnode.
DISubprogram *Fn = getDISubprogram(Scope);
@@ -857,7 +867,7 @@ void DIBuilder::replaceArrays(DICompositeType *&T, DINodeArray Elements,
if (!T->isResolved())
return;
- // If "T" is resolved, it may be due to a self-reference cycle. Track the
+ // If T is resolved, it may be due to a self-reference cycle. Track the
// arrays explicitly if they're unresolved, or else the cycles will be
// orphaned.
if (Elements)
diff --git a/lib/IR/Dominators.cpp b/lib/IR/Dominators.cpp
index e3258895ea5e..b6a8bbcbe5fa 100644
--- a/lib/IR/Dominators.cpp
+++ b/lib/IR/Dominators.cpp
@@ -62,18 +62,14 @@ bool BasicBlockEdge::isSingleEdge() const {
//
//===----------------------------------------------------------------------===//
-TEMPLATE_INSTANTIATION(class llvm::DomTreeNodeBase<BasicBlock>);
-TEMPLATE_INSTANTIATION(class llvm::DominatorTreeBase<BasicBlock>);
-
-#define LLVM_COMMA ,
-TEMPLATE_INSTANTIATION(void llvm::Calculate<Function LLVM_COMMA BasicBlock *>(
- DominatorTreeBase<GraphTraits<BasicBlock *>::NodeType> &DT LLVM_COMMA
- Function &F));
-TEMPLATE_INSTANTIATION(
- void llvm::Calculate<Function LLVM_COMMA Inverse<BasicBlock *> >(
- DominatorTreeBase<GraphTraits<Inverse<BasicBlock *> >::NodeType> &DT
- LLVM_COMMA Function &F));
-#undef LLVM_COMMA
+template class llvm::DomTreeNodeBase<BasicBlock>;
+template class llvm::DominatorTreeBase<BasicBlock>;
+
+template void llvm::Calculate<Function, BasicBlock *>(
+ DominatorTreeBase<GraphTraits<BasicBlock *>::NodeType> &DT, Function &F);
+template void llvm::Calculate<Function, Inverse<BasicBlock *>>(
+ DominatorTreeBase<GraphTraits<Inverse<BasicBlock *>>::NodeType> &DT,
+ Function &F);
// dominates - Return true if Def dominates a use in User. This performs
// the special checks necessary if Def and User are in the same basic block.
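A made-up illustration of why the old TEMPLATE_INSTANTIATION/LLVM_COMMA workaround existed and why the plain explicit instantiation definitions used above no longer need it: a comma inside a template-id would otherwise split the macro argument in two.

template <typename A, typename B> struct Pair { A First; B Second; };
template struct Pair<int, long>; // explicit instantiation definition: the
                                 // comma needs no placeholder outside a macro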
diff --git a/lib/IR/Value.cpp b/lib/IR/Value.cpp
index 78d1adb5e700..f554d590284f 100644
--- a/lib/IR/Value.cpp
+++ b/lib/IR/Value.cpp
@@ -39,8 +39,6 @@ using namespace llvm;
//===----------------------------------------------------------------------===//
// Value Class
//===----------------------------------------------------------------------===//
-const unsigned Value::NumUserOperandsBits;
-
static inline Type *checkType(Type *Ty) {
assert(Ty && "Value defined with a null type: Error!");
return Ty;
diff --git a/lib/IR/Verifier.cpp b/lib/IR/Verifier.cpp
index 3c61165768f8..2a0a4ff393ed 100644
--- a/lib/IR/Verifier.cpp
+++ b/lib/IR/Verifier.cpp
@@ -184,12 +184,12 @@ class Verifier : public InstVisitor<Verifier>, VerifierSupport {
/// \brief Track unresolved string-based type references.
SmallDenseMap<const MDString *, const MDNode *, 32> UnresolvedTypeRefs;
- /// \brief Whether we've seen a call to @llvm.frameescape in this function
+ /// \brief Whether we've seen a call to @llvm.localescape in this function
/// already.
bool SawFrameEscape;
- /// Stores the count of how many objects were passed to llvm.frameescape for a
- /// given function and the largest index passed to llvm.framerecover.
+ /// Stores the count of how many objects were passed to llvm.localescape for a
+ /// given function and the largest index passed to llvm.localrecover.
DenseMap<Function *, std::pair<unsigned, unsigned>> FrameEscapeInfo;
public:
@@ -438,6 +438,9 @@ void Verifier::visitGlobalValue(const GlobalValue &GV) {
Assert(GVar && GVar->getValueType()->isArrayTy(),
"Only global arrays can have appending linkage!", GVar);
}
+
+ if (GV.isDeclarationForLinker())
+ Assert(!GV.hasComdat(), "Declaration may not be in a Comdat!", &GV);
}
void Verifier::visitGlobalVariable(const GlobalVariable &GV) {
@@ -1270,7 +1273,8 @@ void Verifier::VerifyAttributeTypes(AttributeSet Attrs, unsigned Idx,
I->getKindAsEnum() == Attribute::Cold ||
I->getKindAsEnum() == Attribute::OptimizeNone ||
I->getKindAsEnum() == Attribute::JumpTable ||
- I->getKindAsEnum() == Attribute::Convergent) {
+ I->getKindAsEnum() == Attribute::Convergent ||
+ I->getKindAsEnum() == Attribute::ArgMemOnly) {
if (!isFunction) {
CheckFailed("Attribute '" + I->getAsString() +
"' only applies to functions!", V);
@@ -1528,8 +1532,9 @@ void Verifier::VerifyStatepoint(ImmutableCallSite CS) {
const Instruction &CI = *CS.getInstruction();
- Assert(!CS.doesNotAccessMemory() && !CS.onlyReadsMemory(),
- "gc.statepoint must read and write memory to preserve "
+ Assert(!CS.doesNotAccessMemory() && !CS.onlyReadsMemory() &&
+ !CS.onlyAccessesArgMemory(),
+ "gc.statepoint must read and write all memory to preserve "
"reordering restrictions required by safepoint semantics",
&CI);
@@ -1666,8 +1671,8 @@ void Verifier::verifyFrameRecoverIndices() {
unsigned EscapedObjectCount = Counts.second.first;
unsigned MaxRecoveredIndex = Counts.second.second;
Assert(MaxRecoveredIndex <= EscapedObjectCount,
- "all indices passed to llvm.framerecover must be less than the "
- "number of arguments passed ot llvm.frameescape in the parent "
+ "all indices passed to llvm.localrecover must be less than the "
+ "number of arguments passed ot llvm.localescape in the parent "
"function",
F);
}
@@ -2535,10 +2540,6 @@ void Verifier::visitGetElementPtrInst(GetElementPtrInst &GEP) {
Assert(isa<PointerType>(TargetTy),
"GEP base pointer is not a vector or a vector of pointers", &GEP);
Assert(GEP.getSourceElementType()->isSized(), "GEP into unsized type!", &GEP);
- Assert(GEP.getPointerOperandType()->isVectorTy() ==
- GEP.getType()->isVectorTy(),
- "Vector GEP must return a vector value", &GEP);
-
SmallVector<Value*, 16> Idxs(GEP.idx_begin(), GEP.idx_end());
Type *ElTy =
GetElementPtrInst::getIndexedType(GEP.getSourceElementType(), Idxs);
@@ -2548,17 +2549,20 @@ void Verifier::visitGetElementPtrInst(GetElementPtrInst &GEP) {
GEP.getResultElementType() == ElTy,
"GEP is not of right type for indices!", &GEP, ElTy);
- if (GEP.getPointerOperandType()->isVectorTy()) {
+ if (GEP.getType()->isVectorTy()) {
// Additional checks for vector GEPs.
- unsigned GepWidth = GEP.getPointerOperandType()->getVectorNumElements();
- Assert(GepWidth == GEP.getType()->getVectorNumElements(),
- "Vector GEP result width doesn't match operand's", &GEP);
+ unsigned GEPWidth = GEP.getType()->getVectorNumElements();
+ if (GEP.getPointerOperandType()->isVectorTy())
+ Assert(GEPWidth == GEP.getPointerOperandType()->getVectorNumElements(),
+ "Vector GEP result width doesn't match operand's", &GEP);
for (unsigned i = 0, e = Idxs.size(); i != e; ++i) {
Type *IndexTy = Idxs[i]->getType();
- Assert(IndexTy->isVectorTy(), "Vector GEP must have vector indices!",
- &GEP);
- unsigned IndexWidth = IndexTy->getVectorNumElements();
- Assert(IndexWidth == GepWidth, "Invalid GEP index vector width", &GEP);
+ if (IndexTy->isVectorTy()) {
+ unsigned IndexWidth = IndexTy->getVectorNumElements();
+ Assert(IndexWidth == GEPWidth, "Invalid GEP index vector width", &GEP);
+ }
+ Assert(IndexTy->getScalarType()->isIntegerTy(),
+ "All GEP indices should be of integer type");
}
}
visitInstruction(GEP);
@@ -3276,32 +3280,32 @@ void Verifier::visitIntrinsicCallSite(Intrinsic::ID ID, CallSite CS) {
"llvm.invariant.end parameter #2 must be a constant integer", CS);
break;
- case Intrinsic::frameescape: {
+ case Intrinsic::localescape: {
BasicBlock *BB = CS.getParent();
Assert(BB == &BB->getParent()->front(),
- "llvm.frameescape used outside of entry block", CS);
+ "llvm.localescape used outside of entry block", CS);
Assert(!SawFrameEscape,
- "multiple calls to llvm.frameescape in one function", CS);
+ "multiple calls to llvm.localescape in one function", CS);
for (Value *Arg : CS.args()) {
if (isa<ConstantPointerNull>(Arg))
continue; // Null values are allowed as placeholders.
auto *AI = dyn_cast<AllocaInst>(Arg->stripPointerCasts());
Assert(AI && AI->isStaticAlloca(),
- "llvm.frameescape only accepts static allocas", CS);
+ "llvm.localescape only accepts static allocas", CS);
}
FrameEscapeInfo[BB->getParent()].first = CS.getNumArgOperands();
SawFrameEscape = true;
break;
}
- case Intrinsic::framerecover: {
+ case Intrinsic::localrecover: {
Value *FnArg = CS.getArgOperand(0)->stripPointerCasts();
Function *Fn = dyn_cast<Function>(FnArg);
Assert(Fn && !Fn->isDeclaration(),
- "llvm.framerecover first "
+ "llvm.localrecover first "
"argument must be function defined in this module",
CS);
auto *IdxArg = dyn_cast<ConstantInt>(CS.getArgOperand(2));
- Assert(IdxArg, "idx argument of llvm.framerecover must be a constant int",
+ Assert(IdxArg, "idx argument of llvm.localrecover must be a constant int",
CS);
auto &Entry = FrameEscapeInfo[Fn];
Entry.second = unsigned(
diff --git a/lib/LTO/LTOModule.cpp b/lib/LTO/LTOModule.cpp
index 6131c3180249..53ed4175f8e3 100644
--- a/lib/LTO/LTOModule.cpp
+++ b/lib/LTO/LTOModule.cpp
@@ -473,6 +473,9 @@ void LTOModule::addDefinedSymbol(const char *Name, const GlobalValue *def,
if (def->hasComdat())
attr |= LTO_SYMBOL_COMDAT;
+ if (isa<GlobalAlias>(def))
+ attr |= LTO_SYMBOL_ALIAS;
+
auto Iter = _defines.insert(Name).first;
// fill information structure
diff --git a/lib/LibDriver/LibDriver.cpp b/lib/LibDriver/LibDriver.cpp
index cb3278c716e6..b33a22ff0cf8 100644
--- a/lib/LibDriver/LibDriver.cpp
+++ b/lib/LibDriver/LibDriver.cpp
@@ -56,17 +56,13 @@ public:
}
-static std::string getOutputPath(llvm::opt::InputArgList *Args) {
+static std::string getOutputPath(llvm::opt::InputArgList *Args,
+ const llvm::NewArchiveIterator &FirstMember) {
if (auto *Arg = Args->getLastArg(OPT_out))
return Arg->getValue();
- for (auto *Arg : Args->filtered(OPT_INPUT)) {
- if (!StringRef(Arg->getValue()).endswith_lower(".obj"))
- continue;
- SmallString<128> Val = StringRef(Arg->getValue());
- llvm::sys::path::replace_extension(Val, ".lib");
- return Val.str();
- }
- llvm_unreachable("internal error");
+ SmallString<128> Val = FirstMember.getNew();
+ llvm::sys::path::replace_extension(Val, ".lib");
+ return Val.str();
}
static std::vector<StringRef> getSearchPaths(llvm::opt::InputArgList *Args,
@@ -144,7 +140,10 @@ int llvm::libDriverMain(llvm::ArrayRef<const char*> ArgsArr) {
}
std::pair<StringRef, std::error_code> Result =
- llvm::writeArchive(getOutputPath(&Args), Members, /*WriteSymtab=*/true);
+ llvm::writeArchive(getOutputPath(&Args, Members[0]), Members,
+ /*WriteSymtab=*/true, object::Archive::K_GNU,
+ /*Deterministic*/ true);
+
if (Result.second) {
if (Result.first.empty())
Result.first = ArgsArr[0];
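A sketch of the fallback path in the new getOutputPath(), using llvm::sys::path as the code above does; defaultLibName is a hypothetical helper, not part of this patch.

#include "llvm/ADT/SmallString.h"
#include "llvm/Support/Path.h"
#include <string>

// Without /out:, the archive name is derived from the first member by
// swapping its extension for ".lib".
static std::string defaultLibName(llvm::StringRef FirstMember) {
  llvm::SmallString<128> Val = FirstMember;        // e.g. "foo.obj"
  llvm::sys::path::replace_extension(Val, ".lib");
  return Val.str().str();                          // "foo.lib"
}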
diff --git a/lib/MC/CMakeLists.txt b/lib/MC/CMakeLists.txt
index 13c5ca9561df..6554d6a9e60e 100644
--- a/lib/MC/CMakeLists.txt
+++ b/lib/MC/CMakeLists.txt
@@ -28,6 +28,7 @@ add_llvm_library(LLVMMC
MCObjectStreamer.cpp
MCObjectWriter.cpp
MCRegisterInfo.cpp
+ MCSchedule.cpp
MCSection.cpp
MCSectionCOFF.cpp
MCSectionELF.cpp
diff --git a/lib/MC/MCAsmStreamer.cpp b/lib/MC/MCAsmStreamer.cpp
index 9a65a3158972..227c937e8d1b 100644
--- a/lib/MC/MCAsmStreamer.cpp
+++ b/lib/MC/MCAsmStreamer.cpp
@@ -503,7 +503,8 @@ void MCAsmStreamer::EndCOFFSymbolDef() {
}
void MCAsmStreamer::EmitCOFFSafeSEH(MCSymbol const *Symbol) {
- OS << "\t.safeseh\t" << *Symbol;
+ OS << "\t.safeseh\t";
+ Symbol->print(OS, MAI);
EmitEOL();
}
diff --git a/lib/MC/MCAssembler.cpp b/lib/MC/MCAssembler.cpp
index da6516a4ac92..f53b589e1aea 100644
--- a/lib/MC/MCAssembler.cpp
+++ b/lib/MC/MCAssembler.cpp
@@ -925,7 +925,7 @@ void MCAssembler::Finish() {
Fixups = FragWithFixups->getFixups();
Contents = FragWithFixups->getContents();
} else
- llvm_unreachable("Unknow fragment with fixups!");
+ llvm_unreachable("Unknown fragment with fixups!");
for (const MCFixup &Fixup : Fixups) {
uint64_t FixedValue;
bool IsPCRel;
diff --git a/lib/MC/MCDisassembler/MCExternalSymbolizer.cpp b/lib/MC/MCDisassembler/MCExternalSymbolizer.cpp
index 68948d36d65c..5fc2ca44f5d4 100644
--- a/lib/MC/MCDisassembler/MCExternalSymbolizer.cpp
+++ b/lib/MC/MCDisassembler/MCExternalSymbolizer.cpp
@@ -16,6 +16,10 @@
using namespace llvm;
+namespace llvm {
+class Triple;
+}
+
// This function tries to add a symbolic operand in place of the immediate
// Value in the MCInst. The immediate Value has had any PC adjustment made by
// the caller. If the instruction is a branch instruction then IsBranch is true,
@@ -184,7 +188,7 @@ void MCExternalSymbolizer::tryAddingPcLoadReferenceComment(raw_ostream &cStream,
}
namespace llvm {
-MCSymbolizer *createMCSymbolizer(StringRef TT, LLVMOpInfoCallback GetOpInfo,
+MCSymbolizer *createMCSymbolizer(const Triple &TT, LLVMOpInfoCallback GetOpInfo,
LLVMSymbolLookupCallback SymbolLookUp,
void *DisInfo, MCContext *Ctx,
std::unique_ptr<MCRelocationInfo> &&RelInfo) {
diff --git a/lib/MC/MCInstrDesc.cpp b/lib/MC/MCInstrDesc.cpp
index decc2d84b252..5be2fa1b30b6 100644
--- a/lib/MC/MCInstrDesc.cpp
+++ b/lib/MC/MCInstrDesc.cpp
@@ -19,7 +19,7 @@
using namespace llvm;
-bool MCInstrDesc::getDeprecatedInfo(MCInst &MI, MCSubtargetInfo &STI,
+bool MCInstrDesc::getDeprecatedInfo(MCInst &MI, const MCSubtargetInfo &STI,
std::string &Info) const {
if (ComplexDeprecationInfo)
return ComplexDeprecationInfo(MI, STI, Info);
diff --git a/lib/MC/MCSchedule.cpp b/lib/MC/MCSchedule.cpp
new file mode 100644
index 000000000000..f3919427bf05
--- /dev/null
+++ b/lib/MC/MCSchedule.cpp
@@ -0,0 +1,34 @@
+//===- MCSchedule.cpp - Scheduling ------------------------------*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines the default scheduling model.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/MC/MCSchedule.h"
+#include <type_traits>
+
+using namespace llvm;
+
+static_assert(std::is_pod<MCSchedModel>::value,
+ "We shouldn't have a static constructor here");
+const MCSchedModel MCSchedModel::Default = {DefaultIssueWidth,
+ DefaultMicroOpBufferSize,
+ DefaultLoopMicroOpBufferSize,
+ DefaultLoadLatency,
+ DefaultHighLatency,
+ DefaultMispredictPenalty,
+ false,
+ true,
+ 0,
+ nullptr,
+ nullptr,
+ 0,
+ 0,
+ nullptr};
diff --git a/lib/MC/MCSubtargetInfo.cpp b/lib/MC/MCSubtargetInfo.cpp
index ece775c4f08f..9210cf544b16 100644
--- a/lib/MC/MCSubtargetInfo.cpp
+++ b/lib/MC/MCSubtargetInfo.cpp
@@ -17,42 +17,34 @@
using namespace llvm;
-/// InitMCProcessorInfo - Set or change the CPU (optionally supplemented
-/// with feature string). Recompute feature bits and scheduling model.
-void
-MCSubtargetInfo::InitMCProcessorInfo(StringRef CPU, StringRef FS) {
+static FeatureBitset getFeatures(StringRef CPU, StringRef FS,
+ ArrayRef<SubtargetFeatureKV> ProcDesc,
+ ArrayRef<SubtargetFeatureKV> ProcFeatures) {
SubtargetFeatures Features(FS);
- FeatureBits = Features.getFeatureBits(CPU, ProcDesc, ProcFeatures);
- InitCPUSchedModel(CPU);
+ return Features.getFeatureBits(CPU, ProcDesc, ProcFeatures);
}
-void
-MCSubtargetInfo::InitCPUSchedModel(StringRef CPU) {
+void MCSubtargetInfo::InitMCProcessorInfo(StringRef CPU, StringRef FS) {
+ FeatureBits = getFeatures(CPU, FS, ProcDesc, ProcFeatures);
if (!CPU.empty())
- CPUSchedModel = getSchedModelForCPU(CPU);
+ CPUSchedModel = &getSchedModelForCPU(CPU);
else
- CPUSchedModel = MCSchedModel::GetDefaultSchedModel();
+ CPUSchedModel = &MCSchedModel::GetDefaultSchedModel();
}
-void MCSubtargetInfo::InitMCSubtargetInfo(
+void MCSubtargetInfo::setDefaultFeatures(StringRef CPU) {
+ FeatureBits = getFeatures(CPU, "", ProcDesc, ProcFeatures);
+}
+
+MCSubtargetInfo::MCSubtargetInfo(
const Triple &TT, StringRef C, StringRef FS,
ArrayRef<SubtargetFeatureKV> PF, ArrayRef<SubtargetFeatureKV> PD,
const SubtargetInfoKV *ProcSched, const MCWriteProcResEntry *WPR,
const MCWriteLatencyEntry *WL, const MCReadAdvanceEntry *RA,
- const InstrStage *IS, const unsigned *OC, const unsigned *FP) {
- TargetTriple = TT;
- CPU = C;
- ProcFeatures = PF;
- ProcDesc = PD;
- ProcSchedModels = ProcSched;
- WriteProcResTable = WPR;
- WriteLatencyTable = WL;
- ReadAdvanceTable = RA;
-
- Stages = IS;
- OperandCycles = OC;
- ForwardingPaths = FP;
-
+ const InstrStage *IS, const unsigned *OC, const unsigned *FP)
+ : TargetTriple(TT), CPU(C), ProcFeatures(PF), ProcDesc(PD),
+ ProcSchedModels(ProcSched), WriteProcResTable(WPR), WriteLatencyTable(WL),
+ ReadAdvanceTable(RA), Stages(IS), OperandCycles(OC), ForwardingPaths(FP) {
InitMCProcessorInfo(CPU, FS);
}
@@ -82,8 +74,7 @@ FeatureBitset MCSubtargetInfo::ApplyFeatureFlag(StringRef FS) {
return FeatureBits;
}
-MCSchedModel
-MCSubtargetInfo::getSchedModelForCPU(StringRef CPU) const {
+const MCSchedModel &MCSubtargetInfo::getSchedModelForCPU(StringRef CPU) const {
assert(ProcSchedModels && "Processor machine model not available!");
unsigned NumProcs = ProcDesc.size();
@@ -116,6 +107,6 @@ MCSubtargetInfo::getInstrItineraryForCPU(StringRef CPU) const {
/// Initialize an InstrItineraryData instance.
void MCSubtargetInfo::initInstrItins(InstrItineraryData &InstrItins) const {
- InstrItins =
- InstrItineraryData(CPUSchedModel, Stages, OperandCycles, ForwardingPaths);
+ InstrItins = InstrItineraryData(getSchedModel(), Stages, OperandCycles,
+ ForwardingPaths);
}
diff --git a/lib/MC/MCSymbol.cpp b/lib/MC/MCSymbol.cpp
index affc57471fdb..125380a9d140 100644
--- a/lib/MC/MCSymbol.cpp
+++ b/lib/MC/MCSymbol.cpp
@@ -19,9 +19,6 @@ using namespace llvm;
// Sentinel value for the absolute pseudo section.
MCSection *MCSymbol::AbsolutePseudoSection = reinterpret_cast<MCSection *>(1);
-const unsigned MCSymbol::NumCommonAlignmentBits;
-const unsigned MCSymbol::NumFlagsBits;
-
void *MCSymbol::operator new(size_t s, const StringMapEntry<bool> *Name,
MCContext &Ctx) {
// We may need more space for a Name to account for alignment. So allocate
diff --git a/lib/Object/Archive.cpp b/lib/Object/Archive.cpp
index 54ed954a90d9..d4821196a6cf 100644
--- a/lib/Object/Archive.cpp
+++ b/lib/Object/Archive.cpp
@@ -17,6 +17,7 @@
#include "llvm/ADT/Twine.h"
#include "llvm/Support/Endian.h"
#include "llvm/Support/MemoryBuffer.h"
+#include "llvm/Support/Path.h"
using namespace llvm;
using namespace object;
@@ -115,6 +116,23 @@ uint64_t Archive::Child::getRawSize() const {
return getHeader()->getSize();
}
+ErrorOr<StringRef> Archive::Child::getBuffer() const {
+ if (!Parent->IsThin)
+ return StringRef(Data.data() + StartOfFile, getSize());
+ ErrorOr<StringRef> Name = getName();
+ if (std::error_code EC = Name.getError())
+ return EC;
+ SmallString<128> FullName =
+ Parent->getMemoryBufferRef().getBufferIdentifier();
+ sys::path::remove_filename(FullName);
+ sys::path::append(FullName, *Name);
+ ErrorOr<std::unique_ptr<MemoryBuffer>> Buf = MemoryBuffer::getFile(FullName);
+ if (std::error_code EC = Buf.getError())
+ return EC;
+ Parent->ThinBuffers.push_back(std::move(*Buf));
+ return Parent->ThinBuffers.back()->getBuffer();
+}
+
Archive::Child Archive::Child::getNext() const {
size_t SpaceToSkip = Data.size();
// If it's odd, add 1 to make it even.
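A standalone sketch of the path handling in Child::getBuffer() for thin archives: the member name is resolved relative to the directory containing the archive itself. thinMemberPath is a hypothetical helper, not part of this patch.

#include "llvm/ADT/SmallString.h"
#include "llvm/Support/Path.h"

static llvm::SmallString<128> thinMemberPath(llvm::StringRef ArchivePath,
                                             llvm::StringRef MemberName) {
  llvm::SmallString<128> FullName = ArchivePath;  // e.g. /tmp/lib/libfoo.a
  llvm::sys::path::remove_filename(FullName);     // -> /tmp/lib
  llvm::sys::path::append(FullName, MemberName);  // -> /tmp/lib/bar.o
  return FullName;
}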
@@ -162,10 +180,10 @@ ErrorOr<StringRef> Archive::Child::getName() const {
+ Parent->StringTable->getSize()))
return object_error::parse_failed;
- // GNU long file names end with a /.
+ // GNU long file names end with a "/\n".
if (Parent->kind() == K_GNU || Parent->kind() == K_MIPS64) {
- StringRef::size_type End = StringRef(addr).find('/');
- return StringRef(addr, End);
+ StringRef::size_type End = StringRef(addr).find('\n');
+ return StringRef(addr, End - 1);
}
return StringRef(addr);
} else if (name.startswith("#1/")) {
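A small self-contained example of why the terminator matters for GNU long names ("<name>/\n" entries in the string table): stopping at the first '/' truncates member names that contain a path separator, while stopping at '\n' and dropping the trailing '/' does not. The table contents below are invented.

#include <cassert>
#include <string>

int main() {
  std::string Table = "dir/verylongname.o/\nnext.o/\n"; // invented contents
  // Old behaviour: truncate at the first '/', losing part of the name.
  assert(Table.substr(0, Table.find('/')) == "dir");
  // New behaviour: stop at '\n' and drop the preceding '/'.
  std::string::size_type End = Table.find('\n');
  assert(Table.substr(0, End - 1) == "dir/verylongname.o");
  return 0;
}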
@@ -186,7 +204,10 @@ ErrorOr<MemoryBufferRef> Archive::Child::getMemoryBufferRef() const {
if (std::error_code EC = NameOrErr.getError())
return EC;
StringRef Name = NameOrErr.get();
- return MemoryBufferRef(getBuffer(), Name);
+ ErrorOr<StringRef> Buf = getBuffer();
+ if (std::error_code EC = Buf.getError())
+ return EC;
+ return MemoryBufferRef(*Buf, Name);
}
ErrorOr<std::unique_ptr<Binary>>
@@ -207,7 +228,8 @@ ErrorOr<std::unique_ptr<Archive>> Archive::create(MemoryBufferRef Source) {
}
Archive::Archive(MemoryBufferRef Source, std::error_code &ec)
- : Binary(Binary::ID_Archive, Source), SymbolTable(child_end()) {
+ : Binary(Binary::ID_Archive, Source), SymbolTable(child_end()),
+ StringTable(child_end()), FirstRegular(child_end()) {
StringRef Buffer = Data.getBuffer();
// Check for sufficient magic.
if (Buffer.startswith(ThinMagic)) {
@@ -287,7 +309,7 @@ Archive::Archive(MemoryBufferRef Source, std::error_code &ec)
++i;
if (i == e) {
- ec = object_error::parse_failed;
+ ec = std::error_code();
return;
}
Name = i->getRawName();
@@ -352,11 +374,11 @@ Archive::child_iterator Archive::child_end() const {
}
StringRef Archive::Symbol::getName() const {
- return Parent->SymbolTable->getBuffer().begin() + StringIndex;
+ return Parent->getSymbolTable().begin() + StringIndex;
}
ErrorOr<Archive::child_iterator> Archive::Symbol::getMember() const {
- const char *Buf = Parent->SymbolTable->getBuffer().begin();
+ const char *Buf = Parent->getSymbolTable().begin();
const char *Offsets = Buf;
if (Parent->kind() == K_MIPS64)
Offsets += sizeof(uint64_t);
@@ -420,7 +442,7 @@ Archive::Symbol Archive::Symbol::getNext() const {
// and the second being the offset into the archive of the member that
// defines the symbol. After that the next uint32_t is the byte count of
// the string table followed by the string table.
- const char *Buf = Parent->SymbolTable->getBuffer().begin();
+ const char *Buf = Parent->getSymbolTable().begin();
uint32_t RanlibCount = 0;
RanlibCount = read32le(Buf) / 8;
// If t.SymbolIndex + 1 will be past the count of symbols (the RanlibCount)
@@ -437,8 +459,7 @@ Archive::Symbol Archive::Symbol::getNext() const {
}
} else {
// Go to one past next null.
- t.StringIndex =
- Parent->SymbolTable->getBuffer().find('\0', t.StringIndex) + 1;
+ t.StringIndex = Parent->getSymbolTable().find('\0', t.StringIndex) + 1;
}
++t.SymbolIndex;
return t;
@@ -448,7 +469,7 @@ Archive::symbol_iterator Archive::symbol_begin() const {
if (!hasSymbolTable())
return symbol_iterator(Symbol(this, 0, 0));
- const char *buf = SymbolTable->getBuffer().begin();
+ const char *buf = getSymbolTable().begin();
if (kind() == K_GNU) {
uint32_t symbol_count = 0;
symbol_count = read32be(buf);
@@ -480,7 +501,7 @@ Archive::symbol_iterator Archive::symbol_begin() const {
symbol_count = read32le(buf);
buf += 4 + (symbol_count * 2); // Skip indices.
}
- uint32_t string_start_offset = buf - SymbolTable->getBuffer().begin();
+ uint32_t string_start_offset = buf - getSymbolTable().begin();
return symbol_iterator(Symbol(this, 0, string_start_offset));
}
@@ -491,7 +512,7 @@ Archive::symbol_iterator Archive::symbol_end() const {
}
uint32_t Archive::getNumberOfSymbols() const {
- const char *buf = SymbolTable->getBuffer().begin();
+ const char *buf = getSymbolTable().begin();
if (kind() == K_GNU)
return read32be(buf);
if (kind() == K_MIPS64)
diff --git a/lib/Object/ArchiveWriter.cpp b/lib/Object/ArchiveWriter.cpp
index 00a56d13bfed..a40901c924ea 100644
--- a/lib/Object/ArchiveWriter.cpp
+++ b/lib/Object/ArchiveWriter.cpp
@@ -34,8 +34,6 @@
using namespace llvm;
-NewArchiveIterator::NewArchiveIterator() {}
-
NewArchiveIterator::NewArchiveIterator(object::Archive::child_iterator I,
StringRef Name)
: IsNewMember(false), Name(Name), OldI(I) {}
@@ -93,8 +91,12 @@ static void printWithSpacePadding(raw_fd_ostream &OS, T Data, unsigned Size,
}
}
-static void print32BE(raw_ostream &Out, uint32_t Val) {
- support::endian::Writer<support::big>(Out).write(Val);
+static void print32(raw_ostream &Out, object::Archive::Kind Kind,
+ uint32_t Val) {
+ if (Kind == object::Archive::K_GNU)
+ support::endian::Writer<support::big>(Out).write(Val);
+ else
+ support::endian::Writer<support::little>(Out).write(Val);
}
static void printRestOfMemberHeader(raw_fd_ostream &Out,
@@ -109,18 +111,42 @@ static void printRestOfMemberHeader(raw_fd_ostream &Out,
Out << "`\n";
}
-static void printMemberHeader(raw_fd_ostream &Out, StringRef Name,
- const sys::TimeValue &ModTime, unsigned UID,
- unsigned GID, unsigned Perms, unsigned Size) {
+static void printGNUSmallMemberHeader(raw_fd_ostream &Out, StringRef Name,
+ const sys::TimeValue &ModTime,
+ unsigned UID, unsigned GID,
+ unsigned Perms, unsigned Size) {
printWithSpacePadding(Out, Twine(Name) + "/", 16);
printRestOfMemberHeader(Out, ModTime, UID, GID, Perms, Size);
}
-static void printMemberHeader(raw_fd_ostream &Out, unsigned NameOffset,
- const sys::TimeValue &ModTime, unsigned UID,
- unsigned GID, unsigned Perms, unsigned Size) {
+static void printBSDMemberHeader(raw_fd_ostream &Out, StringRef Name,
+ const sys::TimeValue &ModTime, unsigned UID,
+ unsigned GID, unsigned Perms, unsigned Size) {
+ uint64_t PosAfterHeader = Out.tell() + 60 + Name.size();
+ // Pad so that even 64 bit object files are aligned.
+ unsigned Pad = OffsetToAlignment(PosAfterHeader, 8);
+ unsigned NameWithPadding = Name.size() + Pad;
+ printWithSpacePadding(Out, Twine("#1/") + Twine(NameWithPadding), 16);
+ printRestOfMemberHeader(Out, ModTime, UID, GID, Perms,
+ NameWithPadding + Size);
+ Out << Name;
+ assert(PosAfterHeader == Out.tell());
+ while (Pad--)
+ Out.write(uint8_t(0));
+}
+
+static void
+printMemberHeader(raw_fd_ostream &Out, object::Archive::Kind Kind,
+ StringRef Name,
+ std::vector<unsigned>::iterator &StringMapIndexIter,
+ const sys::TimeValue &ModTime, unsigned UID, unsigned GID,
+ unsigned Perms, unsigned Size) {
+ if (Kind == object::Archive::K_BSD)
+ return printBSDMemberHeader(Out, Name, ModTime, UID, GID, Perms, Size);
+ if (Name.size() < 16)
+ return printGNUSmallMemberHeader(Out, Name, ModTime, UID, GID, Perms, Size);
Out << '/';
- printWithSpacePadding(Out, NameOffset, 15);
+ printWithSpacePadding(Out, *StringMapIndexIter++, 15);
printRestOfMemberHeader(Out, ModTime, UID, GID, Perms, Size);
}
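A standalone sketch of the BSD "#1/<len>" layout written by printBSDMemberHeader above: the name follows the fixed 60-byte header and is padded so the member body starts 8-byte aligned, with the padding counted in both the name length and the member size. offsetToAlignment mirrors llvm::OffsetToAlignment; the numbers are invented.

#include <cassert>
#include <cstdint>
#include <string>

// Same computation as llvm::OffsetToAlignment.
static uint64_t offsetToAlignment(uint64_t Pos, uint64_t Align) {
  return (Align - Pos % Align) % Align;
}

int main() {
  std::string Name = "0123456789abcdefg.o";             // 19 bytes, invented
  uint64_t PosAfterHeader = 8 + 60 + Name.size();       // archive magic + header
  uint64_t Pad = offsetToAlignment(PosAfterHeader, 8);  // 87 -> pad 1
  assert(Pad == 1 && (PosAfterHeader + Pad) % 8 == 0);
  // The name field records the padded length: "#1/20".
  assert("#1/" + std::to_string(Name.size() + Pad) == "#1/20");
  return 0;
}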
@@ -152,20 +178,26 @@ static void writeStringTable(raw_fd_ostream &Out,
Out.seek(Pos);
}
+static sys::TimeValue now(bool Deterministic) {
+ if (!Deterministic)
+ return sys::TimeValue::now();
+ sys::TimeValue TV;
+ TV.fromEpochTime(0);
+ return TV;
+}
+
// Returns the offset of the first reference to a member offset.
static ErrorOr<unsigned>
-writeSymbolTable(raw_fd_ostream &Out, ArrayRef<NewArchiveIterator> Members,
+writeSymbolTable(raw_fd_ostream &Out, object::Archive::Kind Kind,
+ ArrayRef<NewArchiveIterator> Members,
ArrayRef<MemoryBufferRef> Buffers,
- std::vector<unsigned> &MemberOffsetRefs) {
- unsigned StartOffset = 0;
- unsigned MemberNum = 0;
- std::string NameBuf;
- raw_string_ostream NameOS(NameBuf);
- unsigned NumSyms = 0;
+ std::vector<unsigned> &MemberOffsetRefs, bool Deterministic) {
+ unsigned HeaderStartOffset = 0;
+ unsigned BodyStartOffset = 0;
+ SmallString<128> NameBuf;
+ raw_svector_ostream NameOS(NameBuf);
LLVMContext Context;
- for (ArrayRef<NewArchiveIterator>::iterator I = Members.begin(),
- E = Members.end();
- I != E; ++I, ++MemberNum) {
+ for (unsigned MemberNum = 0, N = Members.size(); MemberNum < N; ++MemberNum) {
MemoryBufferRef MemberBuffer = Buffers[MemberNum];
ErrorOr<std::unique_ptr<object::SymbolicFile>> ObjOrErr =
object::SymbolicFile::createSymbolicFile(
@@ -174,10 +206,14 @@ writeSymbolTable(raw_fd_ostream &Out, ArrayRef<NewArchiveIterator> Members,
continue; // FIXME: check only for "not an object file" errors.
object::SymbolicFile &Obj = *ObjOrErr.get();
- if (!StartOffset) {
- printMemberHeader(Out, "", sys::TimeValue::now(), 0, 0, 0, 0);
- StartOffset = Out.tell();
- print32BE(Out, 0);
+ if (!HeaderStartOffset) {
+ HeaderStartOffset = Out.tell();
+ if (Kind == object::Archive::K_GNU)
+ printGNUSmallMemberHeader(Out, "", now(Deterministic), 0, 0, 0, 0);
+ else
+ printBSDMemberHeader(Out, "__.SYMDEF", now(Deterministic), 0, 0, 0, 0);
+ BodyStartOffset = Out.tell();
+ print32(Out, Kind, 0); // number of entries or bytes
}
for (const object::BasicSymbolRef &S : Obj.symbols()) {
@@ -188,35 +224,53 @@ writeSymbolTable(raw_fd_ostream &Out, ArrayRef<NewArchiveIterator> Members,
continue;
if (Symflags & object::SymbolRef::SF_Undefined)
continue;
+
+ unsigned NameOffset = NameOS.tell();
if (auto EC = S.printName(NameOS))
return EC;
NameOS << '\0';
- ++NumSyms;
MemberOffsetRefs.push_back(MemberNum);
- print32BE(Out, 0);
+ if (Kind == object::Archive::K_BSD)
+ print32(Out, Kind, NameOffset);
+ print32(Out, Kind, 0); // member offset
}
}
- Out << NameOS.str();
- if (StartOffset == 0)
+ if (HeaderStartOffset == 0)
return 0;
- if (Out.tell() % 2)
- Out << '\0';
+ StringRef StringTable = NameOS.str();
+ if (Kind == object::Archive::K_BSD)
+ print32(Out, Kind, StringTable.size()); // byte count of the string table
+ Out << StringTable;
+
+ // ld64 requires the next member header to start at an offset that is
+ // 4-byte aligned.
+ unsigned Pad = OffsetToAlignment(Out.tell(), 4);
+ while (Pad--)
+ Out.write(uint8_t(0));
+ // Patch up the size of the symbol table now that we know how big it is.
unsigned Pos = Out.tell();
- Out.seek(StartOffset - 12);
- printWithSpacePadding(Out, Pos - StartOffset, 10);
- Out.seek(StartOffset);
- print32BE(Out, NumSyms);
+ const unsigned MemberHeaderSize = 60;
+ Out.seek(HeaderStartOffset + 48); // offset of the size field.
+ printWithSpacePadding(Out, Pos - MemberHeaderSize - HeaderStartOffset, 10);
+
+ // Patch up the number of symbols.
+ Out.seek(BodyStartOffset);
+ unsigned NumSyms = MemberOffsetRefs.size();
+ if (Kind == object::Archive::K_GNU)
+ print32(Out, Kind, NumSyms);
+ else
+ print32(Out, Kind, NumSyms * 8);
+
Out.seek(Pos);
- return StartOffset + 4;
+ return BodyStartOffset + 4;
}
-std::pair<StringRef, std::error_code>
-llvm::writeArchive(StringRef ArcName,
- std::vector<NewArchiveIterator> &NewMembers,
- bool WriteSymtab) {
+std::pair<StringRef, std::error_code> llvm::writeArchive(
+ StringRef ArcName, std::vector<NewArchiveIterator> &NewMembers,
+ bool WriteSymtab, object::Archive::Kind Kind, bool Deterministic) {
SmallString<128> TmpArchive;
int TmpArchiveFD;
if (auto EC = sys::fs::createUniqueFile(ArcName + ".temp-archive-%%%%%%%.a",
@@ -267,56 +321,60 @@ llvm::writeArchive(StringRef ArcName,
unsigned MemberReferenceOffset = 0;
if (WriteSymtab) {
- ErrorOr<unsigned> MemberReferenceOffsetOrErr =
- writeSymbolTable(Out, NewMembers, Members, MemberOffsetRefs);
+ ErrorOr<unsigned> MemberReferenceOffsetOrErr = writeSymbolTable(
+ Out, Kind, NewMembers, Members, MemberOffsetRefs, Deterministic);
if (auto EC = MemberReferenceOffsetOrErr.getError())
return std::make_pair(ArcName, EC);
MemberReferenceOffset = MemberReferenceOffsetOrErr.get();
}
std::vector<unsigned> StringMapIndexes;
- writeStringTable(Out, NewMembers, StringMapIndexes);
+ if (Kind != object::Archive::K_BSD)
+ writeStringTable(Out, NewMembers, StringMapIndexes);
unsigned MemberNum = 0;
- unsigned LongNameMemberNum = 0;
unsigned NewMemberNum = 0;
+ std::vector<unsigned>::iterator StringMapIndexIter = StringMapIndexes.begin();
std::vector<unsigned> MemberOffset;
- for (std::vector<NewArchiveIterator>::iterator I = NewMembers.begin(),
- E = NewMembers.end();
- I != E; ++I, ++MemberNum) {
+ for (const NewArchiveIterator &I : NewMembers) {
+ MemoryBufferRef File = Members[MemberNum++];
unsigned Pos = Out.tell();
MemberOffset.push_back(Pos);
- MemoryBufferRef File = Members[MemberNum];
- if (I->isNewMember()) {
- StringRef FileName = I->getNew();
+ sys::TimeValue ModTime;
+ unsigned UID;
+ unsigned GID;
+ unsigned Perms;
+ if (Deterministic) {
+ ModTime.fromEpochTime(0);
+ UID = 0;
+ GID = 0;
+ Perms = 0644;
+ } else if (I.isNewMember()) {
const sys::fs::file_status &Status = NewMemberStatus[NewMemberNum];
- NewMemberNum++;
-
- StringRef Name = sys::path::filename(FileName);
- if (Name.size() < 16)
- printMemberHeader(Out, Name, Status.getLastModificationTime(),
- Status.getUser(), Status.getGroup(),
- Status.permissions(), Status.getSize());
- else
- printMemberHeader(Out, StringMapIndexes[LongNameMemberNum++],
- Status.getLastModificationTime(), Status.getUser(),
- Status.getGroup(), Status.permissions(),
- Status.getSize());
+ ModTime = Status.getLastModificationTime();
+ UID = Status.getUser();
+ GID = Status.getGroup();
+ Perms = Status.permissions();
} else {
- object::Archive::child_iterator OldMember = I->getOld();
- StringRef Name = I->getName();
+ object::Archive::child_iterator OldMember = I.getOld();
+ ModTime = OldMember->getLastModified();
+ UID = OldMember->getUID();
+ GID = OldMember->getGID();
+ Perms = OldMember->getAccessMode();
+ }
- if (Name.size() < 16)
- printMemberHeader(Out, Name, OldMember->getLastModified(),
- OldMember->getUID(), OldMember->getGID(),
- OldMember->getAccessMode(), OldMember->getSize());
- else
- printMemberHeader(Out, StringMapIndexes[LongNameMemberNum++],
- OldMember->getLastModified(), OldMember->getUID(),
- OldMember->getGID(), OldMember->getAccessMode(),
- OldMember->getSize());
+ if (I.isNewMember()) {
+ StringRef FileName = I.getNew();
+ const sys::fs::file_status &Status = NewMemberStatus[NewMemberNum++];
+ printMemberHeader(Out, Kind, sys::path::filename(FileName),
+ StringMapIndexIter, ModTime, UID, GID, Perms,
+ Status.getSize());
+ } else {
+ object::Archive::child_iterator OldMember = I.getOld();
+ printMemberHeader(Out, Kind, I.getName(), StringMapIndexIter, ModTime,
+ UID, GID, Perms, OldMember->getSize());
}
Out << File.getBuffer();
@@ -327,8 +385,11 @@ llvm::writeArchive(StringRef ArcName,
if (MemberReferenceOffset) {
Out.seek(MemberReferenceOffset);
- for (unsigned MemberNum : MemberOffsetRefs)
- print32BE(Out, MemberOffset[MemberNum]);
+ for (unsigned MemberNum : MemberOffsetRefs) {
+ if (Kind == object::Archive::K_BSD)
+ Out.seek(Out.tell() + 4); // skip over the string offset
+ print32(Out, Kind, MemberOffset[MemberNum]);
+ }
}
Output.keep();
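
For reference, a minimal usage sketch of the updated writeArchive signature above; the header path, the helper name, and the already-populated members vector are illustrative assumptions rather than part of this patch.

  #include "llvm/Object/ArchiveWriter.h"

  // Write a GNU-format archive with a symbol table in deterministic mode:
  // the writer zeroes timestamps and UID/GID and uses 0644 permissions.
  std::error_code rewriteDeterministic(llvm::StringRef OutPath,
                                       std::vector<llvm::NewArchiveIterator> &Members) {
    std::pair<llvm::StringRef, std::error_code> Result =
        llvm::writeArchive(OutPath, Members, /*WriteSymtab=*/true,
                           llvm::object::Archive::K_GNU, /*Deterministic=*/true);
    return Result.second; // Result.first names the file that failed, if any
  }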
diff --git a/lib/Object/COFFObjectFile.cpp b/lib/Object/COFFObjectFile.cpp
index 64bb0d5c636d..bcca9839b475 100644
--- a/lib/Object/COFFObjectFile.cpp
+++ b/lib/Object/COFFObjectFile.cpp
@@ -154,30 +154,24 @@ ErrorOr<StringRef> COFFObjectFile::getSymbolName(DataRefImpl Ref) const {
return Result;
}
-uint64_t COFFObjectFile::getSymbolValue(DataRefImpl Ref) const {
- COFFSymbolRef Sym = getCOFFSymbol(Ref);
-
- if (Sym.isAnyUndefined() || Sym.isCommon())
- return UnknownAddress;
-
- return Sym.getValue();
+uint64_t COFFObjectFile::getSymbolValueImpl(DataRefImpl Ref) const {
+ return getCOFFSymbol(Ref).getValue();
}
-std::error_code COFFObjectFile::getSymbolAddress(DataRefImpl Ref,
- uint64_t &Result) const {
- Result = getSymbolValue(Ref);
+ErrorOr<uint64_t> COFFObjectFile::getSymbolAddress(DataRefImpl Ref) const {
+ uint64_t Result = getSymbolValue(Ref);
COFFSymbolRef Symb = getCOFFSymbol(Ref);
int32_t SectionNumber = Symb.getSectionNumber();
if (Symb.isAnyUndefined() || Symb.isCommon() ||
COFF::isReservedSectionNumber(SectionNumber))
- return std::error_code();
+ return Result;
const coff_section *Section = nullptr;
if (std::error_code EC = getSection(SectionNumber, Section))
return EC;
Result += Section->VirtualAddress;
- return std::error_code();
+ return Result;
}
SymbolRef::Type COFFObjectFile::getSymbolType(DataRefImpl Ref) const {
@@ -362,6 +356,8 @@ getFirstReloc(const coff_section *Sec, MemoryBufferRef M, const uint8_t *Base) {
relocation_iterator COFFObjectFile::section_rel_begin(DataRefImpl Ref) const {
const coff_section *Sec = toSec(Ref);
const coff_relocation *begin = getFirstReloc(Sec, Data, base());
+ if (begin && Sec->VirtualAddress != 0)
+ report_fatal_error("Sections with relocations should have an address of 0");
DataRefImpl Ret;
Ret.p = reinterpret_cast<uintptr_t>(begin);
return relocation_iterator(RelocationRef(Ret, this));
@@ -919,19 +915,15 @@ uint64_t COFFObjectFile::getSectionSize(const coff_section *Sec) const {
// whether or not we have an executable image.
//
// For object files, SizeOfRawData contains the size of section's data;
- // VirtualSize is always zero.
+ // VirtualSize should be zero but isn't due to buggy COFF writers.
//
// For executables, SizeOfRawData *must* be a multiple of FileAlignment; the
// actual section size is in VirtualSize. It is possible for VirtualSize to
// be greater than SizeOfRawData; the contents past that point should be
// considered to be zero.
- uint32_t SectionSize;
- if (Sec->VirtualSize)
- SectionSize = std::min(Sec->VirtualSize, Sec->SizeOfRawData);
- else
- SectionSize = Sec->SizeOfRawData;
-
- return SectionSize;
+ if (getDOSHeader())
+ return std::min(Sec->VirtualSize, Sec->SizeOfRawData);
+ return Sec->SizeOfRawData;
}
std::error_code
@@ -961,10 +953,6 @@ void COFFObjectFile::moveRelocationNext(DataRefImpl &Rel) const {
reinterpret_cast<const coff_relocation*>(Rel.p) + 1);
}
-ErrorOr<uint64_t> COFFObjectFile::getRelocationAddress(DataRefImpl Rel) const {
- report_fatal_error("getRelocationAddress not implemented in COFFObjectFile");
-}
-
uint64_t COFFObjectFile::getRelocationOffset(DataRefImpl Rel) const {
const coff_relocation *R = toRel(Rel);
return R->VirtualAddress;
diff --git a/lib/Object/ELFYAML.cpp b/lib/Object/ELFYAML.cpp
index ecdd468305be..72c232c32870 100644
--- a/lib/Object/ELFYAML.cpp
+++ b/lib/Object/ELFYAML.cpp
@@ -627,6 +627,11 @@ static void sectionMapping(IO &IO, ELFYAML::RawContentSection &Section) {
IO.mapOptional("Size", Section.Size, Hex64(Section.Content.binary_size()));
}
+static void sectionMapping(IO &IO, ELFYAML::NoBitsSection &Section) {
+ commonSectionMapping(IO, Section);
+ IO.mapOptional("Size", Section.Size, Hex64(0));
+}
+
static void sectionMapping(IO &IO, ELFYAML::RelocationSection &Section) {
commonSectionMapping(IO, Section);
IO.mapOptional("Relocations", Section.Relocations);
@@ -682,6 +687,11 @@ void MappingTraits<std::unique_ptr<ELFYAML::Section>>::mapping(
Section.reset(new ELFYAML::Group());
groupSectionMapping(IO, *cast<ELFYAML::Group>(Section.get()));
break;
+ case ELF::SHT_NOBITS:
+ if (!IO.outputting())
+ Section.reset(new ELFYAML::NoBitsSection());
+ sectionMapping(IO, *cast<ELFYAML::NoBitsSection>(Section.get()));
+ break;
case ELF::SHT_MIPS_ABIFLAGS:
if (!IO.outputting())
Section.reset(new ELFYAML::MipsABIFlags());
diff --git a/lib/Object/MachOObjectFile.cpp b/lib/Object/MachOObjectFile.cpp
index 4255ed717fb9..05900630c75c 100644
--- a/lib/Object/MachOObjectFile.cpp
+++ b/lib/Object/MachOObjectFile.cpp
@@ -368,18 +368,12 @@ std::error_code MachOObjectFile::getIndirectName(DataRefImpl Symb,
return std::error_code();
}
-uint64_t MachOObjectFile::getSymbolValue(DataRefImpl Sym) const {
- uint64_t NValue = getNValue(Sym);
- MachO::nlist_base Entry = getSymbolTableEntryBase(this, Sym);
- if ((Entry.n_type & MachO::N_TYPE) == MachO::N_UNDF && NValue == 0)
- return UnknownAddress;
- return NValue;
+uint64_t MachOObjectFile::getSymbolValueImpl(DataRefImpl Sym) const {
+ return getNValue(Sym);
}
-std::error_code MachOObjectFile::getSymbolAddress(DataRefImpl Sym,
- uint64_t &Res) const {
- Res = getSymbolValue(Sym);
- return std::error_code();
+ErrorOr<uint64_t> MachOObjectFile::getSymbolAddress(DataRefImpl Sym) const {
+ return getSymbolValue(Sym);
}
uint32_t MachOObjectFile::getSymbolAlignment(DataRefImpl DRI) const {
@@ -392,9 +386,7 @@ uint32_t MachOObjectFile::getSymbolAlignment(DataRefImpl DRI) const {
}
uint64_t MachOObjectFile::getCommonSymbolSizeImpl(DataRefImpl DRI) const {
- uint64_t Value;
- getSymbolAddress(DRI, Value);
- return Value;
+ return getNValue(DRI);
}
SymbolRef::Type MachOObjectFile::getSymbolType(DataRefImpl Symb) const {
@@ -422,9 +414,6 @@ uint32_t MachOObjectFile::getSymbolFlags(DataRefImpl DRI) const {
uint32_t Result = SymbolRef::SF_None;
- if ((MachOType & MachO::N_TYPE) == MachO::N_UNDF)
- Result |= SymbolRef::SF_Undefined;
-
if ((MachOType & MachO::N_TYPE) == MachO::N_INDR)
Result |= SymbolRef::SF_Indirect;
@@ -434,10 +423,10 @@ uint32_t MachOObjectFile::getSymbolFlags(DataRefImpl DRI) const {
if (MachOType & MachO::N_EXT) {
Result |= SymbolRef::SF_Global;
if ((MachOType & MachO::N_TYPE) == MachO::N_UNDF) {
- uint64_t Value;
- getSymbolAddress(DRI, Value);
- if (Value && Value != UnknownAddress)
+ if (getNValue(DRI))
Result |= SymbolRef::SF_Common;
+ else
+ Result |= SymbolRef::SF_Undefined;
}
if (!(MachOType & MachO::N_PEXT))
@@ -593,15 +582,6 @@ void MachOObjectFile::moveRelocationNext(DataRefImpl &Rel) const {
++Rel.d.b;
}
-ErrorOr<uint64_t> MachOObjectFile::getRelocationAddress(DataRefImpl Rel) const {
- uint64_t Offset = getRelocationOffset(Rel);
-
- DataRefImpl Sec;
- Sec.d.a = Rel.d.a;
- uint64_t SecAddress = getSectionAddress(Sec);
- return SecAddress + Offset;
-}
-
uint64_t MachOObjectFile::getRelocationOffset(DataRefImpl Rel) const {
assert(getHeader().filetype == MachO::MH_OBJECT &&
"Only implemented for MH_OBJECT");
@@ -932,6 +912,13 @@ std::error_code MachOObjectFile::getLibraryShortNameByIndex(unsigned Index,
return std::error_code();
}
+section_iterator
+MachOObjectFile::getRelocationRelocatedSection(relocation_iterator Rel) const {
+ DataRefImpl Sec;
+ Sec.d.a = Rel->getRawDataRefImpl().d.a;
+ return section_iterator(SectionRef(Sec, this));
+}
+
basic_symbol_iterator MachOObjectFile::symbol_begin_impl() const {
return getSymbolByIndex(0);
}
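
A hedged usage sketch of the accessor added just above; Obj and RelIt stand for an existing MachOObjectFile and a valid relocation_iterator, which are not shown in this patch.

  // With getRelocationAddress() removed, callers recover the section a
  // Mach-O relocation belongs to directly, then combine it with the offset:
  llvm::object::section_iterator Sec = Obj.getRelocationRelocatedSection(RelIt);
  uint64_t Offset = RelIt->getOffset(); // offset within that section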
diff --git a/lib/Object/Object.cpp b/lib/Object/Object.cpp
index 945252b21046..5c4b7a67b2ad 100644
--- a/lib/Object/Object.cpp
+++ b/lib/Object/Object.cpp
@@ -180,10 +180,10 @@ const char *LLVMGetSymbolName(LLVMSymbolIteratorRef SI) {
}
uint64_t LLVMGetSymbolAddress(LLVMSymbolIteratorRef SI) {
- uint64_t ret;
- if (std::error_code ec = (*unwrap(SI))->getAddress(ret))
- report_fatal_error(ec.message());
- return ret;
+ ErrorOr<uint64_t> Ret = (*unwrap(SI))->getAddress();
+ if (std::error_code EC = Ret.getError())
+ report_fatal_error(EC.message());
+ return *Ret;
}
uint64_t LLVMGetSymbolSize(LLVMSymbolIteratorRef SI) {
@@ -191,13 +191,6 @@ uint64_t LLVMGetSymbolSize(LLVMSymbolIteratorRef SI) {
}
// RelocationRef accessors
-uint64_t LLVMGetRelocationAddress(LLVMRelocationIteratorRef RI) {
- ErrorOr<uint64_t> Ret = (*unwrap(RI))->getAddress();
- if (std::error_code EC = Ret.getError())
- report_fatal_error(EC.message());
- return *Ret;
-}
-
uint64_t LLVMGetRelocationOffset(LLVMRelocationIteratorRef RI) {
return (*unwrap(RI))->getOffset();
}
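
A short illustrative note on the C API after this removal; the iterator variables are placeholders, not part of the patch.

  // LLVMGetRelocationAddress is gone; clients keep the remaining accessors:
  uint64_t SymAddr = LLVMGetSymbolAddress(SymIter);    // aborts on error
  uint64_t RelOff  = LLVMGetRelocationOffset(RelIter); // section-relative offset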
diff --git a/lib/Object/ObjectFile.cpp b/lib/Object/ObjectFile.cpp
index 04e4916f94ef..f82edae89bc6 100644
--- a/lib/Object/ObjectFile.cpp
+++ b/lib/Object/ObjectFile.cpp
@@ -35,6 +35,15 @@ bool SectionRef::containsSymbol(SymbolRef S) const {
return *this == *SymSec;
}
+uint64_t ObjectFile::getSymbolValue(DataRefImpl Ref) const {
+ uint32_t Flags = getSymbolFlags(Ref);
+ if (Flags & SymbolRef::SF_Undefined)
+ return 0;
+ if (Flags & SymbolRef::SF_Common)
+ return getCommonSymbolSize(Ref);
+ return getSymbolValueImpl(Ref);
+}
+
std::error_code ObjectFile::printSymbolName(raw_ostream &OS,
DataRefImpl Symb) const {
ErrorOr<StringRef> Name = getSymbolName(Symb);
diff --git a/lib/Support/APFloat.cpp b/lib/Support/APFloat.cpp
index 4b0a0e5d4819..5d31225396d4 100644
--- a/lib/Support/APFloat.cpp
+++ b/lib/Support/APFloat.cpp
@@ -52,14 +52,17 @@ namespace llvm {
/* Number of bits in the significand. This includes the integer
bit. */
unsigned int precision;
+
+ /* Number of bits actually used in the semantics. */
+ unsigned int sizeInBits;
};
- const fltSemantics APFloat::IEEEhalf = { 15, -14, 11 };
- const fltSemantics APFloat::IEEEsingle = { 127, -126, 24 };
- const fltSemantics APFloat::IEEEdouble = { 1023, -1022, 53 };
- const fltSemantics APFloat::IEEEquad = { 16383, -16382, 113 };
- const fltSemantics APFloat::x87DoubleExtended = { 16383, -16382, 64 };
- const fltSemantics APFloat::Bogus = { 0, 0, 0 };
+ const fltSemantics APFloat::IEEEhalf = { 15, -14, 11, 16 };
+ const fltSemantics APFloat::IEEEsingle = { 127, -126, 24, 32 };
+ const fltSemantics APFloat::IEEEdouble = { 1023, -1022, 53, 64 };
+ const fltSemantics APFloat::IEEEquad = { 16383, -16382, 113, 128 };
+ const fltSemantics APFloat::x87DoubleExtended = { 16383, -16382, 64, 80 };
+ const fltSemantics APFloat::Bogus = { 0, 0, 0, 0 };
/* The PowerPC format consists of two doubles. It does not map cleanly
onto the usual format above. It is approximated using twice the
@@ -72,7 +75,7 @@ namespace llvm {
to represent all possible values held by a PPC double-double number,
for example: (long double) 1.0 + (long double) 0x1p-106
Should this be replaced by a full emulation of PPC double-double? */
- const fltSemantics APFloat::PPCDoubleDouble = { 1023, -1022 + 53, 53 + 53 };
+ const fltSemantics APFloat::PPCDoubleDouble = { 1023, -1022 + 53, 53 + 53, 128 };
/* A tight upper bound on number of parts required to hold the value
pow(5, power) is
@@ -2416,7 +2419,7 @@ APFloat::roundSignificandWithExponent(const integerPart *decSigParts,
roundingMode rounding_mode)
{
unsigned int parts, pow5PartCount;
- fltSemantics calcSemantics = { 32767, -32767, 0 };
+ fltSemantics calcSemantics = { 32767, -32767, 0, 0 };
integerPart pow5Parts[maxPowerOfFiveParts];
bool isNearest;
@@ -3368,6 +3371,10 @@ APFloat::getAllOnesValue(unsigned BitWidth, bool isIEEE)
}
}
+unsigned APFloat::getSizeInBits(const fltSemantics &Sem) {
+ return Sem.sizeInBits;
+}
+
/// Make this number the largest magnitude normal number in the given
/// semantics.
void APFloat::makeLargest(bool Negative) {
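
An illustrative check of the new accessor, using the sizeInBits values from the table above; a sketch only, not part of the patch.

  // Each fltSemantics now records its storage width in bits:
  assert(llvm::APFloat::getSizeInBits(llvm::APFloat::IEEEsingle) == 32);
  assert(llvm::APFloat::getSizeInBits(llvm::APFloat::x87DoubleExtended) == 80);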
diff --git a/lib/Support/CommandLine.cpp b/lib/Support/CommandLine.cpp
index dcaacf6248d1..17fba95ebb2b 100644
--- a/lib/Support/CommandLine.cpp
+++ b/lib/Support/CommandLine.cpp
@@ -46,21 +46,21 @@ using namespace cl;
//
namespace llvm {
namespace cl {
-TEMPLATE_INSTANTIATION(class basic_parser<bool>);
-TEMPLATE_INSTANTIATION(class basic_parser<boolOrDefault>);
-TEMPLATE_INSTANTIATION(class basic_parser<int>);
-TEMPLATE_INSTANTIATION(class basic_parser<unsigned>);
-TEMPLATE_INSTANTIATION(class basic_parser<unsigned long long>);
-TEMPLATE_INSTANTIATION(class basic_parser<double>);
-TEMPLATE_INSTANTIATION(class basic_parser<float>);
-TEMPLATE_INSTANTIATION(class basic_parser<std::string>);
-TEMPLATE_INSTANTIATION(class basic_parser<char>);
-
-TEMPLATE_INSTANTIATION(class opt<unsigned>);
-TEMPLATE_INSTANTIATION(class opt<int>);
-TEMPLATE_INSTANTIATION(class opt<std::string>);
-TEMPLATE_INSTANTIATION(class opt<char>);
-TEMPLATE_INSTANTIATION(class opt<bool>);
+template class basic_parser<bool>;
+template class basic_parser<boolOrDefault>;
+template class basic_parser<int>;
+template class basic_parser<unsigned>;
+template class basic_parser<unsigned long long>;
+template class basic_parser<double>;
+template class basic_parser<float>;
+template class basic_parser<std::string>;
+template class basic_parser<char>;
+
+template class opt<unsigned>;
+template class opt<int>;
+template class opt<std::string>;
+template class opt<char>;
+template class opt<bool>;
}
} // end namespace llvm::cl
diff --git a/lib/Support/Triple.cpp b/lib/Support/Triple.cpp
index 92be0e047f62..c6646fb101b7 100644
--- a/lib/Support/Triple.cpp
+++ b/lib/Support/Triple.cpp
@@ -1165,6 +1165,122 @@ Triple Triple::get64BitArchVariant() const {
return T;
}
+Triple Triple::getBigEndianArchVariant() const {
+ Triple T(*this);
+ switch (getArch()) {
+ case Triple::UnknownArch:
+ case Triple::amdgcn:
+ case Triple::amdil64:
+ case Triple::amdil:
+ case Triple::hexagon:
+ case Triple::hsail64:
+ case Triple::hsail:
+ case Triple::kalimba:
+ case Triple::le32:
+ case Triple::le64:
+ case Triple::msp430:
+ case Triple::nvptx64:
+ case Triple::nvptx:
+ case Triple::r600:
+ case Triple::shave:
+ case Triple::spir64:
+ case Triple::spir:
+ case Triple::wasm32:
+ case Triple::wasm64:
+ case Triple::x86:
+ case Triple::x86_64:
+ case Triple::xcore:
+
+ // ARM is intentionally unsupported here; changing the architecture would
+ // drop any arch suffixes.
+ case Triple::arm:
+ case Triple::thumb:
+ T.setArch(UnknownArch);
+ break;
+
+ case Triple::aarch64_be:
+ case Triple::armeb:
+ case Triple::bpfeb:
+ case Triple::mips64:
+ case Triple::mips:
+ case Triple::ppc64:
+ case Triple::ppc:
+ case Triple::sparc:
+ case Triple::sparcv9:
+ case Triple::systemz:
+ case Triple::tce:
+ case Triple::thumbeb:
+ // Already big endian.
+ break;
+
+ case Triple::aarch64: T.setArch(Triple::aarch64_be); break;
+ case Triple::bpfel: T.setArch(Triple::bpfeb); break;
+ case Triple::mips64el:T.setArch(Triple::mips64); break;
+ case Triple::mipsel: T.setArch(Triple::mips); break;
+ case Triple::ppc64le: T.setArch(Triple::ppc64); break;
+ case Triple::sparcel: T.setArch(Triple::sparc); break;
+ }
+ return T;
+}
+
+Triple Triple::getLittleEndianArchVariant() const {
+ Triple T(*this);
+ switch (getArch()) {
+ case Triple::UnknownArch:
+ case Triple::ppc:
+ case Triple::sparcv9:
+ case Triple::systemz:
+ case Triple::tce:
+
+ // ARM is intentionally unsupported here; changing the architecture would
+ // drop any arch suffixes.
+ case Triple::armeb:
+ case Triple::thumbeb:
+ T.setArch(UnknownArch);
+ break;
+
+ case Triple::aarch64:
+ case Triple::amdgcn:
+ case Triple::amdil64:
+ case Triple::amdil:
+ case Triple::arm:
+ case Triple::bpfel:
+ case Triple::hexagon:
+ case Triple::hsail64:
+ case Triple::hsail:
+ case Triple::kalimba:
+ case Triple::le32:
+ case Triple::le64:
+ case Triple::mips64el:
+ case Triple::mipsel:
+ case Triple::msp430:
+ case Triple::nvptx64:
+ case Triple::nvptx:
+ case Triple::ppc64le:
+ case Triple::r600:
+ case Triple::shave:
+ case Triple::sparcel:
+ case Triple::spir64:
+ case Triple::spir:
+ case Triple::thumb:
+ case Triple::wasm32:
+ case Triple::wasm64:
+ case Triple::x86:
+ case Triple::x86_64:
+ case Triple::xcore:
+ // Already little endian.
+ break;
+
+ case Triple::aarch64_be: T.setArch(Triple::aarch64); break;
+ case Triple::bpfeb: T.setArch(Triple::bpfel); break;
+ case Triple::mips64: T.setArch(Triple::mips64el); break;
+ case Triple::mips: T.setArch(Triple::mipsel); break;
+ case Triple::ppc64: T.setArch(Triple::ppc64le); break;
+ case Triple::sparc: T.setArch(Triple::sparcel); break;
+ }
+ return T;
+}
+
const char *Triple::getARMCPUForArch(StringRef MArch) const {
if (MArch.empty())
MArch = getArchName();
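
A brief sketch of the two new helpers, using values taken from the switch tables above; the concrete triples are illustrative assumptions.

  // The helpers flip the architecture where an opposite-endian counterpart
  // exists and return UnknownArch where the swap is unsupported (e.g. ARM,
  // since changing the architecture would drop arch suffixes).
  llvm::Triple LE("mipsel-unknown-linux-gnu");
  llvm::Triple BE = LE.getBigEndianArchVariant();    // arch becomes Triple::mips
  llvm::Triple RT = BE.getLittleEndianArchVariant(); // back to Triple::mipsel
  llvm::Triple AB =
      llvm::Triple("armeb-none-eabi").getLittleEndianArchVariant();
  // AB.getArch() == llvm::Triple::UnknownArch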
diff --git a/lib/TableGen/Record.cpp b/lib/TableGen/Record.cpp
index 6e982bf1da19..c9a31b64cfd3 100644
--- a/lib/TableGen/Record.cpp
+++ b/lib/TableGen/Record.cpp
@@ -1648,7 +1648,7 @@ raw_ostream &llvm::operator<<(raw_ostream &OS, const Record &R) {
}
OS << " {";
- const std::vector<Record*> &SC = R.getSuperClasses();
+ ArrayRef<Record *> SC = R.getSuperClasses();
if (!SC.empty()) {
OS << "\t//";
for (const Record *Super : SC)
diff --git a/lib/TableGen/SetTheory.cpp b/lib/TableGen/SetTheory.cpp
index 92f5b2dd7172..07c538159dcb 100644
--- a/lib/TableGen/SetTheory.cpp
+++ b/lib/TableGen/SetTheory.cpp
@@ -302,7 +302,7 @@ const RecVec *SetTheory::expand(Record *Set) {
return &I->second;
// This is the first time we see Set. Find a suitable expander.
- const std::vector<Record*> &SC = Set->getSuperClasses();
+ ArrayRef<Record *> SC = Set->getSuperClasses();
for (unsigned i = 0, e = SC.size(); i != e; ++i) {
// Skip unnamed superclasses.
if (!dyn_cast<StringInit>(SC[i]->getNameInit()))
diff --git a/lib/TableGen/TGParser.cpp b/lib/TableGen/TGParser.cpp
index 15df25aea50e..5c36fda2e1ca 100644
--- a/lib/TableGen/TGParser.cpp
+++ b/lib/TableGen/TGParser.cpp
@@ -184,7 +184,7 @@ bool TGParser::AddSubClass(Record *CurRec, SubClassReference &SubClass) {
// Since everything went well, we can now set the "superclass" list for the
// current record.
- const std::vector<Record*> &SCs = SC->getSuperClasses();
+ ArrayRef<Record *> SCs = SC->getSuperClasses();
ArrayRef<SMRange> SCRanges = SC->getSuperClassRanges();
for (unsigned i = 0, e = SCs.size(); i != e; ++i) {
if (CurRec->isSubClassOf(SCs[i]))
diff --git a/lib/Target/AArch64/AArch64A57FPLoadBalancing.cpp b/lib/Target/AArch64/AArch64A57FPLoadBalancing.cpp
index bffd9e6e8c76..79a84ad8c6c5 100644
--- a/lib/Target/AArch64/AArch64A57FPLoadBalancing.cpp
+++ b/lib/Target/AArch64/AArch64A57FPLoadBalancing.cpp
@@ -510,9 +510,17 @@ int AArch64A57FPLoadBalancing::scavengeRegister(Chain *G, Color C,
if (J.isRegMask())
AvailableRegs.clearBitsNotInMask(J.getRegMask());
- if (J.isReg() && J.isDef() && AvailableRegs[J.getReg()]) {
- assert(J.isDead() && "Non-dead def should have been removed by now!");
- AvailableRegs.reset(J.getReg());
+ if (J.isReg() && J.isDef()) {
+ MCRegAliasIterator AI(J.getReg(), TRI, /*IncludeSelf=*/true);
+ if (J.isDead())
+ for (; AI.isValid(); ++AI)
+ AvailableRegs.reset(*AI);
+#ifndef NDEBUG
+ else
+ for (; AI.isValid(); ++AI)
+ assert(!AvailableRegs[*AI] &&
+ "Non-dead def should have been removed by now!");
+#endif
}
}
}
@@ -585,7 +593,6 @@ bool AArch64A57FPLoadBalancing::colorChain(Chain *G, Color C,
if (Change) {
Substs[MO.getReg()] = Reg;
MO.setReg(Reg);
- MRI->setPhysRegUsed(Reg);
Changed = true;
}
diff --git a/lib/Target/AArch64/AArch64CallingConvention.td b/lib/Target/AArch64/AArch64CallingConvention.td
index 4691e949838d..815ebef177d8 100644
--- a/lib/Target/AArch64/AArch64CallingConvention.td
+++ b/lib/Target/AArch64/AArch64CallingConvention.td
@@ -40,6 +40,11 @@ def CC_AArch64_AAPCS : CallingConv<[
// slot is 64-bit.
CCIfByVal<CCPassByVal<8, 8>>,
+ // The 'nest' parameter, if any, is passed in X18.
+ // Darwin uses X18 as the platform register and hence 'nest' isn't currently
+ // supported there.
+ CCIfNest<CCAssignToReg<[X18]>>,
+
CCIfConsecutiveRegs<CCCustom<"CC_AArch64_Custom_Block">>,
// Handle i1, i8, i16, i32, i64, f32, f64 and v2f64 by passing in registers,
diff --git a/lib/Target/AArch64/AArch64FastISel.cpp b/lib/Target/AArch64/AArch64FastISel.cpp
index c19fcdc4bb18..072819836bb3 100644
--- a/lib/Target/AArch64/AArch64FastISel.cpp
+++ b/lib/Target/AArch64/AArch64FastISel.cpp
@@ -310,7 +310,7 @@ CCAssignFn *AArch64FastISel::CCAssignFnForCall(CallingConv::ID CC) const {
}
unsigned AArch64FastISel::fastMaterializeAlloca(const AllocaInst *AI) {
- assert(TLI.getValueType(AI->getType(), true) == MVT::i64 &&
+ assert(TLI.getValueType(DL, AI->getType(), true) == MVT::i64 &&
"Alloca should always return a pointer.");
// Don't handle dynamic allocas.
@@ -420,7 +420,7 @@ unsigned AArch64FastISel::materializeGV(const GlobalValue *GV) {
unsigned char OpFlags = Subtarget->ClassifyGlobalReference(GV, TM);
- EVT DestEVT = TLI.getValueType(GV->getType(), true);
+ EVT DestEVT = TLI.getValueType(DL, GV->getType(), true);
if (!DestEVT.isSimple())
return 0;
@@ -459,7 +459,7 @@ unsigned AArch64FastISel::materializeGV(const GlobalValue *GV) {
}
unsigned AArch64FastISel::fastMaterializeConstant(const Constant *C) {
- EVT CEVT = TLI.getValueType(C->getType(), true);
+ EVT CEVT = TLI.getValueType(DL, C->getType(), true);
// Only handle simple types.
if (!CEVT.isSimple())
@@ -538,13 +538,14 @@ bool AArch64FastISel::computeAddress(const Value *Obj, Address &Addr, Type *Ty)
}
case Instruction::IntToPtr: {
// Look past no-op inttoptrs.
- if (TLI.getValueType(U->getOperand(0)->getType()) == TLI.getPointerTy())
+ if (TLI.getValueType(DL, U->getOperand(0)->getType()) ==
+ TLI.getPointerTy(DL))
return computeAddress(U->getOperand(0), Addr, Ty);
break;
}
case Instruction::PtrToInt: {
// Look past no-op ptrtoints.
- if (TLI.getValueType(U->getType()) == TLI.getPointerTy())
+ if (TLI.getValueType(DL, U->getType()) == TLI.getPointerTy(DL))
return computeAddress(U->getOperand(0), Addr, Ty);
break;
}
@@ -879,13 +880,13 @@ bool AArch64FastISel::computeCallAddress(const Value *V, Address &Addr) {
case Instruction::IntToPtr:
// Look past no-op inttoptrs if its operand is in the same BB.
if (InMBB &&
- TLI.getValueType(U->getOperand(0)->getType()) == TLI.getPointerTy())
+ TLI.getValueType(DL, U->getOperand(0)->getType()) ==
+ TLI.getPointerTy(DL))
return computeCallAddress(U->getOperand(0), Addr);
break;
case Instruction::PtrToInt:
// Look past no-op ptrtoints if its operand is in the same BB.
- if (InMBB &&
- TLI.getValueType(U->getType()) == TLI.getPointerTy())
+ if (InMBB && TLI.getValueType(DL, U->getType()) == TLI.getPointerTy(DL))
return computeCallAddress(U->getOperand(0), Addr);
break;
}
@@ -906,7 +907,7 @@ bool AArch64FastISel::computeCallAddress(const Value *V, Address &Addr) {
bool AArch64FastISel::isTypeLegal(Type *Ty, MVT &VT) {
- EVT evt = TLI.getValueType(Ty, true);
+ EVT evt = TLI.getValueType(DL, Ty, true);
// Only handle simple types.
if (evt == MVT::Other || !evt.isSimple())
@@ -1390,7 +1391,7 @@ unsigned AArch64FastISel::emitAddSub_rx(bool UseAdd, MVT RetVT, unsigned LHSReg,
bool AArch64FastISel::emitCmp(const Value *LHS, const Value *RHS, bool IsZExt) {
Type *Ty = LHS->getType();
- EVT EVT = TLI.getValueType(Ty, true);
+ EVT EVT = TLI.getValueType(DL, Ty, true);
if (!EVT.isSimple())
return false;
MVT VT = EVT.getSimpleVT();
@@ -2761,7 +2762,7 @@ bool AArch64FastISel::selectFPToInt(const Instruction *I, bool Signed) {
if (SrcReg == 0)
return false;
- EVT SrcVT = TLI.getValueType(I->getOperand(0)->getType(), true);
+ EVT SrcVT = TLI.getValueType(DL, I->getOperand(0)->getType(), true);
if (SrcVT == MVT::f128)
return false;
@@ -2797,7 +2798,7 @@ bool AArch64FastISel::selectIntToFP(const Instruction *I, bool Signed) {
return false;
bool SrcIsKill = hasTrivialKill(I->getOperand(0));
- EVT SrcVT = TLI.getValueType(I->getOperand(0)->getType(), true);
+ EVT SrcVT = TLI.getValueType(DL, I->getOperand(0)->getType(), true);
// Handle sign-extension.
if (SrcVT == MVT::i16 || SrcVT == MVT::i8 || SrcVT == MVT::i1) {
@@ -2856,7 +2857,7 @@ bool AArch64FastISel::fastLowerArguments() {
if (ArgTy->isStructTy() || ArgTy->isArrayTy())
return false;
- EVT ArgVT = TLI.getValueType(ArgTy);
+ EVT ArgVT = TLI.getValueType(DL, ArgTy);
if (!ArgVT.isSimple())
return false;
@@ -2898,7 +2899,7 @@ bool AArch64FastISel::fastLowerArguments() {
unsigned GPRIdx = 0;
unsigned FPRIdx = 0;
for (auto const &Arg : F->args()) {
- MVT VT = TLI.getSimpleValueType(Arg.getType());
+ MVT VT = TLI.getSimpleValueType(DL, Arg.getType());
unsigned SrcReg;
const TargetRegisterClass *RC;
if (VT >= MVT::i1 && VT <= MVT::i32) {
@@ -3689,7 +3690,7 @@ bool AArch64FastISel::selectRet(const Instruction *I) {
if (Ret->getNumOperands() > 0) {
CallingConv::ID CC = F.getCallingConv();
SmallVector<ISD::OutputArg, 4> Outs;
- GetReturnInfo(F.getReturnType(), F.getAttributes(), Outs, TLI);
+ GetReturnInfo(F.getReturnType(), F.getAttributes(), Outs, TLI, DL);
// Analyze operands of the call, assigning locations to each operand.
SmallVector<CCValAssign, 16> ValLocs;
@@ -3724,7 +3725,7 @@ bool AArch64FastISel::selectRet(const Instruction *I) {
if (!MRI.getRegClass(SrcReg)->contains(DestReg))
return false;
- EVT RVEVT = TLI.getValueType(RV->getType());
+ EVT RVEVT = TLI.getValueType(DL, RV->getType());
if (!RVEVT.isSimple())
return false;
@@ -3772,8 +3773,8 @@ bool AArch64FastISel::selectTrunc(const Instruction *I) {
Value *Op = I->getOperand(0);
Type *SrcTy = Op->getType();
- EVT SrcEVT = TLI.getValueType(SrcTy, true);
- EVT DestEVT = TLI.getValueType(DestTy, true);
+ EVT SrcEVT = TLI.getValueType(DL, SrcTy, true);
+ EVT DestEVT = TLI.getValueType(DL, DestTy, true);
if (!SrcEVT.isSimple())
return false;
if (!DestEVT.isSimple())
@@ -4459,7 +4460,7 @@ bool AArch64FastISel::selectIntExt(const Instruction *I) {
}
bool AArch64FastISel::selectRem(const Instruction *I, unsigned ISDOpcode) {
- EVT DestEVT = TLI.getValueType(I->getType(), true);
+ EVT DestEVT = TLI.getValueType(DL, I->getType(), true);
if (!DestEVT.isSimple())
return false;
@@ -4825,7 +4826,7 @@ std::pair<unsigned, bool> AArch64FastISel::getRegForGEPIndex(const Value *Idx) {
bool IdxNIsKill = hasTrivialKill(Idx);
// If the index is smaller or larger than intptr_t, truncate or extend it.
- MVT PtrVT = TLI.getPointerTy();
+ MVT PtrVT = TLI.getPointerTy(DL);
EVT IdxVT = EVT::getEVT(Idx->getType(), /*HandleUnknown=*/false);
if (IdxVT.bitsLT(PtrVT)) {
IdxN = emitIntExt(IdxVT.getSimpleVT(), IdxN, PtrVT, /*IsZExt=*/false);
@@ -4849,7 +4850,7 @@ bool AArch64FastISel::selectGetElementPtr(const Instruction *I) {
// into a single N = N + TotalOffset.
uint64_t TotalOffs = 0;
Type *Ty = I->getOperand(0)->getType();
- MVT VT = TLI.getPointerTy();
+ MVT VT = TLI.getPointerTy(DL);
for (auto OI = std::next(I->op_begin()), E = I->op_end(); OI != E; ++OI) {
const Value *Idx = *OI;
if (auto *StTy = dyn_cast<StructType>(Ty)) {
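
A minimal sketch of the query pattern this FastISel change migrates to, assuming a TargetLowering reference TLI, a Function F, and an Instruction *I are in scope; the variable names are placeholders.

  // TargetLowering type queries now take the module's DataLayout explicitly:
  const llvm::DataLayout &DL = F.getParent()->getDataLayout();
  llvm::EVT VT = TLI.getValueType(DL, I->getType(), /*AllowUnknown=*/true);
  llvm::MVT PtrVT = TLI.getPointerTy(DL);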
diff --git a/lib/Target/AArch64/AArch64FrameLowering.cpp b/lib/Target/AArch64/AArch64FrameLowering.cpp
index 3ba7e70a102d..a7817f4f67dd 100644
--- a/lib/Target/AArch64/AArch64FrameLowering.cpp
+++ b/lib/Target/AArch64/AArch64FrameLowering.cpp
@@ -349,12 +349,10 @@ void AArch64FrameLowering::emitPrologue(MachineFunction &MF,
// Allocate space for the rest of the frame.
const unsigned Alignment = MFI->getMaxAlignment();
- const bool NeedsRealignment = (Alignment > 16);
+ const bool NeedsRealignment = RegInfo->needsStackRealignment(MF);
unsigned scratchSPReg = AArch64::SP;
- if (NeedsRealignment) {
- // Use the first callee-saved register as a scratch register
- assert(MF.getRegInfo().isPhysRegUsed(AArch64::X9) &&
- "No scratch register to align SP!");
+ if (NumBytes && NeedsRealignment) {
+ // Use the first callee-saved register as a scratch register.
scratchSPReg = AArch64::X9;
}
@@ -366,9 +364,6 @@ void AArch64FrameLowering::emitPrologue(MachineFunction &MF,
emitFrameOffset(MBB, MBBI, DL, scratchSPReg, AArch64::SP, -NumBytes, TII,
MachineInstr::FrameSetup);
- assert(!(NeedsRealignment && NumBytes==0) &&
- "NumBytes should never be 0 when realignment is needed");
-
if (NumBytes && NeedsRealignment) {
const unsigned NrBitsToZero = countTrailingZeros(Alignment);
assert(NrBitsToZero > 1);
@@ -881,28 +876,34 @@ bool AArch64FrameLowering::restoreCalleeSavedRegisters(
return true;
}
-void AArch64FrameLowering::processFunctionBeforeCalleeSavedScan(
- MachineFunction &MF, RegScavenger *RS) const {
+void AArch64FrameLowering::determineCalleeSaves(MachineFunction &MF,
+ BitVector &SavedRegs,
+ RegScavenger *RS) const {
+ // All calls are tail calls in GHC calling conv, and functions have no
+ // prologue/epilogue.
+ if (MF.getFunction()->getCallingConv() == CallingConv::GHC)
+ return;
+
+ TargetFrameLowering::determineCalleeSaves(MF, SavedRegs, RS);
const AArch64RegisterInfo *RegInfo = static_cast<const AArch64RegisterInfo *>(
MF.getSubtarget().getRegisterInfo());
AArch64FunctionInfo *AFI = MF.getInfo<AArch64FunctionInfo>();
- MachineRegisterInfo *MRI = &MF.getRegInfo();
SmallVector<unsigned, 4> UnspilledCSGPRs;
SmallVector<unsigned, 4> UnspilledCSFPRs;
// The frame record needs to be created by saving the appropriate registers
if (hasFP(MF)) {
- MRI->setPhysRegUsed(AArch64::FP);
- MRI->setPhysRegUsed(AArch64::LR);
+ SavedRegs.set(AArch64::FP);
+ SavedRegs.set(AArch64::LR);
}
// Spill the BasePtr if it's used. Do this first thing so that the
// getCalleeSavedRegs() below will get the right answer.
if (RegInfo->hasBasePointer(MF))
- MRI->setPhysRegUsed(RegInfo->getBaseRegister());
+ SavedRegs.set(RegInfo->getBaseRegister());
if (RegInfo->needsStackRealignment(MF) && !RegInfo->hasBasePointer(MF))
- MRI->setPhysRegUsed(AArch64::X9);
+ SavedRegs.set(AArch64::X9);
// If any callee-saved registers are used, the frame cannot be eliminated.
unsigned NumGPRSpilled = 0;
@@ -924,8 +925,8 @@ void AArch64FrameLowering::processFunctionBeforeCalleeSavedScan(
AArch64::FPR64RegClass.contains(EvenReg)) &&
"Register class mismatch!");
- const bool OddRegUsed = MRI->isPhysRegUsed(OddReg);
- const bool EvenRegUsed = MRI->isPhysRegUsed(EvenReg);
+ const bool OddRegUsed = SavedRegs.test(OddReg);
+ const bool EvenRegUsed = SavedRegs.test(EvenReg);
// Early exit if none of the registers in the register pair is actually
// used.
@@ -946,7 +947,7 @@ void AArch64FrameLowering::processFunctionBeforeCalleeSavedScan(
if (OddRegUsed ^ EvenRegUsed) {
// Find out which register is the additional spill.
Reg = OddRegUsed ? EvenReg : OddReg;
- MRI->setPhysRegUsed(Reg);
+ SavedRegs.set(Reg);
}
DEBUG(dbgs() << ' ' << PrintReg(OddReg, RegInfo));
@@ -1001,7 +1002,7 @@ void AArch64FrameLowering::processFunctionBeforeCalleeSavedScan(
UnspilledCSGPRs.pop_back();
DEBUG(dbgs() << "Spilling " << PrintReg(Reg, RegInfo)
<< " to get a scratch register.\n");
- MRI->setPhysRegUsed(Reg);
+ SavedRegs.set(Reg);
ExtraCSSpill = true;
++Count;
}
diff --git a/lib/Target/AArch64/AArch64FrameLowering.h b/lib/Target/AArch64/AArch64FrameLowering.h
index b496fccba349..731f031ff855 100644
--- a/lib/Target/AArch64/AArch64FrameLowering.h
+++ b/lib/Target/AArch64/AArch64FrameLowering.h
@@ -59,8 +59,8 @@ public:
bool hasFP(const MachineFunction &MF) const override;
bool hasReservedCallFrame(const MachineFunction &MF) const override;
- void processFunctionBeforeCalleeSavedScan(MachineFunction &MF,
- RegScavenger *RS) const override;
+ void determineCalleeSaves(MachineFunction &MF, BitVector &SavedRegs,
+ RegScavenger *RS) const override;
};
} // End llvm namespace
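
A hedged sketch of how a target would override the new hook declared above; MyTarget and its FP register name are hypothetical placeholders.

  // Callee-saved registers are now recorded in a BitVector instead of marking
  // physical-register usage on MachineRegisterInfo:
  void MyTargetFrameLowering::determineCalleeSaves(llvm::MachineFunction &MF,
                                                   llvm::BitVector &SavedRegs,
                                                   llvm::RegScavenger *RS) const {
    llvm::TargetFrameLowering::determineCalleeSaves(MF, SavedRegs, RS);
    if (hasFP(MF))
      SavedRegs.set(MyTarget::FP); // hypothetical frame-pointer register
  }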
diff --git a/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp b/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp
index 1ea4abcf05fa..772e894f4f0a 100644
--- a/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp
+++ b/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp
@@ -610,10 +610,11 @@ static bool isWorthFoldingADDlow(SDValue N) {
bool AArch64DAGToDAGISel::SelectAddrModeIndexed(SDValue N, unsigned Size,
SDValue &Base, SDValue &OffImm) {
SDLoc dl(N);
+ const DataLayout &DL = CurDAG->getDataLayout();
const TargetLowering *TLI = getTargetLowering();
if (N.getOpcode() == ISD::FrameIndex) {
int FI = cast<FrameIndexSDNode>(N)->getIndex();
- Base = CurDAG->getTargetFrameIndex(FI, TLI->getPointerTy());
+ Base = CurDAG->getTargetFrameIndex(FI, TLI->getPointerTy(DL));
OffImm = CurDAG->getTargetConstant(0, dl, MVT::i64);
return true;
}
@@ -628,10 +629,9 @@ bool AArch64DAGToDAGISel::SelectAddrModeIndexed(SDValue N, unsigned Size,
const GlobalValue *GV = GAN->getGlobal();
unsigned Alignment = GV->getAlignment();
- const DataLayout *DL = TLI->getDataLayout();
Type *Ty = GV->getType()->getElementType();
if (Alignment == 0 && Ty->isSized())
- Alignment = DL->getABITypeAlignment(Ty);
+ Alignment = DL.getABITypeAlignment(Ty);
if (Alignment >= Size)
return true;
@@ -645,7 +645,7 @@ bool AArch64DAGToDAGISel::SelectAddrModeIndexed(SDValue N, unsigned Size,
Base = N.getOperand(0);
if (Base.getOpcode() == ISD::FrameIndex) {
int FI = cast<FrameIndexSDNode>(Base)->getIndex();
- Base = CurDAG->getTargetFrameIndex(FI, TLI->getPointerTy());
+ Base = CurDAG->getTargetFrameIndex(FI, TLI->getPointerTy(DL));
}
OffImm = CurDAG->getTargetConstant(RHSC >> Scale, dl, MVT::i64);
return true;
@@ -688,7 +688,8 @@ bool AArch64DAGToDAGISel::SelectAddrModeUnscaled(SDValue N, unsigned Size,
if (Base.getOpcode() == ISD::FrameIndex) {
int FI = cast<FrameIndexSDNode>(Base)->getIndex();
const TargetLowering *TLI = getTargetLowering();
- Base = CurDAG->getTargetFrameIndex(FI, TLI->getPointerTy());
+ Base = CurDAG->getTargetFrameIndex(
+ FI, TLI->getPointerTy(CurDAG->getDataLayout()));
}
OffImm = CurDAG->getTargetConstant(RHSC, SDLoc(N), MVT::i64);
return true;
@@ -1494,7 +1495,7 @@ static bool isSeveralBitsExtractOpFromShr(SDNode *N, unsigned &Opc,
}
static bool isBitfieldExtractOpFromShr(SDNode *N, unsigned &Opc, SDValue &Opd0,
- unsigned &LSB, unsigned &MSB,
+ unsigned &Immr, unsigned &Imms,
bool BiggerPattern) {
assert((N->getOpcode() == ISD::SRA || N->getOpcode() == ISD::SRL) &&
"N must be a SHR/SRA operation to call this function");
@@ -1508,7 +1509,7 @@ static bool isBitfieldExtractOpFromShr(SDNode *N, unsigned &Opc, SDValue &Opd0,
"Type checking must have been done before calling this function");
// Check for AND + SRL doing several bits extract.
- if (isSeveralBitsExtractOpFromShr(N, Opc, Opd0, LSB, MSB))
+ if (isSeveralBitsExtractOpFromShr(N, Opc, Opd0, Immr, Imms))
return true;
// we're looking for a shift of a shift
@@ -1548,13 +1549,9 @@ static bool isBitfieldExtractOpFromShr(SDNode *N, unsigned &Opc, SDValue &Opd0,
assert(Srl_imm > 0 && Srl_imm < VT.getSizeInBits() &&
"bad amount in shift node!");
- // Note: The width operand is encoded as width-1.
- unsigned Width = VT.getSizeInBits() - Trunc_bits - Srl_imm - 1;
- int sLSB = Srl_imm - Shl_imm;
- if (sLSB < 0)
- return false;
- LSB = sLSB;
- MSB = LSB + Width;
+ int immr = Srl_imm - Shl_imm;
+ Immr = immr < 0 ? immr + VT.getSizeInBits() : immr;
+ Imms = VT.getSizeInBits() - Shl_imm - Trunc_bits - 1;
// SRA requires a signed extraction
if (VT == MVT::i32)
Opc = N->getOpcode() == ISD::SRA ? AArch64::SBFMWri : AArch64::UBFMWri;
@@ -1564,7 +1561,7 @@ static bool isBitfieldExtractOpFromShr(SDNode *N, unsigned &Opc, SDValue &Opd0,
}
static bool isBitfieldExtractOp(SelectionDAG *CurDAG, SDNode *N, unsigned &Opc,
- SDValue &Opd0, unsigned &LSB, unsigned &MSB,
+ SDValue &Opd0, unsigned &Immr, unsigned &Imms,
unsigned NumberOfIgnoredLowBits = 0,
bool BiggerPattern = false) {
if (N->getValueType(0) != MVT::i32 && N->getValueType(0) != MVT::i64)
@@ -1576,11 +1573,11 @@ static bool isBitfieldExtractOp(SelectionDAG *CurDAG, SDNode *N, unsigned &Opc,
return false;
break;
case ISD::AND:
- return isBitfieldExtractOpFromAnd(CurDAG, N, Opc, Opd0, LSB, MSB,
+ return isBitfieldExtractOpFromAnd(CurDAG, N, Opc, Opd0, Immr, Imms,
NumberOfIgnoredLowBits, BiggerPattern);
case ISD::SRL:
case ISD::SRA:
- return isBitfieldExtractOpFromShr(N, Opc, Opd0, LSB, MSB, BiggerPattern);
+ return isBitfieldExtractOpFromShr(N, Opc, Opd0, Immr, Imms, BiggerPattern);
}
unsigned NOpc = N->getMachineOpcode();
@@ -1593,8 +1590,8 @@ static bool isBitfieldExtractOp(SelectionDAG *CurDAG, SDNode *N, unsigned &Opc,
case AArch64::UBFMXri:
Opc = NOpc;
Opd0 = N->getOperand(0);
- LSB = cast<ConstantSDNode>(N->getOperand(1).getNode())->getZExtValue();
- MSB = cast<ConstantSDNode>(N->getOperand(2).getNode())->getZExtValue();
+ Immr = cast<ConstantSDNode>(N->getOperand(1).getNode())->getZExtValue();
+ Imms = cast<ConstantSDNode>(N->getOperand(2).getNode())->getZExtValue();
return true;
}
// Unreachable
@@ -1602,9 +1599,9 @@ static bool isBitfieldExtractOp(SelectionDAG *CurDAG, SDNode *N, unsigned &Opc,
}
SDNode *AArch64DAGToDAGISel::SelectBitfieldExtractOp(SDNode *N) {
- unsigned Opc, LSB, MSB;
+ unsigned Opc, Immr, Imms;
SDValue Opd0;
- if (!isBitfieldExtractOp(CurDAG, N, Opc, Opd0, LSB, MSB))
+ if (!isBitfieldExtractOp(CurDAG, N, Opc, Opd0, Immr, Imms))
return nullptr;
EVT VT = N->getValueType(0);
@@ -1613,8 +1610,8 @@ SDNode *AArch64DAGToDAGISel::SelectBitfieldExtractOp(SDNode *N) {
// If the bit extract operation is 64bit but the original type is 32bit, we
// need to add one EXTRACT_SUBREG.
if ((Opc == AArch64::SBFMXri || Opc == AArch64::UBFMXri) && VT == MVT::i32) {
- SDValue Ops64[] = {Opd0, CurDAG->getTargetConstant(LSB, dl, MVT::i64),
- CurDAG->getTargetConstant(MSB, dl, MVT::i64)};
+ SDValue Ops64[] = {Opd0, CurDAG->getTargetConstant(Immr, dl, MVT::i64),
+ CurDAG->getTargetConstant(Imms, dl, MVT::i64)};
SDNode *BFM = CurDAG->getMachineNode(Opc, dl, MVT::i64, Ops64);
SDValue SubReg = CurDAG->getTargetConstant(AArch64::sub_32, dl, MVT::i32);
@@ -1624,8 +1621,8 @@ SDNode *AArch64DAGToDAGISel::SelectBitfieldExtractOp(SDNode *N) {
return Node;
}
- SDValue Ops[] = {Opd0, CurDAG->getTargetConstant(LSB, dl, VT),
- CurDAG->getTargetConstant(MSB, dl, VT)};
+ SDValue Ops[] = {Opd0, CurDAG->getTargetConstant(Immr, dl, VT),
+ CurDAG->getTargetConstant(Imms, dl, VT)};
return CurDAG->SelectNodeTo(N, Opc, VT, Ops);
}
@@ -2351,7 +2348,8 @@ SDNode *AArch64DAGToDAGISel::Select(SDNode *Node) {
int FI = cast<FrameIndexSDNode>(Node)->getIndex();
unsigned Shifter = AArch64_AM::getShifterImm(AArch64_AM::LSL, 0);
const TargetLowering *TLI = getTargetLowering();
- SDValue TFI = CurDAG->getTargetFrameIndex(FI, TLI->getPointerTy());
+ SDValue TFI = CurDAG->getTargetFrameIndex(
+ FI, TLI->getPointerTy(CurDAG->getDataLayout()));
SDLoc DL(Node);
SDValue Ops[] = { TFI, CurDAG->getTargetConstant(0, DL, MVT::i32),
CurDAG->getTargetConstant(Shifter, DL, MVT::i32) };
diff --git a/lib/Target/AArch64/AArch64ISelLowering.cpp b/lib/Target/AArch64/AArch64ISelLowering.cpp
index f3242cdd971d..3e8f46cf1ecd 100644
--- a/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -705,7 +705,8 @@ void AArch64TargetLowering::addQRTypeForNEON(MVT VT) {
addTypeForNEON(VT, MVT::v4i32);
}
-EVT AArch64TargetLowering::getSetCCResultType(LLVMContext &, EVT VT) const {
+EVT AArch64TargetLowering::getSetCCResultType(const DataLayout &, LLVMContext &,
+ EVT VT) const {
if (!VT.isVector())
return MVT::i32;
return VT.changeVectorElementTypeToInteger();
@@ -774,7 +775,8 @@ void AArch64TargetLowering::computeKnownBitsForTargetNode(
}
}
-MVT AArch64TargetLowering::getScalarShiftAmountTy(EVT LHSTy) const {
+MVT AArch64TargetLowering::getScalarShiftAmountTy(const DataLayout &DL,
+ EVT) const {
return MVT::i64;
}
@@ -1710,7 +1712,8 @@ SDValue AArch64TargetLowering::LowerFSINCOS(SDValue Op,
const char *LibcallName =
(ArgVT == MVT::f64) ? "__sincos_stret" : "__sincosf_stret";
- SDValue Callee = DAG.getExternalSymbol(LibcallName, getPointerTy());
+ SDValue Callee =
+ DAG.getExternalSymbol(LibcallName, getPointerTy(DAG.getDataLayout()));
StructType *RetTy = StructType::get(ArgTy, ArgTy, nullptr);
TargetLowering::CallLoweringInfo CLI(DAG);
@@ -2089,7 +2092,8 @@ SDValue AArch64TargetLowering::LowerFormalArguments(
CurArgIdx = Ins[i].getOrigArgIndex();
// Get type of the original argument.
- EVT ActualVT = getValueType(CurOrigArg->getType(), /*AllowUnknown*/ true);
+ EVT ActualVT = getValueType(DAG.getDataLayout(), CurOrigArg->getType(),
+ /*AllowUnknown*/ true);
MVT ActualMVT = ActualVT.isSimple() ? ActualVT.getSimpleVT() : MVT::Other;
// If ActualMVT is i1/i8/i16, we should set LocVT to i8/i8/i16.
if (ActualMVT == MVT::i1 || ActualMVT == MVT::i8)
@@ -2111,7 +2115,7 @@ SDValue AArch64TargetLowering::LowerFormalArguments(
if (Ins[i].Flags.isByVal()) {
// Byval is used for HFAs in the PCS, but the system should work in a
// non-compliant manner for larger structs.
- EVT PtrTy = getPointerTy();
+ EVT PtrVT = getPointerTy(DAG.getDataLayout());
int Size = Ins[i].Flags.getByValSize();
unsigned NumRegs = (Size + 7) / 8;
@@ -2119,7 +2123,7 @@ SDValue AArch64TargetLowering::LowerFormalArguments(
// case. It should also work for fundamental types too.
unsigned FrameIdx =
MFI->CreateFixedObject(8 * NumRegs, VA.getLocMemOffset(), false);
- SDValue FrameIdxN = DAG.getFrameIndex(FrameIdx, PtrTy);
+ SDValue FrameIdxN = DAG.getFrameIndex(FrameIdx, PtrVT);
InVals.push_back(FrameIdxN);
continue;
@@ -2186,7 +2190,7 @@ SDValue AArch64TargetLowering::LowerFormalArguments(
int FI = MFI->CreateFixedObject(ArgSize, ArgOffset + BEAlign, true);
// Create load nodes to retrieve arguments from the stack.
- SDValue FIN = DAG.getFrameIndex(FI, getPointerTy());
+ SDValue FIN = DAG.getFrameIndex(FI, getPointerTy(DAG.getDataLayout()));
SDValue ArgValue;
// For NON_EXTLOAD, generic code in getLoad assert(ValVT == MemVT)
@@ -2265,6 +2269,7 @@ void AArch64TargetLowering::saveVarArgRegisters(CCState &CCInfo,
MachineFunction &MF = DAG.getMachineFunction();
MachineFrameInfo *MFI = MF.getFrameInfo();
AArch64FunctionInfo *FuncInfo = MF.getInfo<AArch64FunctionInfo>();
+ auto PtrVT = getPointerTy(DAG.getDataLayout());
SmallVector<SDValue, 8> MemOps;
@@ -2279,7 +2284,7 @@ void AArch64TargetLowering::saveVarArgRegisters(CCState &CCInfo,
if (GPRSaveSize != 0) {
GPRIdx = MFI->CreateStackObject(GPRSaveSize, 8, false);
- SDValue FIN = DAG.getFrameIndex(GPRIdx, getPointerTy());
+ SDValue FIN = DAG.getFrameIndex(GPRIdx, PtrVT);
for (unsigned i = FirstVariadicGPR; i < NumGPRArgRegs; ++i) {
unsigned VReg = MF.addLiveIn(GPRArgRegs[i], &AArch64::GPR64RegClass);
@@ -2288,8 +2293,8 @@ void AArch64TargetLowering::saveVarArgRegisters(CCState &CCInfo,
DAG.getStore(Val.getValue(1), DL, Val, FIN,
MachinePointerInfo::getStack(i * 8), false, false, 0);
MemOps.push_back(Store);
- FIN = DAG.getNode(ISD::ADD, DL, getPointerTy(), FIN,
- DAG.getConstant(8, DL, getPointerTy()));
+ FIN =
+ DAG.getNode(ISD::ADD, DL, PtrVT, FIN, DAG.getConstant(8, DL, PtrVT));
}
}
FuncInfo->setVarArgsGPRIndex(GPRIdx);
@@ -2307,7 +2312,7 @@ void AArch64TargetLowering::saveVarArgRegisters(CCState &CCInfo,
if (FPRSaveSize != 0) {
FPRIdx = MFI->CreateStackObject(FPRSaveSize, 16, false);
- SDValue FIN = DAG.getFrameIndex(FPRIdx, getPointerTy());
+ SDValue FIN = DAG.getFrameIndex(FPRIdx, PtrVT);
for (unsigned i = FirstVariadicFPR; i < NumFPRArgRegs; ++i) {
unsigned VReg = MF.addLiveIn(FPRArgRegs[i], &AArch64::FPR128RegClass);
@@ -2317,8 +2322,8 @@ void AArch64TargetLowering::saveVarArgRegisters(CCState &CCInfo,
DAG.getStore(Val.getValue(1), DL, Val, FIN,
MachinePointerInfo::getStack(i * 16), false, false, 0);
MemOps.push_back(Store);
- FIN = DAG.getNode(ISD::ADD, DL, getPointerTy(), FIN,
- DAG.getConstant(16, DL, getPointerTy()));
+ FIN = DAG.getNode(ISD::ADD, DL, PtrVT, FIN,
+ DAG.getConstant(16, DL, PtrVT));
}
}
FuncInfo->setVarArgsFPRIndex(FPRIdx);
@@ -2614,7 +2619,8 @@ AArch64TargetLowering::LowerCall(CallLoweringInfo &CLI,
for (unsigned i = 0; i != NumArgs; ++i) {
MVT ValVT = Outs[i].VT;
// Get type of the original argument.
- EVT ActualVT = getValueType(CLI.getArgs()[Outs[i].OrigArgIndex].Ty,
+ EVT ActualVT = getValueType(DAG.getDataLayout(),
+ CLI.getArgs()[Outs[i].OrigArgIndex].Ty,
/*AllowUnknown*/ true);
MVT ActualMVT = ActualVT.isSimple() ? ActualVT.getSimpleVT() : ValVT;
ISD::ArgFlagsTy ArgFlags = Outs[i].Flags;
@@ -2674,10 +2680,12 @@ AArch64TargetLowering::LowerCall(CallLoweringInfo &CLI,
true),
DL);
- SDValue StackPtr = DAG.getCopyFromReg(Chain, DL, AArch64::SP, getPointerTy());
+ SDValue StackPtr = DAG.getCopyFromReg(Chain, DL, AArch64::SP,
+ getPointerTy(DAG.getDataLayout()));
SmallVector<std::pair<unsigned, SDValue>, 8> RegsToPass;
SmallVector<SDValue, 8> MemOpChains;
+ auto PtrVT = getPointerTy(DAG.getDataLayout());
// Walk the register/memloc assignments, inserting copies/loads.
for (unsigned i = 0, realArgIdx = 0, e = ArgLocs.size(); i != e;
@@ -2743,13 +2751,13 @@ AArch64TargetLowering::LowerCall(CallLoweringInfo &CLI,
unsigned LocMemOffset = VA.getLocMemOffset();
int32_t Offset = LocMemOffset + BEAlign;
SDValue PtrOff = DAG.getIntPtrConstant(Offset, DL);
- PtrOff = DAG.getNode(ISD::ADD, DL, getPointerTy(), StackPtr, PtrOff);
+ PtrOff = DAG.getNode(ISD::ADD, DL, PtrVT, StackPtr, PtrOff);
if (IsTailCall) {
Offset = Offset + FPDiff;
int FI = MF.getFrameInfo()->CreateFixedObject(OpSize, Offset, true);
- DstAddr = DAG.getFrameIndex(FI, getPointerTy());
+ DstAddr = DAG.getFrameIndex(FI, PtrVT);
DstInfo = MachinePointerInfo::getFixedStack(FI);
// Make sure any stack arguments overlapping with where we're storing
@@ -2759,7 +2767,7 @@ AArch64TargetLowering::LowerCall(CallLoweringInfo &CLI,
} else {
SDValue PtrOff = DAG.getIntPtrConstant(Offset, DL);
- DstAddr = DAG.getNode(ISD::ADD, DL, getPointerTy(), StackPtr, PtrOff);
+ DstAddr = DAG.getNode(ISD::ADD, DL, PtrVT, StackPtr, PtrOff);
DstInfo = MachinePointerInfo::getStack(LocMemOffset);
}
@@ -2809,25 +2817,24 @@ AArch64TargetLowering::LowerCall(CallLoweringInfo &CLI,
const GlobalValue *GV = G->getGlobal();
bool InternalLinkage = GV->hasInternalLinkage();
if (InternalLinkage)
- Callee = DAG.getTargetGlobalAddress(GV, DL, getPointerTy(), 0, 0);
+ Callee = DAG.getTargetGlobalAddress(GV, DL, PtrVT, 0, 0);
else {
- Callee = DAG.getTargetGlobalAddress(GV, DL, getPointerTy(), 0,
- AArch64II::MO_GOT);
- Callee = DAG.getNode(AArch64ISD::LOADgot, DL, getPointerTy(), Callee);
+ Callee =
+ DAG.getTargetGlobalAddress(GV, DL, PtrVT, 0, AArch64II::MO_GOT);
+ Callee = DAG.getNode(AArch64ISD::LOADgot, DL, PtrVT, Callee);
}
} else if (ExternalSymbolSDNode *S =
dyn_cast<ExternalSymbolSDNode>(Callee)) {
const char *Sym = S->getSymbol();
- Callee =
- DAG.getTargetExternalSymbol(Sym, getPointerTy(), AArch64II::MO_GOT);
- Callee = DAG.getNode(AArch64ISD::LOADgot, DL, getPointerTy(), Callee);
+ Callee = DAG.getTargetExternalSymbol(Sym, PtrVT, AArch64II::MO_GOT);
+ Callee = DAG.getNode(AArch64ISD::LOADgot, DL, PtrVT, Callee);
}
} else if (GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee)) {
const GlobalValue *GV = G->getGlobal();
- Callee = DAG.getTargetGlobalAddress(GV, DL, getPointerTy(), 0, 0);
+ Callee = DAG.getTargetGlobalAddress(GV, DL, PtrVT, 0, 0);
} else if (ExternalSymbolSDNode *S = dyn_cast<ExternalSymbolSDNode>(Callee)) {
const char *Sym = S->getSymbol();
- Callee = DAG.getTargetExternalSymbol(Sym, getPointerTy(), 0);
+ Callee = DAG.getTargetExternalSymbol(Sym, PtrVT, 0);
}
// We don't usually want to end the call-sequence here because we would tidy
@@ -2977,7 +2984,7 @@ AArch64TargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv,
SDValue AArch64TargetLowering::LowerGlobalAddress(SDValue Op,
SelectionDAG &DAG) const {
- EVT PtrVT = getPointerTy();
+ EVT PtrVT = getPointerTy(DAG.getDataLayout());
SDLoc DL(Op);
const GlobalAddressSDNode *GN = cast<GlobalAddressSDNode>(Op);
const GlobalValue *GV = GN->getGlobal();
@@ -3069,7 +3076,7 @@ AArch64TargetLowering::LowerDarwinGlobalTLSAddress(SDValue Op,
assert(Subtarget->isTargetDarwin() && "TLS only supported on Darwin");
SDLoc DL(Op);
- MVT PtrVT = getPointerTy();
+ MVT PtrVT = getPointerTy(DAG.getDataLayout());
const GlobalValue *GV = cast<GlobalAddressSDNode>(Op)->getGlobal();
SDValue TLVPAddr =
@@ -3124,7 +3131,7 @@ AArch64TargetLowering::LowerDarwinGlobalTLSAddress(SDValue Op,
/// the sequence is produced as per above.
SDValue AArch64TargetLowering::LowerELFTLSDescCallSeq(SDValue SymAddr, SDLoc DL,
SelectionDAG &DAG) const {
- EVT PtrVT = getPointerTy();
+ EVT PtrVT = getPointerTy(DAG.getDataLayout());
SDValue Chain = DAG.getEntryNode();
SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);
@@ -3159,7 +3166,7 @@ AArch64TargetLowering::LowerELFGlobalTLSAddress(SDValue Op,
}
SDValue TPOff;
- EVT PtrVT = getPointerTy();
+ EVT PtrVT = getPointerTy(DAG.getDataLayout());
SDLoc DL(Op);
const GlobalValue *GV = GA->getGlobal();
@@ -3786,7 +3793,7 @@ SDValue AArch64TargetLowering::LowerJumpTable(SDValue Op,
// Jump table entries as PC relative offsets. No additional tweaking
// is necessary here. Just get the address of the jump table.
JumpTableSDNode *JT = cast<JumpTableSDNode>(Op);
- EVT PtrVT = getPointerTy();
+ EVT PtrVT = getPointerTy(DAG.getDataLayout());
SDLoc DL(Op);
if (getTargetMachine().getCodeModel() == CodeModel::Large &&
@@ -3812,7 +3819,7 @@ SDValue AArch64TargetLowering::LowerJumpTable(SDValue Op,
SDValue AArch64TargetLowering::LowerConstantPool(SDValue Op,
SelectionDAG &DAG) const {
ConstantPoolSDNode *CP = cast<ConstantPoolSDNode>(Op);
- EVT PtrVT = getPointerTy();
+ EVT PtrVT = getPointerTy(DAG.getDataLayout());
SDLoc DL(Op);
if (getTargetMachine().getCodeModel() == CodeModel::Large) {
@@ -3853,7 +3860,7 @@ SDValue AArch64TargetLowering::LowerConstantPool(SDValue Op,
SDValue AArch64TargetLowering::LowerBlockAddress(SDValue Op,
SelectionDAG &DAG) const {
const BlockAddress *BA = cast<BlockAddressSDNode>(Op)->getBlockAddress();
- EVT PtrVT = getPointerTy();
+ EVT PtrVT = getPointerTy(DAG.getDataLayout());
SDLoc DL(Op);
if (getTargetMachine().getCodeModel() == CodeModel::Large &&
!Subtarget->isTargetMachO()) {
@@ -3879,8 +3886,8 @@ SDValue AArch64TargetLowering::LowerDarwin_VASTART(SDValue Op,
DAG.getMachineFunction().getInfo<AArch64FunctionInfo>();
SDLoc DL(Op);
- SDValue FR =
- DAG.getFrameIndex(FuncInfo->getVarArgsStackIndex(), getPointerTy());
+ SDValue FR = DAG.getFrameIndex(FuncInfo->getVarArgsStackIndex(),
+ getPointerTy(DAG.getDataLayout()));
const Value *SV = cast<SrcValueSDNode>(Op.getOperand(2))->getValue();
return DAG.getStore(Op.getOperand(0), DL, FR, Op.getOperand(1),
MachinePointerInfo(SV), false, false, 0);
@@ -3892,6 +3899,7 @@ SDValue AArch64TargetLowering::LowerAAPCS_VASTART(SDValue Op,
// Standard, section B.3.
MachineFunction &MF = DAG.getMachineFunction();
AArch64FunctionInfo *FuncInfo = MF.getInfo<AArch64FunctionInfo>();
+ auto PtrVT = getPointerTy(DAG.getDataLayout());
SDLoc DL(Op);
SDValue Chain = Op.getOperand(0);
@@ -3900,8 +3908,7 @@ SDValue AArch64TargetLowering::LowerAAPCS_VASTART(SDValue Op,
SmallVector<SDValue, 4> MemOps;
// void *__stack at offset 0
- SDValue Stack =
- DAG.getFrameIndex(FuncInfo->getVarArgsStackIndex(), getPointerTy());
+ SDValue Stack = DAG.getFrameIndex(FuncInfo->getVarArgsStackIndex(), PtrVT);
MemOps.push_back(DAG.getStore(Chain, DL, Stack, VAList,
MachinePointerInfo(SV), false, false, 8));
@@ -3910,12 +3917,12 @@ SDValue AArch64TargetLowering::LowerAAPCS_VASTART(SDValue Op,
if (GPRSize > 0) {
SDValue GRTop, GRTopAddr;
- GRTopAddr = DAG.getNode(ISD::ADD, DL, getPointerTy(), VAList,
- DAG.getConstant(8, DL, getPointerTy()));
+ GRTopAddr =
+ DAG.getNode(ISD::ADD, DL, PtrVT, VAList, DAG.getConstant(8, DL, PtrVT));
- GRTop = DAG.getFrameIndex(FuncInfo->getVarArgsGPRIndex(), getPointerTy());
- GRTop = DAG.getNode(ISD::ADD, DL, getPointerTy(), GRTop,
- DAG.getConstant(GPRSize, DL, getPointerTy()));
+ GRTop = DAG.getFrameIndex(FuncInfo->getVarArgsGPRIndex(), PtrVT);
+ GRTop = DAG.getNode(ISD::ADD, DL, PtrVT, GRTop,
+ DAG.getConstant(GPRSize, DL, PtrVT));
MemOps.push_back(DAG.getStore(Chain, DL, GRTop, GRTopAddr,
MachinePointerInfo(SV, 8), false, false, 8));
@@ -3925,28 +3932,28 @@ SDValue AArch64TargetLowering::LowerAAPCS_VASTART(SDValue Op,
int FPRSize = FuncInfo->getVarArgsFPRSize();
if (FPRSize > 0) {
SDValue VRTop, VRTopAddr;
- VRTopAddr = DAG.getNode(ISD::ADD, DL, getPointerTy(), VAList,
- DAG.getConstant(16, DL, getPointerTy()));
+ VRTopAddr = DAG.getNode(ISD::ADD, DL, PtrVT, VAList,
+ DAG.getConstant(16, DL, PtrVT));
- VRTop = DAG.getFrameIndex(FuncInfo->getVarArgsFPRIndex(), getPointerTy());
- VRTop = DAG.getNode(ISD::ADD, DL, getPointerTy(), VRTop,
- DAG.getConstant(FPRSize, DL, getPointerTy()));
+ VRTop = DAG.getFrameIndex(FuncInfo->getVarArgsFPRIndex(), PtrVT);
+ VRTop = DAG.getNode(ISD::ADD, DL, PtrVT, VRTop,
+ DAG.getConstant(FPRSize, DL, PtrVT));
MemOps.push_back(DAG.getStore(Chain, DL, VRTop, VRTopAddr,
MachinePointerInfo(SV, 16), false, false, 8));
}
// int __gr_offs at offset 24
- SDValue GROffsAddr = DAG.getNode(ISD::ADD, DL, getPointerTy(), VAList,
- DAG.getConstant(24, DL, getPointerTy()));
+ SDValue GROffsAddr =
+ DAG.getNode(ISD::ADD, DL, PtrVT, VAList, DAG.getConstant(24, DL, PtrVT));
MemOps.push_back(DAG.getStore(Chain, DL,
DAG.getConstant(-GPRSize, DL, MVT::i32),
GROffsAddr, MachinePointerInfo(SV, 24), false,
false, 4));
// int __vr_offs at offset 28
- SDValue VROffsAddr = DAG.getNode(ISD::ADD, DL, getPointerTy(), VAList,
- DAG.getConstant(28, DL, getPointerTy()));
+ SDValue VROffsAddr =
+ DAG.getNode(ISD::ADD, DL, PtrVT, VAList, DAG.getConstant(28, DL, PtrVT));
MemOps.push_back(DAG.getStore(Chain, DL,
DAG.getConstant(-FPRSize, DL, MVT::i32),
VROffsAddr, MachinePointerInfo(SV, 28), false,
@@ -3987,21 +3994,22 @@ SDValue AArch64TargetLowering::LowerVAARG(SDValue Op, SelectionDAG &DAG) const {
SDValue Chain = Op.getOperand(0);
SDValue Addr = Op.getOperand(1);
unsigned Align = Op.getConstantOperandVal(3);
+ auto PtrVT = getPointerTy(DAG.getDataLayout());
- SDValue VAList = DAG.getLoad(getPointerTy(), DL, Chain, Addr,
- MachinePointerInfo(V), false, false, false, 0);
+ SDValue VAList = DAG.getLoad(PtrVT, DL, Chain, Addr, MachinePointerInfo(V),
+ false, false, false, 0);
Chain = VAList.getValue(1);
if (Align > 8) {
assert(((Align & (Align - 1)) == 0) && "Expected Align to be a power of 2");
- VAList = DAG.getNode(ISD::ADD, DL, getPointerTy(), VAList,
- DAG.getConstant(Align - 1, DL, getPointerTy()));
- VAList = DAG.getNode(ISD::AND, DL, getPointerTy(), VAList,
- DAG.getConstant(-(int64_t)Align, DL, getPointerTy()));
+ VAList = DAG.getNode(ISD::ADD, DL, PtrVT, VAList,
+ DAG.getConstant(Align - 1, DL, PtrVT));
+ VAList = DAG.getNode(ISD::AND, DL, PtrVT, VAList,
+ DAG.getConstant(-(int64_t)Align, DL, PtrVT));
}
Type *ArgTy = VT.getTypeForEVT(*DAG.getContext());
- uint64_t ArgSize = getDataLayout()->getTypeAllocSize(ArgTy);
+ uint64_t ArgSize = DAG.getDataLayout().getTypeAllocSize(ArgTy);
// Scalar integer and FP values smaller than 64 bits are implicitly extended
// up to 64 bits. At the very least, we have to increase the striding of the
@@ -4016,8 +4024,8 @@ SDValue AArch64TargetLowering::LowerVAARG(SDValue Op, SelectionDAG &DAG) const {
}
// Increment the pointer, VAList, to the next vaarg
- SDValue VANext = DAG.getNode(ISD::ADD, DL, getPointerTy(), VAList,
- DAG.getConstant(ArgSize, DL, getPointerTy()));
+ SDValue VANext = DAG.getNode(ISD::ADD, DL, PtrVT, VAList,
+ DAG.getConstant(ArgSize, DL, PtrVT));
// Store the incremented VAList to the legalized pointer
SDValue APStore = DAG.getStore(Chain, DL, VANext, Addr, MachinePointerInfo(V),
false, false, 0);
@@ -4057,8 +4065,8 @@ SDValue AArch64TargetLowering::LowerFRAMEADDR(SDValue Op,
// FIXME? Maybe this could be a TableGen attribute on some registers and
// this table could be generated automatically from RegInfo.
-unsigned AArch64TargetLowering::getRegisterByName(const char* RegName,
- EVT VT) const {
+unsigned AArch64TargetLowering::getRegisterByName(const char* RegName, EVT VT,
+ SelectionDAG &DAG) const {
unsigned Reg = StringSwitch<unsigned>(RegName)
.Case("sp", AArch64::SP)
.Default(0);
@@ -4079,7 +4087,7 @@ SDValue AArch64TargetLowering::LowerRETURNADDR(SDValue Op,
unsigned Depth = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
if (Depth) {
SDValue FrameAddr = LowerFRAMEADDR(Op, DAG);
- SDValue Offset = DAG.getConstant(8, DL, getPointerTy());
+ SDValue Offset = DAG.getConstant(8, DL, getPointerTy(DAG.getDataLayout()));
return DAG.getLoad(VT, DL, DAG.getEntryNode(),
DAG.getNode(ISD::ADD, DL, VT, FrameAddr, Offset),
MachinePointerInfo(), false, false, false, 0);
@@ -4232,7 +4240,7 @@ bool AArch64TargetLowering::isFPImmLegal(const APFloat &Imm, EVT VT) const {
/// getConstraintType - Given a constraint letter, return the type of
/// constraint it is for this target.
AArch64TargetLowering::ConstraintType
-AArch64TargetLowering::getConstraintType(const std::string &Constraint) const {
+AArch64TargetLowering::getConstraintType(StringRef Constraint) const {
if (Constraint.size() == 1) {
switch (Constraint[0]) {
default:
@@ -4283,8 +4291,7 @@ AArch64TargetLowering::getSingleConstraintMatchWeight(
std::pair<unsigned, const TargetRegisterClass *>
AArch64TargetLowering::getRegForInlineAsmConstraint(
- const TargetRegisterInfo *TRI, const std::string &Constraint,
- MVT VT) const {
+ const TargetRegisterInfo *TRI, StringRef Constraint, MVT VT) const {
if (Constraint.size() == 1) {
switch (Constraint[0]) {
case 'r':
@@ -4320,10 +4327,9 @@ AArch64TargetLowering::getRegForInlineAsmConstraint(
unsigned Size = Constraint.size();
if ((Size == 4 || Size == 5) && Constraint[0] == '{' &&
tolower(Constraint[1]) == 'v' && Constraint[Size - 1] == '}') {
- const std::string Reg =
- std::string(&Constraint[2], &Constraint[Size - 1]);
- int RegNo = atoi(Reg.c_str());
- if (RegNo >= 0 && RegNo <= 31) {
+ int RegNo;
+ bool Failed = Constraint.slice(2, Size - 1).getAsInteger(10, RegNo);
+ if (!Failed && RegNo >= 0 && RegNo <= 31) {
// v0 - v31 are aliases of q0 - q31.
// By default we'll emit v0-v31 for this unless there's a modifier where
// we'll emit the correct register as well.
@@ -6429,6 +6435,7 @@ SDValue AArch64TargetLowering::LowerVSETCC(SDValue Op,
bool AArch64TargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info,
const CallInst &I,
unsigned Intrinsic) const {
+ auto &DL = I.getModule()->getDataLayout();
switch (Intrinsic) {
case Intrinsic::aarch64_neon_ld2:
case Intrinsic::aarch64_neon_ld3:
@@ -6444,7 +6451,7 @@ bool AArch64TargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info,
case Intrinsic::aarch64_neon_ld4r: {
Info.opc = ISD::INTRINSIC_W_CHAIN;
// Conservatively set memVT to the entire set of vectors loaded.
- uint64_t NumElts = getDataLayout()->getTypeAllocSize(I.getType()) / 8;
+ uint64_t NumElts = DL.getTypeAllocSize(I.getType()) / 8;
Info.memVT = EVT::getVectorVT(I.getType()->getContext(), MVT::i64, NumElts);
Info.ptrVal = I.getArgOperand(I.getNumArgOperands() - 1);
Info.offset = 0;
@@ -6470,7 +6477,7 @@ bool AArch64TargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info,
Type *ArgTy = I.getArgOperand(ArgI)->getType();
if (!ArgTy->isVectorTy())
break;
- NumElts += getDataLayout()->getTypeAllocSize(ArgTy) / 8;
+ NumElts += DL.getTypeAllocSize(ArgTy) / 8;
}
Info.memVT = EVT::getVectorVT(I.getType()->getContext(), MVT::i64, NumElts);
Info.ptrVal = I.getArgOperand(I.getNumArgOperands() - 1);
@@ -6488,7 +6495,7 @@ bool AArch64TargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info,
Info.memVT = MVT::getVT(PtrTy->getElementType());
Info.ptrVal = I.getArgOperand(0);
Info.offset = 0;
- Info.align = getDataLayout()->getABITypeAlignment(PtrTy->getElementType());
+ Info.align = DL.getABITypeAlignment(PtrTy->getElementType());
Info.vol = true;
Info.readMem = true;
Info.writeMem = false;
@@ -6501,7 +6508,7 @@ bool AArch64TargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info,
Info.memVT = MVT::getVT(PtrTy->getElementType());
Info.ptrVal = I.getArgOperand(1);
Info.offset = 0;
- Info.align = getDataLayout()->getABITypeAlignment(PtrTy->getElementType());
+ Info.align = DL.getABITypeAlignment(PtrTy->getElementType());
Info.vol = true;
Info.readMem = false;
Info.writeMem = true;
@@ -6572,7 +6579,8 @@ bool AArch64TargetLowering::isProfitableToHoist(Instruction *I) const {
return true;
const TargetOptions &Options = getTargetMachine().Options;
- EVT VT = getValueType(User->getOperand(0)->getType());
+ const DataLayout &DL = I->getModule()->getDataLayout();
+ EVT VT = getValueType(DL, User->getOperand(0)->getType());
if (isFMAFasterThanFMulAndFAdd(VT) &&
isOperationLegalOrCustom(ISD::FMA, VT) &&
@@ -6637,6 +6645,7 @@ bool AArch64TargetLowering::isExtFreeImpl(const Instruction *Ext) const {
break;
case Instruction::GetElementPtr: {
gep_type_iterator GTI = gep_type_begin(Instr);
+ auto &DL = Ext->getModule()->getDataLayout();
std::advance(GTI, U.getOperandNo());
Type *IdxTy = *GTI;
// This extension will end up with a shift because of the scaling factor.
@@ -6644,7 +6653,7 @@ bool AArch64TargetLowering::isExtFreeImpl(const Instruction *Ext) const {
// Get the shift amount based on the scaling factor:
// log2(sizeof(IdxTy)) - log2(8).
uint64_t ShiftAmt =
- countTrailingZeros(getDataLayout()->getTypeStoreSizeInBits(IdxTy)) - 3;
+ countTrailingZeros(DL.getTypeStoreSizeInBits(IdxTy)) - 3;
// Is the constant foldable in the shift of the addressing mode?
// I.e., shift amount is between 1 and 4 inclusive.
if (ShiftAmt == 0 || ShiftAmt > 4)
@@ -6708,10 +6717,10 @@ bool AArch64TargetLowering::lowerInterleavedLoad(
assert(Shuffles.size() == Indices.size() &&
"Unmatched number of shufflevectors and indices");
- const DataLayout *DL = getDataLayout();
+ const DataLayout &DL = LI->getModule()->getDataLayout();
VectorType *VecTy = Shuffles[0]->getType();
- unsigned VecSize = DL->getTypeAllocSizeInBits(VecTy);
+ unsigned VecSize = DL.getTypeAllocSizeInBits(VecTy);
// Skip illegal vector types.
if (VecSize != 64 && VecSize != 128)
@@ -6721,8 +6730,8 @@ bool AArch64TargetLowering::lowerInterleavedLoad(
// load integer vectors first and then convert to pointer vectors.
Type *EltTy = VecTy->getVectorElementType();
if (EltTy->isPointerTy())
- VecTy = VectorType::get(DL->getIntPtrType(EltTy),
- VecTy->getVectorNumElements());
+ VecTy =
+ VectorType::get(DL.getIntPtrType(EltTy), VecTy->getVectorNumElements());
Type *PtrTy = VecTy->getPointerTo(LI->getPointerAddressSpace());
Type *Tys[2] = {VecTy, PtrTy};
@@ -6796,8 +6805,8 @@ bool AArch64TargetLowering::lowerInterleavedStore(StoreInst *SI,
Type *EltTy = VecTy->getVectorElementType();
VectorType *SubVecTy = VectorType::get(EltTy, NumSubElts);
- const DataLayout *DL = getDataLayout();
- unsigned SubVecSize = DL->getTypeAllocSizeInBits(SubVecTy);
+ const DataLayout &DL = SI->getModule()->getDataLayout();
+ unsigned SubVecSize = DL.getTypeAllocSizeInBits(SubVecTy);
// Skip illegal vector types.
if (SubVecSize != 64 && SubVecSize != 128)
@@ -6810,7 +6819,7 @@ bool AArch64TargetLowering::lowerInterleavedStore(StoreInst *SI,
// StN intrinsics don't support pointer vectors as arguments. Convert pointer
// vectors to integer vectors.
if (EltTy->isPointerTy()) {
- Type *IntTy = DL->getIntPtrType(EltTy);
+ Type *IntTy = DL.getIntPtrType(EltTy);
unsigned NumOpElts =
dyn_cast<VectorType>(Op0->getType())->getVectorNumElements();
@@ -6894,8 +6903,8 @@ bool AArch64TargetLowering::isLegalICmpImmediate(int64_t Immed) const {
/// isLegalAddressingMode - Return true if the addressing mode represented
/// by AM is legal for this target, for a load/store of the specified type.
-bool AArch64TargetLowering::isLegalAddressingMode(const AddrMode &AM,
- Type *Ty,
+bool AArch64TargetLowering::isLegalAddressingMode(const DataLayout &DL,
+ const AddrMode &AM, Type *Ty,
unsigned AS) const {
// AArch64 has five basic addressing modes:
// reg
@@ -6916,7 +6925,7 @@ bool AArch64TargetLowering::isLegalAddressingMode(const AddrMode &AM,
// i.e., reg + 0, reg + imm9, reg + SIZE_IN_BYTES * uimm12
uint64_t NumBytes = 0;
if (Ty->isSized()) {
- uint64_t NumBits = getDataLayout()->getTypeSizeInBits(Ty);
+ uint64_t NumBits = DL.getTypeSizeInBits(Ty);
NumBytes = NumBits / 8;
if (!isPowerOf2_64(NumBits))
NumBytes = 0;
@@ -6946,8 +6955,8 @@ bool AArch64TargetLowering::isLegalAddressingMode(const AddrMode &AM,
return false;
}
-int AArch64TargetLowering::getScalingFactorCost(const AddrMode &AM,
- Type *Ty,
+int AArch64TargetLowering::getScalingFactorCost(const DataLayout &DL,
+ const AddrMode &AM, Type *Ty,
unsigned AS) const {
// Scaling factors are not free at all.
// Operands | Rt Latency
@@ -6956,7 +6965,7 @@ int AArch64TargetLowering::getScalingFactorCost(const AddrMode &AM,
// -------------------------------------------
// Rt, [Xn, Xm, lsl #imm] | Rn: 4 Rm: 5
// Rt, [Xn, Wm, <extend> #imm] |
- if (isLegalAddressingMode(AM, Ty, AS))
+ if (isLegalAddressingMode(DL, AM, Ty, AS))
// Scale represents reg2 * scale, thus account for 1 if
// it is not equal to 0 or 1.
return AM.Scale != 0 && AM.Scale != 1;
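The recurring edit in this file is mechanical: TargetLowering queries such as getPointerTy(), getValueType() and getDataLayout() now take the DataLayout explicitly, fetched from the SelectionDAG (or from the instruction's Module in the IR-level hooks) instead of from a pointer cached in the target. A minimal sketch of the pattern, using a hypothetical helper rather than code from the patch:

    // Hypothetical helper illustrating the new calling convention only.
    #include "llvm/CodeGen/SelectionDAG.h"
    #include "llvm/Target/TargetLowering.h"
    using namespace llvm;

    static EVT pointerVTFor(const TargetLowering &TLI, SelectionDAG &DAG) {
      const DataLayout &DL = DAG.getDataLayout(); // was: *TLI.getDataLayout()
      return TLI.getPointerTy(DL);                // was: TLI.getPointerTy()
    }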
diff --git a/lib/Target/AArch64/AArch64ISelLowering.h b/lib/Target/AArch64/AArch64ISelLowering.h
index 46298c0e7de1..c73ce1e54b3e 100644
--- a/lib/Target/AArch64/AArch64ISelLowering.h
+++ b/lib/Target/AArch64/AArch64ISelLowering.h
@@ -233,7 +233,7 @@ public:
APInt &KnownOne, const SelectionDAG &DAG,
unsigned Depth = 0) const override;
- MVT getScalarShiftAmountTy(EVT LHSTy) const override;
+ MVT getScalarShiftAmountTy(const DataLayout &DL, EVT) const override;
/// allowsMisalignedMemoryAccesses - Returns true if the target allows
/// unaligned memory accesses of the specified type.
@@ -278,7 +278,8 @@ public:
bool isShuffleMaskLegal(const SmallVectorImpl<int> &M, EVT VT) const override;
/// getSetCCResultType - Return the ISD::SETCC ValueType
- EVT getSetCCResultType(LLVMContext &Context, EVT VT) const override;
+ EVT getSetCCResultType(const DataLayout &DL, LLVMContext &Context,
+ EVT VT) const override;
SDValue ReconstructShuffle(SDValue Op, SelectionDAG &DAG) const;
@@ -323,7 +324,7 @@ public:
/// isLegalAddressingMode - Return true if the addressing mode represented
/// by AM is legal for this target, for a load/store of the specified type.
- bool isLegalAddressingMode(const AddrMode &AM, Type *Ty,
+ bool isLegalAddressingMode(const DataLayout &DL, const AddrMode &AM, Type *Ty,
unsigned AS) const override;
/// \brief Return the cost of the scaling factor used in the addressing
@@ -331,7 +332,7 @@ public:
/// of the specified type.
/// If the AM is supported, the return value must be >= 0.
/// If the AM is not supported, it returns a negative value.
- int getScalingFactorCost(const AddrMode &AM, Type *Ty,
+ int getScalingFactorCost(const DataLayout &DL, const AddrMode &AM, Type *Ty,
unsigned AS) const override;
/// isFMAFasterThanFMulAndFAdd - Return true if an FMA operation is faster
@@ -471,9 +472,9 @@ private:
std::vector<SDNode *> *Created) const override;
bool combineRepeatedFPDivisors(unsigned NumUsers) const override;
- ConstraintType
- getConstraintType(const std::string &Constraint) const override;
- unsigned getRegisterByName(const char* RegName, EVT VT) const override;
+ ConstraintType getConstraintType(StringRef Constraint) const override;
+ unsigned getRegisterByName(const char* RegName, EVT VT,
+ SelectionDAG &DAG) const override;
/// Examine constraint string and operand type and determine a weight value.
/// The operand object must already have been set up with the operand type.
@@ -483,14 +484,12 @@ private:
std::pair<unsigned, const TargetRegisterClass *>
getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI,
- const std::string &Constraint,
- MVT VT) const override;
+ StringRef Constraint, MVT VT) const override;
void LowerAsmOperandForConstraint(SDValue Op, std::string &Constraint,
std::vector<SDValue> &Ops,
SelectionDAG &DAG) const override;
- unsigned getInlineAsmMemConstraint(
- const std::string &ConstraintCode) const override {
+ unsigned getInlineAsmMemConstraint(StringRef ConstraintCode) const override {
if (ConstraintCode == "Q")
return InlineAsm::Constraint_Q;
// FIXME: clang has code for 'Ump', 'Utf', 'Usa', and 'Ush' but these are
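The constraint-handling overrides above also move from const std::string& to StringRef, which is what lets the register-number parsing earlier in the patch drop the temporary std::string and atoi in favor of getAsInteger. A small standalone sketch of that parsing (function name and the size guard are illustrative, not from the patch):

    #include "llvm/ADT/StringRef.h"

    // Parse an inline-asm constraint of the form "{vN}"; return -1 on failure.
    static int parseVRegConstraint(llvm::StringRef Constraint) {
      int RegNo;
      // getAsInteger returns true on error.
      if (Constraint.size() < 4 ||
          Constraint.slice(2, Constraint.size() - 1).getAsInteger(10, RegNo))
        return -1;
      return RegNo; // e.g. parseVRegConstraint("{v31}") == 31
    }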
diff --git a/lib/Target/AArch64/AArch64InstrInfo.td b/lib/Target/AArch64/AArch64InstrInfo.td
index b73e0958df90..fa1a46acba84 100644
--- a/lib/Target/AArch64/AArch64InstrInfo.td
+++ b/lib/Target/AArch64/AArch64InstrInfo.td
@@ -894,6 +894,8 @@ def REVXr : OneXRegData<0b011, "rev", bswap>;
def REV32Xr : OneXRegData<0b010, "rev32",
UnOpFrag<(rotr (bswap node:$LHS), (i64 32))>>;
+def : InstAlias<"rev64 $Rd, $Rn", (REVXr GPR64:$Rd, GPR64:$Rn), 0>;
+
// The bswap commutes with the rotr so we want a pattern for both possible
// orders.
def : Pat<(bswap (rotr GPR32:$Rn, (i64 16))), (REV16Wr GPR32:$Rn)>;
@@ -5283,18 +5285,23 @@ def : Pat<(v8f16 (AArch64NvCast (v4i32 FPR128:$src))), (v8f16 FPR128:$src)>;
def : Pat<(v4i32 (AArch64NvCast (v4i32 FPR128:$src))), (v4i32 FPR128:$src)>;
def : Pat<(v4f32 (AArch64NvCast (v4i32 FPR128:$src))), (v4f32 FPR128:$src)>;
def : Pat<(v2i64 (AArch64NvCast (v4i32 FPR128:$src))), (v2i64 FPR128:$src)>;
+def : Pat<(v2f64 (AArch64NvCast (v4i32 FPR128:$src))), (v2f64 FPR128:$src)>;
def : Pat<(v16i8 (AArch64NvCast (v8i16 FPR128:$src))), (v16i8 FPR128:$src)>;
def : Pat<(v8i16 (AArch64NvCast (v8i16 FPR128:$src))), (v8i16 FPR128:$src)>;
def : Pat<(v8f16 (AArch64NvCast (v8i16 FPR128:$src))), (v8f16 FPR128:$src)>;
def : Pat<(v4i32 (AArch64NvCast (v8i16 FPR128:$src))), (v4i32 FPR128:$src)>;
def : Pat<(v2i64 (AArch64NvCast (v8i16 FPR128:$src))), (v2i64 FPR128:$src)>;
+def : Pat<(v4f32 (AArch64NvCast (v8i16 FPR128:$src))), (v4f32 FPR128:$src)>;
+def : Pat<(v2f64 (AArch64NvCast (v8i16 FPR128:$src))), (v2f64 FPR128:$src)>;
def : Pat<(v16i8 (AArch64NvCast (v16i8 FPR128:$src))), (v16i8 FPR128:$src)>;
def : Pat<(v8i16 (AArch64NvCast (v16i8 FPR128:$src))), (v8i16 FPR128:$src)>;
def : Pat<(v8f16 (AArch64NvCast (v16i8 FPR128:$src))), (v8f16 FPR128:$src)>;
def : Pat<(v4i32 (AArch64NvCast (v16i8 FPR128:$src))), (v4i32 FPR128:$src)>;
def : Pat<(v2i64 (AArch64NvCast (v16i8 FPR128:$src))), (v2i64 FPR128:$src)>;
+def : Pat<(v4f32 (AArch64NvCast (v16i8 FPR128:$src))), (v4f32 FPR128:$src)>;
+def : Pat<(v2f64 (AArch64NvCast (v16i8 FPR128:$src))), (v2f64 FPR128:$src)>;
def : Pat<(v16i8 (AArch64NvCast (v2i64 FPR128:$src))), (v16i8 FPR128:$src)>;
def : Pat<(v8i16 (AArch64NvCast (v2i64 FPR128:$src))), (v8i16 FPR128:$src)>;
@@ -5309,12 +5316,16 @@ def : Pat<(v8i16 (AArch64NvCast (v4f32 FPR128:$src))), (v8i16 FPR128:$src)>;
def : Pat<(v4i32 (AArch64NvCast (v4f32 FPR128:$src))), (v4i32 FPR128:$src)>;
def : Pat<(v4f32 (AArch64NvCast (v4f32 FPR128:$src))), (v4f32 FPR128:$src)>;
def : Pat<(v2i64 (AArch64NvCast (v4f32 FPR128:$src))), (v2i64 FPR128:$src)>;
+def : Pat<(v8f16 (AArch64NvCast (v4f32 FPR128:$src))), (v8f16 FPR128:$src)>;
+def : Pat<(v2f64 (AArch64NvCast (v4f32 FPR128:$src))), (v2f64 FPR128:$src)>;
def : Pat<(v16i8 (AArch64NvCast (v2f64 FPR128:$src))), (v16i8 FPR128:$src)>;
def : Pat<(v8i16 (AArch64NvCast (v2f64 FPR128:$src))), (v8i16 FPR128:$src)>;
def : Pat<(v4i32 (AArch64NvCast (v2f64 FPR128:$src))), (v4i32 FPR128:$src)>;
def : Pat<(v2i64 (AArch64NvCast (v2f64 FPR128:$src))), (v2i64 FPR128:$src)>;
def : Pat<(v2f64 (AArch64NvCast (v2f64 FPR128:$src))), (v2f64 FPR128:$src)>;
+def : Pat<(v8f16 (AArch64NvCast (v2f64 FPR128:$src))), (v8f16 FPR128:$src)>;
+def : Pat<(v4f32 (AArch64NvCast (v2f64 FPR128:$src))), (v4f32 FPR128:$src)>;
let Predicates = [IsLE] in {
def : Pat<(v8i8 (bitconvert GPR64:$Xn)), (COPY_TO_REGCLASS GPR64:$Xn, FPR64)>;
diff --git a/lib/Target/AArch64/AArch64RegisterInfo.cpp b/lib/Target/AArch64/AArch64RegisterInfo.cpp
index 1836682e386e..841af55f7a65 100644
--- a/lib/Target/AArch64/AArch64RegisterInfo.cpp
+++ b/lib/Target/AArch64/AArch64RegisterInfo.cpp
@@ -90,7 +90,7 @@ AArch64RegisterInfo::getThisReturnPreservedMask(const MachineFunction &MF,
BitVector
AArch64RegisterInfo::getReservedRegs(const MachineFunction &MF) const {
- const TargetFrameLowering *TFI = MF.getSubtarget().getFrameLowering();
+ const AArch64FrameLowering *TFI = getFrameLowering(MF);
// FIXME: avoid re-calculating this every time.
BitVector Reserved(getNumRegs());
@@ -119,7 +119,7 @@ AArch64RegisterInfo::getReservedRegs(const MachineFunction &MF) const {
bool AArch64RegisterInfo::isReservedReg(const MachineFunction &MF,
unsigned Reg) const {
- const TargetFrameLowering *TFI = MF.getSubtarget().getFrameLowering();
+ const AArch64FrameLowering *TFI = getFrameLowering(MF);
switch (Reg) {
default:
@@ -198,11 +198,9 @@ bool AArch64RegisterInfo::canRealignStack(const MachineFunction &MF) const {
bool
AArch64RegisterInfo::needsStackRealignment(const MachineFunction &MF) const {
const MachineFrameInfo *MFI = MF.getFrameInfo();
+ const AArch64FrameLowering *TFI = getFrameLowering(MF);
const Function *F = MF.getFunction();
- unsigned StackAlign = MF.getTarget()
- .getSubtargetImpl(*MF.getFunction())
- ->getFrameLowering()
- ->getStackAlignment();
+ unsigned StackAlign = TFI->getStackAlignment();
bool requiresRealignment =
((MFI->getMaxAlignment() > StackAlign) ||
F->getAttributes().hasAttribute(AttributeSet::FunctionIndex,
@@ -213,8 +211,7 @@ AArch64RegisterInfo::needsStackRealignment(const MachineFunction &MF) const {
unsigned
AArch64RegisterInfo::getFrameRegister(const MachineFunction &MF) const {
- const TargetFrameLowering *TFI = MF.getSubtarget().getFrameLowering();
-
+ const AArch64FrameLowering *TFI = getFrameLowering(MF);
return TFI->hasFP(MF) ? AArch64::FP : AArch64::SP;
}
@@ -280,7 +277,7 @@ bool AArch64RegisterInfo::needsFrameBaseReg(MachineInstr *MI,
// Note that the incoming offset is based on the SP value at function entry,
// so it'll be negative.
MachineFunction &MF = *MI->getParent()->getParent();
- const TargetFrameLowering *TFI = MF.getSubtarget().getFrameLowering();
+ const AArch64FrameLowering *TFI = getFrameLowering(MF);
MachineFrameInfo *MFI = MF.getFrameInfo();
// Estimate an offset from the frame pointer.
@@ -376,8 +373,7 @@ void AArch64RegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator II,
MachineFunction &MF = *MBB.getParent();
const AArch64InstrInfo *TII =
MF.getSubtarget<AArch64Subtarget>().getInstrInfo();
- const AArch64FrameLowering *TFI = static_cast<const AArch64FrameLowering *>(
- MF.getSubtarget().getFrameLowering());
+ const AArch64FrameLowering *TFI = getFrameLowering(MF);
int FrameIndex = MI.getOperand(FIOperandNum).getIndex();
unsigned FrameReg;
@@ -415,7 +411,7 @@ namespace llvm {
unsigned AArch64RegisterInfo::getRegPressureLimit(const TargetRegisterClass *RC,
MachineFunction &MF) const {
- const TargetFrameLowering *TFI = MF.getSubtarget().getFrameLowering();
+ const AArch64FrameLowering *TFI = getFrameLowering(MF);
switch (RC->getID()) {
default:
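Every frame-lowering lookup in this file now goes through getFrameLowering(MF), which hands back the AArch64FrameLowering directly. This is assumed to be the helper emitted into the generated AArch64GenRegisterInfo, wrapping the cast that the old code spelled out by hand; roughly:

    // Sketch of what getFrameLowering(MF) is assumed to do; the real helper
    // is generated, this is illustrative only.
    #include "AArch64FrameLowering.h"
    #include "llvm/CodeGen/MachineFunction.h"

    static const llvm::AArch64FrameLowering *
    getAArch64FrameLowering(const llvm::MachineFunction &MF) {
      return static_cast<const llvm::AArch64FrameLowering *>(
          MF.getSubtarget().getFrameLowering());
    }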
diff --git a/lib/Target/AArch64/AArch64SelectionDAGInfo.cpp b/lib/Target/AArch64/AArch64SelectionDAGInfo.cpp
index b9c53998752a..f40293021d74 100644
--- a/lib/Target/AArch64/AArch64SelectionDAGInfo.cpp
+++ b/lib/Target/AArch64/AArch64SelectionDAGInfo.cpp
@@ -16,11 +16,6 @@ using namespace llvm;
#define DEBUG_TYPE "aarch64-selectiondag-info"
-AArch64SelectionDAGInfo::AArch64SelectionDAGInfo(const DataLayout *DL)
- : TargetSelectionDAGInfo(DL) {}
-
-AArch64SelectionDAGInfo::~AArch64SelectionDAGInfo() {}
-
SDValue AArch64SelectionDAGInfo::EmitTargetCodeForMemset(
SelectionDAG &DAG, SDLoc dl, SDValue Chain, SDValue Dst, SDValue Src,
SDValue Size, unsigned Align, bool isVolatile,
@@ -37,8 +32,8 @@ SDValue AArch64SelectionDAGInfo::EmitTargetCodeForMemset(
if (bzeroEntry && (!SizeValue || SizeValue->getZExtValue() > 256)) {
const AArch64TargetLowering &TLI = *STI.getTargetLowering();
- EVT IntPtr = TLI.getPointerTy();
- Type *IntPtrTy = getDataLayout()->getIntPtrType(*DAG.getContext());
+ EVT IntPtr = TLI.getPointerTy(DAG.getDataLayout());
+ Type *IntPtrTy = DAG.getDataLayout().getIntPtrType(*DAG.getContext());
TargetLowering::ArgListTy Args;
TargetLowering::ArgListEntry Entry;
Entry.Node = Dst;
diff --git a/lib/Target/AArch64/AArch64SelectionDAGInfo.h b/lib/Target/AArch64/AArch64SelectionDAGInfo.h
index 11932d2b1c22..97421b45b122 100644
--- a/lib/Target/AArch64/AArch64SelectionDAGInfo.h
+++ b/lib/Target/AArch64/AArch64SelectionDAGInfo.h
@@ -20,8 +20,6 @@ namespace llvm {
class AArch64SelectionDAGInfo : public TargetSelectionDAGInfo {
public:
- explicit AArch64SelectionDAGInfo(const DataLayout *DL);
- ~AArch64SelectionDAGInfo();
SDValue EmitTargetCodeForMemset(SelectionDAG &DAG, SDLoc dl, SDValue Chain,
SDValue Dst, SDValue Src, SDValue Size,
diff --git a/lib/Target/AArch64/AArch64Subtarget.cpp b/lib/Target/AArch64/AArch64Subtarget.cpp
index 554826b1e08a..486efd6ce3a2 100644
--- a/lib/Target/AArch64/AArch64Subtarget.cpp
+++ b/lib/Target/AArch64/AArch64Subtarget.cpp
@@ -49,15 +49,15 @@ AArch64Subtarget::AArch64Subtarget(const Triple &TT, const std::string &CPU,
HasV8_1aOps(false), HasFPARMv8(false), HasNEON(false), HasCrypto(false),
HasCRC(false), HasZeroCycleRegMove(false), HasZeroCycleZeroing(false),
IsLittle(LittleEndian), CPUString(CPU), TargetTriple(TT), FrameLowering(),
- InstrInfo(initializeSubtargetDependencies(FS)),
- TSInfo(TM.getDataLayout()), TLInfo(TM, *this) {}
+ InstrInfo(initializeSubtargetDependencies(FS)), TSInfo(),
+ TLInfo(TM, *this) {}
/// ClassifyGlobalReference - Find the target operand flags that describe
/// how a global value should be referenced for the current subtarget.
unsigned char
AArch64Subtarget::ClassifyGlobalReference(const GlobalValue *GV,
const TargetMachine &TM) const {
- bool isDecl = GV->isDeclarationForLinker();
+ bool isDef = GV->isStrongDefinitionForLinker();
// MachO large model always goes via a GOT, simply to get a single 8-byte
// absolute relocation on all global addresses.
@@ -66,8 +66,7 @@ AArch64Subtarget::ClassifyGlobalReference(const GlobalValue *GV,
// The small code model's direct accesses use ADRP, which cannot necessarily
// produce the value 0 (if the code is above 4GB).
- if (TM.getCodeModel() == CodeModel::Small &&
- GV->isWeakForLinker() && isDecl) {
+ if (TM.getCodeModel() == CodeModel::Small && GV->hasExternalWeakLinkage()) {
// In PIC mode use the GOT, but in absolute mode use a constant pool load.
if (TM.getRelocationModel() == Reloc::Static)
return AArch64II::MO_CONSTPOOL;
@@ -85,8 +84,7 @@ AArch64Subtarget::ClassifyGlobalReference(const GlobalValue *GV,
// defined could end up in unexpected places. Use a GOT.
if (TM.getRelocationModel() != Reloc::Static && GV->hasDefaultVisibility()) {
if (isTargetMachO())
- return (isDecl || GV->isWeakForLinker()) ? AArch64II::MO_GOT
- : AArch64II::MO_NO_FLAG;
+ return isDef ? AArch64II::MO_NO_FLAG : AArch64II::MO_GOT;
else
// No need to go through the GOT for local symbols on ELF.
return GV->hasLocalLinkage() ? AArch64II::MO_NO_FLAG : AArch64II::MO_GOT;
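The classification now asks the positive question: only a strong definition is guaranteed to resolve locally, so everything else on MachO goes through the GOT, and only genuinely extern-weak globals need the small-code-model constant-pool path. For reference, GlobalValue::isStrongDefinitionForLinker() is assumed to reduce to:

    #include "llvm/IR/GlobalValue.h"

    // Assumed equivalence (sketch): a strong definition is a definition the
    // linker cannot replace.
    static bool isStrongDefinitionForLinker(const llvm::GlobalValue &GV) {
      return !GV.isDeclarationForLinker() && !GV.isWeakForLinker();
    }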
diff --git a/lib/Target/AArch64/AArch64TargetTransformInfo.cpp b/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
index fc91c94351cc..e085cca35f1c 100644
--- a/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
+++ b/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
@@ -181,8 +181,8 @@ unsigned AArch64TTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst,
int ISD = TLI->InstructionOpcodeToISD(Opcode);
assert(ISD && "Invalid opcode");
- EVT SrcTy = TLI->getValueType(Src);
- EVT DstTy = TLI->getValueType(Dst);
+ EVT SrcTy = TLI->getValueType(DL, Src);
+ EVT DstTy = TLI->getValueType(DL, Dst);
if (!SrcTy.isSimple() || !DstTy.isSimple())
return BaseT::getCastInstrCost(Opcode, Dst, Src);
@@ -265,7 +265,7 @@ unsigned AArch64TTIImpl::getVectorInstrCost(unsigned Opcode, Type *Val,
if (Index != -1U) {
// Legalize the type.
- std::pair<unsigned, MVT> LT = TLI->getTypeLegalizationCost(Val);
+ std::pair<unsigned, MVT> LT = TLI->getTypeLegalizationCost(DL, Val);
// This type is legalized to a scalar type.
if (!LT.second.isVector())
@@ -289,7 +289,7 @@ unsigned AArch64TTIImpl::getArithmeticInstrCost(
TTI::OperandValueKind Opd2Info, TTI::OperandValueProperties Opd1PropInfo,
TTI::OperandValueProperties Opd2PropInfo) {
// Legalize the type.
- std::pair<unsigned, MVT> LT = TLI->getTypeLegalizationCost(Ty);
+ std::pair<unsigned, MVT> LT = TLI->getTypeLegalizationCost(DL, Ty);
int ISD = TLI->InstructionOpcodeToISD(Opcode);
@@ -364,8 +364,8 @@ unsigned AArch64TTIImpl::getCmpSelInstrCost(unsigned Opcode, Type *ValTy,
{ ISD::SELECT, MVT::v16i1, MVT::v16i64, 16 * AmortizationCost }
};
- EVT SelCondTy = TLI->getValueType(CondTy);
- EVT SelValTy = TLI->getValueType(ValTy);
+ EVT SelCondTy = TLI->getValueType(DL, CondTy);
+ EVT SelValTy = TLI->getValueType(DL, ValTy);
if (SelCondTy.isSimple() && SelValTy.isSimple()) {
int Idx =
ConvertCostTableLookup(VectorSelectTbl, ISD, SelCondTy.getSimpleVT(),
@@ -380,7 +380,7 @@ unsigned AArch64TTIImpl::getCmpSelInstrCost(unsigned Opcode, Type *ValTy,
unsigned AArch64TTIImpl::getMemoryOpCost(unsigned Opcode, Type *Src,
unsigned Alignment,
unsigned AddressSpace) {
- std::pair<unsigned, MVT> LT = TLI->getTypeLegalizationCost(Src);
+ std::pair<unsigned, MVT> LT = TLI->getTypeLegalizationCost(DL, Src);
if (Opcode == Instruction::Store && Src->isVectorTy() && Alignment != 16 &&
Src->getVectorElementType()->isIntegerTy(64)) {
@@ -416,7 +416,7 @@ unsigned AArch64TTIImpl::getInterleavedMemoryOpCost(
if (Factor <= TLI->getMaxSupportedInterleaveFactor()) {
unsigned NumElts = VecTy->getVectorNumElements();
Type *SubVecTy = VectorType::get(VecTy->getScalarType(), NumElts / Factor);
- unsigned SubVecSize = TLI->getDataLayout()->getTypeAllocSize(SubVecTy);
+ unsigned SubVecSize = DL.getTypeAllocSize(SubVecTy);
// ldN/stN only support legal vector types of size 64 or 128 in bits.
if (NumElts % Factor == 0 && (SubVecSize == 64 || SubVecSize == 128))
diff --git a/lib/Target/AArch64/AArch64TargetTransformInfo.h b/lib/Target/AArch64/AArch64TargetTransformInfo.h
index 4dabdadd8eeb..444d3ccc15e1 100644
--- a/lib/Target/AArch64/AArch64TargetTransformInfo.h
+++ b/lib/Target/AArch64/AArch64TargetTransformInfo.h
@@ -31,7 +31,6 @@ class AArch64TTIImpl : public BasicTTIImplBase<AArch64TTIImpl> {
typedef TargetTransformInfo TTI;
friend BaseT;
- const AArch64TargetMachine *TM;
const AArch64Subtarget *ST;
const AArch64TargetLowering *TLI;
@@ -50,30 +49,15 @@ class AArch64TTIImpl : public BasicTTIImplBase<AArch64TTIImpl> {
public:
explicit AArch64TTIImpl(const AArch64TargetMachine *TM, Function &F)
- : BaseT(TM), TM(TM), ST(TM->getSubtargetImpl(F)),
+ : BaseT(TM, F.getParent()->getDataLayout()), ST(TM->getSubtargetImpl(F)),
TLI(ST->getTargetLowering()) {}
// Provide value semantics. MSVC requires that we spell all of these out.
AArch64TTIImpl(const AArch64TTIImpl &Arg)
- : BaseT(static_cast<const BaseT &>(Arg)), TM(Arg.TM), ST(Arg.ST),
- TLI(Arg.TLI) {}
+ : BaseT(static_cast<const BaseT &>(Arg)), ST(Arg.ST), TLI(Arg.TLI) {}
AArch64TTIImpl(AArch64TTIImpl &&Arg)
- : BaseT(std::move(static_cast<BaseT &>(Arg))), TM(std::move(Arg.TM)),
- ST(std::move(Arg.ST)), TLI(std::move(Arg.TLI)) {}
- AArch64TTIImpl &operator=(const AArch64TTIImpl &RHS) {
- BaseT::operator=(static_cast<const BaseT &>(RHS));
- TM = RHS.TM;
- ST = RHS.ST;
- TLI = RHS.TLI;
- return *this;
- }
- AArch64TTIImpl &operator=(AArch64TTIImpl &&RHS) {
- BaseT::operator=(std::move(static_cast<BaseT &>(RHS)));
- TM = std::move(RHS.TM);
- ST = std::move(RHS.ST);
- TLI = std::move(RHS.TLI);
- return *this;
- }
+ : BaseT(std::move(static_cast<BaseT &>(Arg))), ST(std::move(Arg.ST)),
+ TLI(std::move(Arg.TLI)) {}
/// \name Scalar TTI Implementations
/// @{
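With the DataLayout now held by BasicTTIImplBase, the hand-written copy and move assignment operators above become unnecessary boilerplate and are dropped. The call site is assumed to keep its existing shape, only forwarding the Function (compare the AMDGPUTargetMachine hunk later in this patch):

    // Assumed call-site shape; mirrors the AMDGPU change below.
    TargetIRAnalysis AArch64TargetMachine::getTargetIRAnalysis() {
      return TargetIRAnalysis([this](Function &F) {
        return TargetTransformInfo(AArch64TTIImpl(this, F));
      });
    }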
diff --git a/lib/Target/AArch64/Disassembler/AArch64Disassembler.cpp b/lib/Target/AArch64/Disassembler/AArch64Disassembler.cpp
index 359c2e734e21..db9fb0e775df 100644
--- a/lib/Target/AArch64/Disassembler/AArch64Disassembler.cpp
+++ b/lib/Target/AArch64/Disassembler/AArch64Disassembler.cpp
@@ -228,7 +228,7 @@ DecodeStatus AArch64Disassembler::getInstruction(MCInst &MI, uint64_t &Size,
}
static MCSymbolizer *
-createAArch64ExternalSymbolizer(StringRef TT, LLVMOpInfoCallback GetOpInfo,
+createAArch64ExternalSymbolizer(const Triple &TT, LLVMOpInfoCallback GetOpInfo,
LLVMSymbolLookupCallback SymbolLookUp,
void *DisInfo, MCContext *Ctx,
std::unique_ptr<MCRelocationInfo> &&RelInfo) {
diff --git a/lib/Target/AArch64/MCTargetDesc/AArch64ELFStreamer.cpp b/lib/Target/AArch64/MCTargetDesc/AArch64ELFStreamer.cpp
index b5b1d1f9e19c..16d53569b231 100644
--- a/lib/Target/AArch64/MCTargetDesc/AArch64ELFStreamer.cpp
+++ b/lib/Target/AArch64/MCTargetDesc/AArch64ELFStreamer.cpp
@@ -199,7 +199,7 @@ MCELFStreamer *createAArch64ELFStreamer(MCContext &Context, MCAsmBackend &TAB,
MCTargetStreamer *
createAArch64ObjectTargetStreamer(MCStreamer &S, const MCSubtargetInfo &STI) {
const Triple &TT = STI.getTargetTriple();
- if (TT.getObjectFormat() == Triple::ELF)
+ if (TT.isOSBinFormatELF())
return new AArch64TargetELFStreamer(S);
return nullptr;
}
diff --git a/lib/Target/AArch64/MCTargetDesc/AArch64MCTargetDesc.cpp b/lib/Target/AArch64/MCTargetDesc/AArch64MCTargetDesc.cpp
index 099d1b01c339..9f7bed0d3b12 100644
--- a/lib/Target/AArch64/MCTargetDesc/AArch64MCTargetDesc.cpp
+++ b/lib/Target/AArch64/MCTargetDesc/AArch64MCTargetDesc.cpp
@@ -42,16 +42,13 @@ static MCInstrInfo *createAArch64MCInstrInfo() {
static MCSubtargetInfo *
createAArch64MCSubtargetInfo(const Triple &TT, StringRef CPU, StringRef FS) {
- MCSubtargetInfo *X = new MCSubtargetInfo();
-
if (CPU.empty())
CPU = "generic";
- InitAArch64MCSubtargetInfo(X, TT, CPU, FS);
- return X;
+ return createAArch64MCSubtargetInfoImpl(TT, CPU, FS);
}
-static MCRegisterInfo *createAArch64MCRegisterInfo(StringRef Triple) {
+static MCRegisterInfo *createAArch64MCRegisterInfo(const Triple &Triple) {
MCRegisterInfo *X = new MCRegisterInfo();
InitAArch64MCRegisterInfo(X, AArch64::LR);
return X;
@@ -75,11 +72,11 @@ static MCAsmInfo *createAArch64MCAsmInfo(const MCRegisterInfo &MRI,
return MAI;
}
-static MCCodeGenInfo *createAArch64MCCodeGenInfo(StringRef TT, Reloc::Model RM,
+static MCCodeGenInfo *createAArch64MCCodeGenInfo(const Triple &TT,
+ Reloc::Model RM,
CodeModel::Model CM,
CodeGenOpt::Level OL) {
- Triple TheTriple(TT);
- assert((TheTriple.isOSBinFormatELF() || TheTriple.isOSBinFormatMachO()) &&
+ assert((TT.isOSBinFormatELF() || TT.isOSBinFormatMachO()) &&
"Only expect Darwin and ELF targets");
if (CM == CodeModel::Default)
@@ -94,7 +91,7 @@ static MCCodeGenInfo *createAArch64MCCodeGenInfo(StringRef TT, Reloc::Model RM,
"Only small and large code models are allowed on AArch64");
// AArch64 Darwin is always PIC.
- if (TheTriple.isOSDarwin())
+ if (TT.isOSDarwin())
RM = Reloc::PIC_;
// On ELF platforms the default static relocation model has a smart enough
// linker to cope with referencing external symbols defined in a shared
diff --git a/lib/Target/AMDGPU/AMDGPU.td b/lib/Target/AMDGPU/AMDGPU.td
index 569ad3844b25..ef8ef6268548 100644
--- a/lib/Target/AMDGPU/AMDGPU.td
+++ b/lib/Target/AMDGPU/AMDGPU.td
@@ -98,6 +98,16 @@ def FeatureEnableLoadStoreOpt : SubtargetFeature <"load-store-opt",
"true",
"Enable SI load/store optimizer pass">;
+// Performance debugging feature. Allow using DS instruction immediate
+// offsets even if the base pointer can't be proven to be base. On SI,
+// base pointer values that won't give the same result as a 16-bit add
+// are not safe to fold, but this will override the conservative test
+// for the base pointer.
+def FeatureEnableUnsafeDSOffsetFolding : SubtargetFeature <"unsafe-ds-offset-folding",
+ "EnableUnsafeDSOffsetFolding",
+ "true",
+ "Force using DS instruction immediate offsets on SI">;
+
def FeatureFlatAddressSpace : SubtargetFeature<"flat-address-space",
"FlatAddressSpace",
"true",
diff --git a/lib/Target/AMDGPU/AMDGPUAlwaysInlinePass.cpp b/lib/Target/AMDGPU/AMDGPUAlwaysInlinePass.cpp
index 0b426bc63dd5..ad267d350850 100644
--- a/lib/Target/AMDGPU/AMDGPUAlwaysInlinePass.cpp
+++ b/lib/Target/AMDGPU/AMDGPUAlwaysInlinePass.cpp
@@ -22,7 +22,6 @@ using namespace llvm;
namespace {
class AMDGPUAlwaysInline : public ModulePass {
-
static char ID;
public:
@@ -36,10 +35,9 @@ public:
char AMDGPUAlwaysInline::ID = 0;
bool AMDGPUAlwaysInline::runOnModule(Module &M) {
+ std::vector<Function *> FuncsToClone;
- std::vector<Function*> FuncsToClone;
- for (Module::iterator I = M.begin(), E = M.end(); I != E; ++I) {
- Function &F = *I;
+ for (Function &F : M) {
if (!F.hasLocalLinkage() && !F.isDeclaration() && !F.use_empty() &&
!F.hasFnAttribute(Attribute::NoInline))
FuncsToClone.push_back(&F);
@@ -49,12 +47,11 @@ bool AMDGPUAlwaysInline::runOnModule(Module &M) {
ValueToValueMapTy VMap;
Function *NewFunc = CloneFunction(F, VMap, false);
NewFunc->setLinkage(GlobalValue::InternalLinkage);
- F->getParent()->getFunctionList().push_back(NewFunc);
+ M.getFunctionList().push_back(NewFunc);
F->replaceAllUsesWith(NewFunc);
}
- for (Module::iterator I = M.begin(), E = M.end(); I != E; ++I) {
- Function &F = *I;
+ for (Function &F : M) {
if (F.hasLocalLinkage() && !F.hasFnAttribute(Attribute::NoInline)) {
F.addFnAttr(Attribute::AlwaysInline);
}
diff --git a/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp b/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
index df4461eac4db..37b77d778d9f 100644
--- a/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
+++ b/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
@@ -110,8 +110,11 @@ private:
SDValue &Offset, SDValue &GLC) const;
SDNode *SelectAddrSpaceCast(SDNode *N);
bool SelectVOP3Mods(SDValue In, SDValue &Src, SDValue &SrcMods) const;
+ bool SelectVOP3NoMods(SDValue In, SDValue &Src, SDValue &SrcMods) const;
bool SelectVOP3Mods0(SDValue In, SDValue &Src, SDValue &SrcMods,
SDValue &Clamp, SDValue &Omod) const;
+ bool SelectVOP3NoMods0(SDValue In, SDValue &Src, SDValue &SrcMods,
+ SDValue &Clamp, SDValue &Omod) const;
bool SelectVOP3Mods0Clamp(SDValue In, SDValue &Src, SDValue &SrcMods,
SDValue &Omod) const;
@@ -859,7 +862,8 @@ bool AMDGPUDAGToDAGISel::isDSOffsetLegal(const SDValue &Base, unsigned Offset,
(OffsetBits == 8 && !isUInt<8>(Offset)))
return false;
- if (Subtarget->getGeneration() >= AMDGPUSubtarget::SEA_ISLANDS)
+ if (Subtarget->getGeneration() >= AMDGPUSubtarget::SEA_ISLANDS ||
+ Subtarget->unsafeDSOffsetFoldingEnabled())
return true;
// On Southern Islands instruction with a negative base value and an offset
@@ -1316,6 +1320,12 @@ bool AMDGPUDAGToDAGISel::SelectVOP3Mods(SDValue In, SDValue &Src,
return true;
}
+bool AMDGPUDAGToDAGISel::SelectVOP3NoMods(SDValue In, SDValue &Src,
+ SDValue &SrcMods) const {
+ bool Res = SelectVOP3Mods(In, Src, SrcMods);
+ return Res && cast<ConstantSDNode>(SrcMods)->isNullValue();
+}
+
bool AMDGPUDAGToDAGISel::SelectVOP3Mods0(SDValue In, SDValue &Src,
SDValue &SrcMods, SDValue &Clamp,
SDValue &Omod) const {
@@ -1327,6 +1337,16 @@ bool AMDGPUDAGToDAGISel::SelectVOP3Mods0(SDValue In, SDValue &Src,
return SelectVOP3Mods(In, Src, SrcMods);
}
+bool AMDGPUDAGToDAGISel::SelectVOP3NoMods0(SDValue In, SDValue &Src,
+ SDValue &SrcMods, SDValue &Clamp,
+ SDValue &Omod) const {
+ bool Res = SelectVOP3Mods0(In, Src, SrcMods, Clamp, Omod);
+
+ return Res && cast<ConstantSDNode>(SrcMods)->isNullValue() &&
+ cast<ConstantSDNode>(Clamp)->isNullValue() &&
+ cast<ConstantSDNode>(Omod)->isNullValue();
+}
+
bool AMDGPUDAGToDAGISel::SelectVOP3Mods0Clamp(SDValue In, SDValue &Src,
SDValue &SrcMods,
SDValue &Omod) const {
@@ -1351,18 +1371,14 @@ void AMDGPUDAGToDAGISel::PostprocessISelDAG() {
do {
IsModified = false;
// Go over all selected nodes and try to fold them a bit more
- for (SelectionDAG::allnodes_iterator I = CurDAG->allnodes_begin(),
- E = CurDAG->allnodes_end(); I != E; ++I) {
-
- SDNode *Node = I;
-
- MachineSDNode *MachineNode = dyn_cast<MachineSDNode>(I);
+ for (SDNode &Node : CurDAG->allnodes()) {
+ MachineSDNode *MachineNode = dyn_cast<MachineSDNode>(&Node);
if (!MachineNode)
continue;
SDNode *ResNode = Lowering.PostISelFolding(MachineNode, *CurDAG);
- if (ResNode != Node) {
- ReplaceUses(Node, ResNode);
+ if (ResNode != &Node) {
+ ReplaceUses(&Node, ResNode);
IsModified = true;
}
}
diff --git a/lib/Target/AMDGPU/AMDGPUISelLowering.cpp b/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
index d56838ec2019..3a65f3b56146 100644
--- a/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
+++ b/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
@@ -406,6 +406,7 @@ AMDGPUTargetLowering::AMDGPUTargetLowering(TargetMachine &TM,
setOperationAction(ISD::FNEARBYINT, MVT::f32, Custom);
setOperationAction(ISD::FNEARBYINT, MVT::f64, Custom);
+ setTargetDAGCombine(ISD::SHL);
setTargetDAGCombine(ISD::MUL);
setTargetDAGCombine(ISD::SELECT);
setTargetDAGCombine(ISD::SELECT_CC);
@@ -444,7 +445,7 @@ AMDGPUTargetLowering::AMDGPUTargetLowering(TargetMachine &TM,
// Target Information
//===----------------------------------------------------------------------===//
-MVT AMDGPUTargetLowering::getVectorIdxTy() const {
+MVT AMDGPUTargetLowering::getVectorIdxTy(const DataLayout &) const {
return MVT::i32;
}
@@ -545,9 +546,8 @@ bool AMDGPUTargetLowering::isTruncateFree(Type *Source, Type *Dest) const {
}
bool AMDGPUTargetLowering::isZExtFree(Type *Src, Type *Dest) const {
- const DataLayout *DL = getDataLayout();
- unsigned SrcSize = DL->getTypeSizeInBits(Src->getScalarType());
- unsigned DestSize = DL->getTypeSizeInBits(Dest->getScalarType());
+ unsigned SrcSize = Src->getScalarSizeInBits();
+ unsigned DestSize = Dest->getScalarSizeInBits();
return SrcSize == 32 && DestSize == 64;
}
@@ -697,7 +697,7 @@ SDValue AMDGPUTargetLowering::LowerConstantInitializer(const Constant* Init,
const SDValue &InitPtr,
SDValue Chain,
SelectionDAG &DAG) const {
- const DataLayout *TD = getDataLayout();
+ const DataLayout &TD = DAG.getDataLayout();
SDLoc DL(InitPtr);
Type *InitTy = Init->getType();
@@ -705,20 +705,20 @@ SDValue AMDGPUTargetLowering::LowerConstantInitializer(const Constant* Init,
EVT VT = EVT::getEVT(InitTy);
PointerType *PtrTy = PointerType::get(InitTy, AMDGPUAS::PRIVATE_ADDRESS);
return DAG.getStore(Chain, DL, DAG.getConstant(*CI, DL, VT), InitPtr,
- MachinePointerInfo(UndefValue::get(PtrTy)), false, false,
- TD->getPrefTypeAlignment(InitTy));
+ MachinePointerInfo(UndefValue::get(PtrTy)), false,
+ false, TD.getPrefTypeAlignment(InitTy));
}
if (const ConstantFP *CFP = dyn_cast<ConstantFP>(Init)) {
EVT VT = EVT::getEVT(CFP->getType());
PointerType *PtrTy = PointerType::get(CFP->getType(), 0);
return DAG.getStore(Chain, DL, DAG.getConstantFP(*CFP, DL, VT), InitPtr,
- MachinePointerInfo(UndefValue::get(PtrTy)), false, false,
- TD->getPrefTypeAlignment(CFP->getType()));
+ MachinePointerInfo(UndefValue::get(PtrTy)), false,
+ false, TD.getPrefTypeAlignment(CFP->getType()));
}
if (StructType *ST = dyn_cast<StructType>(InitTy)) {
- const StructLayout *SL = TD->getStructLayout(ST);
+ const StructLayout *SL = TD.getStructLayout(ST);
EVT PtrVT = InitPtr.getValueType();
SmallVector<SDValue, 8> Chains;
@@ -745,7 +745,7 @@ SDValue AMDGPUTargetLowering::LowerConstantInitializer(const Constant* Init,
else
llvm_unreachable("Unexpected type");
- unsigned EltSize = TD->getTypeAllocSize(SeqTy->getElementType());
+ unsigned EltSize = TD.getTypeAllocSize(SeqTy->getElementType());
SmallVector<SDValue, 8> Chains;
for (unsigned i = 0; i < NumElements; ++i) {
SDValue Offset = DAG.getConstant(i * EltSize, DL, PtrVT);
@@ -762,8 +762,8 @@ SDValue AMDGPUTargetLowering::LowerConstantInitializer(const Constant* Init,
EVT VT = EVT::getEVT(InitTy);
PointerType *PtrTy = PointerType::get(InitTy, AMDGPUAS::PRIVATE_ADDRESS);
return DAG.getStore(Chain, DL, DAG.getUNDEF(VT), InitPtr,
- MachinePointerInfo(UndefValue::get(PtrTy)), false, false,
- TD->getPrefTypeAlignment(InitTy));
+ MachinePointerInfo(UndefValue::get(PtrTy)), false,
+ false, TD.getPrefTypeAlignment(InitTy));
}
Init->dump();
@@ -785,7 +785,7 @@ SDValue AMDGPUTargetLowering::LowerGlobalAddress(AMDGPUMachineFunction* MFI,
SDValue Op,
SelectionDAG &DAG) const {
- const DataLayout *TD = getDataLayout();
+ const DataLayout &DL = DAG.getDataLayout();
GlobalAddressSDNode *G = cast<GlobalAddressSDNode>(Op);
const GlobalValue *GV = G->getGlobal();
@@ -801,7 +801,7 @@ SDValue AMDGPUTargetLowering::LowerGlobalAddress(AMDGPUMachineFunction* MFI,
unsigned Offset;
if (MFI->LocalMemoryObjects.count(GV) == 0) {
- uint64_t Size = TD->getTypeAllocSize(GV->getType()->getElementType());
+ uint64_t Size = DL.getTypeAllocSize(GV->getType()->getElementType());
Offset = MFI->LDSSize;
MFI->LocalMemoryObjects[GV] = Offset;
// XXX: Account for alignment?
@@ -811,16 +811,16 @@ SDValue AMDGPUTargetLowering::LowerGlobalAddress(AMDGPUMachineFunction* MFI,
}
return DAG.getConstant(Offset, SDLoc(Op),
- getPointerTy(AMDGPUAS::LOCAL_ADDRESS));
+ getPointerTy(DL, AMDGPUAS::LOCAL_ADDRESS));
}
case AMDGPUAS::CONSTANT_ADDRESS: {
MachineFrameInfo *FrameInfo = DAG.getMachineFunction().getFrameInfo();
Type *EltType = GV->getType()->getElementType();
- unsigned Size = TD->getTypeAllocSize(EltType);
- unsigned Alignment = TD->getPrefTypeAlignment(EltType);
+ unsigned Size = DL.getTypeAllocSize(EltType);
+ unsigned Alignment = DL.getPrefTypeAlignment(EltType);
- MVT PrivPtrVT = getPointerTy(AMDGPUAS::PRIVATE_ADDRESS);
- MVT ConstPtrVT = getPointerTy(AMDGPUAS::CONSTANT_ADDRESS);
+ MVT PrivPtrVT = getPointerTy(DL, AMDGPUAS::PRIVATE_ADDRESS);
+ MVT ConstPtrVT = getPointerTy(DL, AMDGPUAS::CONSTANT_ADDRESS);
int FI = FrameInfo->CreateStackObject(Size, Alignment, false);
SDValue InitPtr = DAG.getFrameIndex(FI, PrivPtrVT);
@@ -1653,7 +1653,7 @@ SDValue AMDGPUTargetLowering::LowerDIVREM24(SDValue Op, SelectionDAG &DAG, bool
// fb = fabs(fb);
fb = DAG.getNode(ISD::FABS, DL, FltVT, fb);
- EVT SetCCVT = getSetCCResultType(*DAG.getContext(), VT);
+ EVT SetCCVT = getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT);
// int cv = fr >= fb;
SDValue cv = DAG.getSetCC(DL, SetCCVT, fr, fb, ISD::SETOGE);
@@ -1960,7 +1960,8 @@ SDValue AMDGPUTargetLowering::LowerFCEIL(SDValue Op, SelectionDAG &DAG) const {
const SDValue Zero = DAG.getConstantFP(0.0, SL, MVT::f64);
const SDValue One = DAG.getConstantFP(1.0, SL, MVT::f64);
- EVT SetCCVT = getSetCCResultType(*DAG.getContext(), MVT::f64);
+ EVT SetCCVT =
+ getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), MVT::f64);
SDValue Lt0 = DAG.getSetCC(SL, SetCCVT, Src, Zero, ISD::SETOGT);
SDValue NeTrunc = DAG.getSetCC(SL, SetCCVT, Src, Trunc, ISD::SETONE);
@@ -2020,7 +2021,8 @@ SDValue AMDGPUTargetLowering::LowerFTRUNC(SDValue Op, SelectionDAG &DAG) const {
SDValue Not = DAG.getNOT(SL, Shr, MVT::i64);
SDValue Tmp0 = DAG.getNode(ISD::AND, SL, MVT::i64, BcInt, Not);
- EVT SetCCVT = getSetCCResultType(*DAG.getContext(), MVT::i32);
+ EVT SetCCVT =
+ getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), MVT::i32);
const SDValue FiftyOne = DAG.getConstant(FractBits - 1, SL, MVT::i32);
@@ -2051,7 +2053,8 @@ SDValue AMDGPUTargetLowering::LowerFRINT(SDValue Op, SelectionDAG &DAG) const {
APFloat C2Val(APFloat::IEEEdouble, "0x1.fffffffffffffp+51");
SDValue C2 = DAG.getConstantFP(C2Val, SL, MVT::f64);
- EVT SetCCVT = getSetCCResultType(*DAG.getContext(), MVT::f64);
+ EVT SetCCVT =
+ getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), MVT::f64);
SDValue Cond = DAG.getSetCC(SL, SetCCVT, Fabs, C2, ISD::SETOGT);
return DAG.getSelect(SL, MVT::f64, Cond, Src, Tmp2);
@@ -2081,7 +2084,8 @@ SDValue AMDGPUTargetLowering::LowerFROUND32(SDValue Op, SelectionDAG &DAG) const
SDValue SignOne = DAG.getNode(ISD::FCOPYSIGN, SL, MVT::f32, One, X);
- EVT SetCCVT = getSetCCResultType(*DAG.getContext(), MVT::f32);
+ EVT SetCCVT =
+ getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), MVT::f32);
SDValue Cmp = DAG.getSetCC(SL, SetCCVT, AbsDiff, Half, ISD::SETOGE);
@@ -2100,8 +2104,8 @@ SDValue AMDGPUTargetLowering::LowerFROUND64(SDValue Op, SelectionDAG &DAG) const
const SDValue One = DAG.getConstant(1, SL, MVT::i32);
const SDValue NegOne = DAG.getConstant(-1, SL, MVT::i32);
const SDValue FiftyOne = DAG.getConstant(51, SL, MVT::i32);
- EVT SetCCVT = getSetCCResultType(*DAG.getContext(), MVT::i32);
-
+ EVT SetCCVT =
+ getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), MVT::i32);
SDValue BC = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, X);
@@ -2172,7 +2176,8 @@ SDValue AMDGPUTargetLowering::LowerFFLOOR(SDValue Op, SelectionDAG &DAG) const {
const SDValue Zero = DAG.getConstantFP(0.0, SL, MVT::f64);
const SDValue NegOne = DAG.getConstantFP(-1.0, SL, MVT::f64);
- EVT SetCCVT = getSetCCResultType(*DAG.getContext(), MVT::f64);
+ EVT SetCCVT =
+ getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), MVT::f64);
SDValue Lt0 = DAG.getSetCC(SL, SetCCVT, Src, Zero, ISD::SETOLT);
SDValue NeTrunc = DAG.getSetCC(SL, SetCCVT, Src, Trunc, ISD::SETONE);
@@ -2411,6 +2416,33 @@ SDValue AMDGPUTargetLowering::performStoreCombine(SDNode *N,
SN->getBasePtr(), SN->getMemOperand());
}
+SDValue AMDGPUTargetLowering::performShlCombine(SDNode *N,
+ DAGCombinerInfo &DCI) const {
+ if (N->getValueType(0) != MVT::i64)
+ return SDValue();
+
+ // i64 (shl x, 32) -> (build_pair 0, x)
+
+ // Doing this with moves theoretically helps MI optimizations that understand
+ // copies. 2 v_mov_b32_e32 will have the same code size / cycle count as
+ // v_lshl_b64. In the SALU case, I think this is slightly worse since it
+ // doubles the code size and I'm unsure about cycle count.
+ const ConstantSDNode *RHS = dyn_cast<ConstantSDNode>(N->getOperand(1));
+ if (!RHS || RHS->getZExtValue() != 32)
+ return SDValue();
+
+ SDValue LHS = N->getOperand(0);
+
+ SDLoc SL(N);
+ SelectionDAG &DAG = DCI.DAG;
+
+ // Extract low 32-bits.
+ SDValue Lo = DAG.getNode(ISD::TRUNCATE, SL, MVT::i32, LHS);
+
+ const SDValue Zero = DAG.getConstant(0, SL, MVT::i32);
+ return DAG.getNode(ISD::BUILD_PAIR, SL, MVT::i64, Zero, Lo);
+}
+
SDValue AMDGPUTargetLowering::performMulCombine(SDNode *N,
DAGCombinerInfo &DCI) const {
EVT VT = N->getValueType(0);
@@ -2448,17 +2480,24 @@ SDValue AMDGPUTargetLowering::PerformDAGCombine(SDNode *N,
SDLoc DL(N);
switch(N->getOpcode()) {
- default: break;
- case ISD::MUL:
- return performMulCombine(N, DCI);
- case AMDGPUISD::MUL_I24:
- case AMDGPUISD::MUL_U24: {
- SDValue N0 = N->getOperand(0);
- SDValue N1 = N->getOperand(1);
- simplifyI24(N0, DCI);
- simplifyI24(N1, DCI);
- return SDValue();
- }
+ default:
+ break;
+ case ISD::SHL: {
+ if (DCI.getDAGCombineLevel() < AfterLegalizeDAG)
+ break;
+
+ return performShlCombine(N, DCI);
+ }
+ case ISD::MUL:
+ return performMulCombine(N, DCI);
+ case AMDGPUISD::MUL_I24:
+ case AMDGPUISD::MUL_U24: {
+ SDValue N0 = N->getOperand(0);
+ SDValue N1 = N->getOperand(1);
+ simplifyI24(N0, DCI);
+ simplifyI24(N1, DCI);
+ return SDValue();
+ }
case ISD::SELECT: {
SDValue Cond = N->getOperand(0);
if (Cond.getOpcode() == ISD::SETCC && Cond.hasOneUse()) {
@@ -2644,6 +2683,18 @@ SDValue AMDGPUTargetLowering::CreateLiveInRegister(SelectionDAG &DAG,
return DAG.getRegister(VirtualRegister, VT);
}
+uint32_t AMDGPUTargetLowering::getImplicitParameterOffset(
+ const AMDGPUMachineFunction *MFI, const ImplicitParameter Param) const {
+ uint64_t ArgOffset = MFI->ABIArgOffset;
+ switch (Param) {
+ case GRID_DIM:
+ return ArgOffset;
+ case GRID_OFFSET:
+ return ArgOffset + 4;
+ }
+ llvm_unreachable("unexpected implicit parameter type");
+}
+
#define NODE_NAME_CASE(node) case AMDGPUISD::node: return #node;
const char* AMDGPUTargetLowering::getTargetNodeName(unsigned Opcode) const {
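A host-side check of the identity behind the performShlCombine added above: shifting an i64 left by 32 leaves zero in the low half and the low 32 bits of the source in the high half, which is exactly what BUILD_PAIR(0, lo) constructs, since BUILD_PAIR takes the low part first. A minimal sketch:

    #include <cstdint>

    // Same identity the DAG combine relies on, checked at compile time.
    constexpr uint64_t buildPair(uint32_t Lo, uint32_t Hi) {
      return (uint64_t(Hi) << 32) | Lo;
    }
    static_assert(buildPair(0, 0xDEADBEEFu) == (uint64_t(0xDEADBEEFu) << 32),
                  "i64 (shl x, 32) == build_pair(0, trunc-to-i32(x))");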
diff --git a/lib/Target/AMDGPU/AMDGPUISelLowering.h b/lib/Target/AMDGPU/AMDGPUISelLowering.h
index fbb7d3c88437..478b2035fd75 100644
--- a/lib/Target/AMDGPU/AMDGPUISelLowering.h
+++ b/lib/Target/AMDGPU/AMDGPUISelLowering.h
@@ -65,6 +65,7 @@ private:
SDValue LowerSIGN_EXTEND_INREG(SDValue Op, SelectionDAG &DAG) const;
SDValue performStoreCombine(SDNode *N, DAGCombinerInfo &DCI) const;
+ SDValue performShlCombine(SDNode *N, DAGCombinerInfo &DCI) const;
SDValue performMulCombine(SDNode *N, DAGCombinerInfo &DCI) const;
protected:
@@ -123,7 +124,7 @@ public:
bool isNarrowingProfitable(EVT VT1, EVT VT2) const override;
- MVT getVectorIdxTy() const override;
+ MVT getVectorIdxTy(const DataLayout &) const override;
bool isSelectSupported(SelectSupportKind) const override;
bool isFPImmLegal(const APFloat &Imm, EVT VT) const override;
@@ -207,6 +208,16 @@ public:
virtual SDValue CreateLiveInRegister(SelectionDAG &DAG,
const TargetRegisterClass *RC,
unsigned Reg, EVT VT) const;
+
+ enum ImplicitParameter {
+ GRID_DIM,
+ GRID_OFFSET
+ };
+
+ /// \brief Helper function that returns the byte offset of the given
+ /// type of implicit parameter.
+ unsigned getImplicitParameterOffset(const AMDGPUMachineFunction *MFI,
+ const ImplicitParameter Param) const;
};
namespace AMDGPUISD {
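The new enum plus helper give the implicit kernel-argument offsets a single home: GRID_DIM sits at the ABI argument offset and GRID_OFFSET four bytes past it. A hypothetical call site (the real one is in the R600ISelLowering.cpp hunk further down):

    // Illustrative only; offsets follow the switch in the .cpp hunk above.
    uint32_t DimOff  = getImplicitParameterOffset(MFI, GRID_DIM);    // ABIArgOffset
    uint32_t GridOff = getImplicitParameterOffset(MFI, GRID_OFFSET); // ABIArgOffset + 4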
diff --git a/lib/Target/AMDGPU/AMDGPUSubtarget.cpp b/lib/Target/AMDGPU/AMDGPUSubtarget.cpp
index 0779d1d786b2..bd5abc4f546e 100644
--- a/lib/Target/AMDGPU/AMDGPUSubtarget.cpp
+++ b/lib/Target/AMDGPU/AMDGPUSubtarget.cpp
@@ -69,6 +69,7 @@ AMDGPUSubtarget::AMDGPUSubtarget(const Triple &TT, StringRef GPU, StringRef FS,
FP64Denormals(false), FP32Denormals(false), FastFMAF32(false),
CaymanISA(false), FlatAddressSpace(false), EnableIRStructurizer(true),
EnablePromoteAlloca(false), EnableIfCvt(true), EnableLoadStoreOpt(false),
+ EnableUnsafeDSOffsetFolding(false),
WavefrontSize(0), CFALUBug(false), LocalMemorySize(0),
EnableVGPRSpilling(false), SGPRInitBug(false), IsGCN(false),
GCN1Encoding(false), GCN3Encoding(false), CIInsts(false), LDSBankCount(0),
diff --git a/lib/Target/AMDGPU/AMDGPUSubtarget.h b/lib/Target/AMDGPU/AMDGPUSubtarget.h
index 30f50eb1d2f3..90831bfb4458 100644
--- a/lib/Target/AMDGPU/AMDGPUSubtarget.h
+++ b/lib/Target/AMDGPU/AMDGPUSubtarget.h
@@ -76,6 +76,7 @@ private:
bool EnablePromoteAlloca;
bool EnableIfCvt;
bool EnableLoadStoreOpt;
+ bool EnableUnsafeDSOffsetFolding;
unsigned WavefrontSize;
bool CFALUBug;
int LocalMemorySize;
@@ -222,6 +223,10 @@ public:
return EnableLoadStoreOpt;
}
+ bool unsafeDSOffsetFoldingEnabled() const {
+ return EnableUnsafeDSOffsetFolding;
+ }
+
unsigned getWavefrontSize() const {
return WavefrontSize;
}
diff --git a/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp b/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
index a9a911a8efed..2297b52b423c 100644
--- a/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
+++ b/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
@@ -156,8 +156,10 @@ public:
} // End of anonymous namespace
TargetIRAnalysis AMDGPUTargetMachine::getTargetIRAnalysis() {
- return TargetIRAnalysis(
- [this](Function &F) { return TargetTransformInfo(AMDGPUTTIImpl(this)); });
+ return TargetIRAnalysis([this](Function &F) {
+ return TargetTransformInfo(
+ AMDGPUTTIImpl(this, F.getParent()->getDataLayout()));
+ });
}
void AMDGPUPassConfig::addIRPasses() {
@@ -269,6 +271,7 @@ void GCNPassConfig::addPreRegAlloc() {
// also need extra copies to the address operand to be eliminated.
initializeSILoadStoreOptimizerPass(*PassRegistry::getPassRegistry());
insertPass(&MachineSchedulerID, &SILoadStoreOptimizerID);
+ insertPass(&MachineSchedulerID, &RegisterCoalescerID);
}
addPass(createSIShrinkInstructionsPass(), false);
addPass(createSIFixSGPRLiveRangesPass(), false);
@@ -280,10 +283,10 @@ void GCNPassConfig::addPostRegAlloc() {
}
void GCNPassConfig::addPreSched2() {
- addPass(createSIInsertWaits(*TM), false);
}
void GCNPassConfig::addPreEmitPass() {
+ addPass(createSIInsertWaits(*TM), false);
addPass(createSILowerControlFlowPass(*TM), false);
}
diff --git a/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h b/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h
index 791c84e6f28b..dee0a69d1e68 100644
--- a/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h
+++ b/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h
@@ -37,8 +37,9 @@ class AMDGPUTTIImpl : public BasicTTIImplBase<AMDGPUTTIImpl> {
const AMDGPUTargetLowering *getTLI() const { return TLI; }
public:
- explicit AMDGPUTTIImpl(const AMDGPUTargetMachine *TM)
- : BaseT(TM), ST(TM->getSubtargetImpl()), TLI(ST->getTargetLowering()) {}
+ explicit AMDGPUTTIImpl(const AMDGPUTargetMachine *TM, const DataLayout &DL)
+ : BaseT(TM, DL), ST(TM->getSubtargetImpl()),
+ TLI(ST->getTargetLowering()) {}
// Provide value semantics. MSVC requires that we spell all of these out.
AMDGPUTTIImpl(const AMDGPUTTIImpl &Arg)
@@ -46,18 +47,6 @@ public:
AMDGPUTTIImpl(AMDGPUTTIImpl &&Arg)
: BaseT(std::move(static_cast<BaseT &>(Arg))), ST(std::move(Arg.ST)),
TLI(std::move(Arg.TLI)) {}
- AMDGPUTTIImpl &operator=(const AMDGPUTTIImpl &RHS) {
- BaseT::operator=(static_cast<const BaseT &>(RHS));
- ST = RHS.ST;
- TLI = RHS.TLI;
- return *this;
- }
- AMDGPUTTIImpl &operator=(AMDGPUTTIImpl &&RHS) {
- BaseT::operator=(std::move(static_cast<BaseT &>(RHS)));
- ST = std::move(RHS.ST);
- TLI = std::move(RHS.TLI);
- return *this;
- }
bool hasBranchDivergence() { return true; }
diff --git a/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCTargetDesc.cpp b/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCTargetDesc.cpp
index 7172e4bb9335..c709741f3777 100644
--- a/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCTargetDesc.cpp
+++ b/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCTargetDesc.cpp
@@ -44,7 +44,7 @@ static MCInstrInfo *createAMDGPUMCInstrInfo() {
return X;
}
-static MCRegisterInfo *createAMDGPUMCRegisterInfo(StringRef TT) {
+static MCRegisterInfo *createAMDGPUMCRegisterInfo(const Triple &TT) {
MCRegisterInfo *X = new MCRegisterInfo();
InitAMDGPUMCRegisterInfo(X, 0);
return X;
@@ -52,14 +52,13 @@ static MCRegisterInfo *createAMDGPUMCRegisterInfo(StringRef TT) {
static MCSubtargetInfo *
createAMDGPUMCSubtargetInfo(const Triple &TT, StringRef CPU, StringRef FS) {
- MCSubtargetInfo * X = new MCSubtargetInfo();
- InitAMDGPUMCSubtargetInfo(X, TT, CPU, FS);
- return X;
+ return createAMDGPUMCSubtargetInfoImpl(TT, CPU, FS);
}
-static MCCodeGenInfo *createAMDGPUMCCodeGenInfo(StringRef TT, Reloc::Model RM,
- CodeModel::Model CM,
- CodeGenOpt::Level OL) {
+static MCCodeGenInfo *createAMDGPUMCCodeGenInfo(const Triple &TT,
+ Reloc::Model RM,
+ CodeModel::Model CM,
+ CodeGenOpt::Level OL) {
MCCodeGenInfo *X = new MCCodeGenInfo();
X->initMCCodeGenInfo(RM, CM, OL);
return X;
diff --git a/lib/Target/AMDGPU/R600ISelLowering.cpp b/lib/Target/AMDGPU/R600ISelLowering.cpp
index 8357b6d9d0ed..4e4d554f0ee7 100644
--- a/lib/Target/AMDGPU/R600ISelLowering.cpp
+++ b/lib/Target/AMDGPU/R600ISelLowering.cpp
@@ -815,8 +815,10 @@ SDValue R600TargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const
case Intrinsic::r600_read_local_size_z:
return LowerImplicitParameter(DAG, VT, DL, 8);
- case Intrinsic::AMDGPU_read_workdim:
- return LowerImplicitParameter(DAG, VT, DL, MFI->ABIArgOffset / 4);
+ case Intrinsic::AMDGPU_read_workdim: {
+ uint32_t ByteOffset = getImplicitParameterOffset(MFI, GRID_DIM);
+ return LowerImplicitParameter(DAG, VT, DL, ByteOffset / 4);
+ }
case Intrinsic::r600_read_tgid_x:
return CreateLiveInRegister(DAG, &AMDGPU::R600_TReg32RegClass,
@@ -897,8 +899,9 @@ SDValue R600TargetLowering::vectorToVerticalVector(SelectionDAG &DAG,
for (unsigned i = 0, e = VecVT.getVectorNumElements();
i != e; ++i) {
- Args.push_back(DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, EltVT, Vector,
- DAG.getConstant(i, DL, getVectorIdxTy())));
+ Args.push_back(DAG.getNode(
+ ISD::EXTRACT_VECTOR_ELT, DL, EltVT, Vector,
+ DAG.getConstant(i, DL, getVectorIdxTy(DAG.getDataLayout()))));
}
return DAG.getNode(AMDGPUISD::BUILD_VERTICAL_VECTOR, DL, VecVT, Args);
@@ -1459,22 +1462,17 @@ SDValue R600TargetLowering::LowerLOAD(SDValue Op, SelectionDAG &DAG) const
SDValue Ptr = Op.getOperand(1);
SDValue LoweredLoad;
- SDValue Ret = AMDGPUTargetLowering::LowerLOAD(Op, DAG);
- if (Ret.getNode()) {
- SDValue Ops[2] = {
- Ret,
- Chain
- };
- return DAG.getMergeValues(Ops, DL);
- }
+ if (SDValue Ret = AMDGPUTargetLowering::LowerLOAD(Op, DAG))
+ return Ret;
// Lower constant address space global variable loads
if (LoadNode->getAddressSpace() == AMDGPUAS::CONSTANT_ADDRESS &&
isa<GlobalVariable>(GetUnderlyingObject(
- LoadNode->getMemOperand()->getValue(), *getDataLayout()))) {
+ LoadNode->getMemOperand()->getValue(), DAG.getDataLayout()))) {
- SDValue Ptr = DAG.getZExtOrTrunc(LoadNode->getBasePtr(), DL,
- getPointerTy(AMDGPUAS::PRIVATE_ADDRESS));
+ SDValue Ptr = DAG.getZExtOrTrunc(
+ LoadNode->getBasePtr(), DL,
+ getPointerTy(DAG.getDataLayout(), AMDGPUAS::PRIVATE_ADDRESS));
Ptr = DAG.getNode(ISD::SRL, DL, MVT::i32, Ptr,
DAG.getConstant(2, DL, MVT::i32));
return DAG.getNode(AMDGPUISD::REGISTER_LOAD, DL, Op->getVTList(),
@@ -1702,7 +1700,8 @@ SDValue R600TargetLowering::LowerFormalArguments(
return Chain;
}
-EVT R600TargetLowering::getSetCCResultType(LLVMContext &, EVT VT) const {
+EVT R600TargetLowering::getSetCCResultType(const DataLayout &DL, LLVMContext &,
+ EVT VT) const {
if (!VT.isVector())
return MVT::i32;
return VT.changeVectorElementTypeToInteger();
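
In the AMDGPU_read_workdim hunk above, the hard-coded MFI->ABIArgOffset / 4 is replaced by getImplicitParameterOffset(MFI, GRID_DIM) divided down to a dword offset. A rough stand-alone model of that computation (the 16-byte explicit-argument size is an illustrative assumption, not taken from the patch):

#include <cassert>
#include <cstdint>
#include <cstdio>

// Rough model of getImplicitParameterOffset(): implicit parameters such as the
// grid dimension live right after the explicit kernel arguments, one 32-bit
// slot each, and the DAG lowering wants the offset in dwords rather than bytes.
enum ImplicitParameter { GRID_DIM = 0, GRID_OFFSET = 1 };

static uint32_t implicitParamByteOffset(uint32_t ExplicitArgBytes,
                                        ImplicitParameter Param) {
  return ExplicitArgBytes + 4 * static_cast<uint32_t>(Param);
}

int main() {
  uint32_t ByteOffset = implicitParamByteOffset(/*ExplicitArgBytes=*/16, GRID_DIM);
  assert(ByteOffset % 4 == 0);
  std::printf("dword offset %u\n", ByteOffset / 4);   // value fed to LowerImplicitParameter
}
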
diff --git a/lib/Target/AMDGPU/R600ISelLowering.h b/lib/Target/AMDGPU/R600ISelLowering.h
index c06d3c4fd309..4dbac97af2a1 100644
--- a/lib/Target/AMDGPU/R600ISelLowering.h
+++ b/lib/Target/AMDGPU/R600ISelLowering.h
@@ -38,7 +38,9 @@ public:
const SmallVectorImpl<ISD::InputArg> &Ins,
SDLoc DL, SelectionDAG &DAG,
SmallVectorImpl<SDValue> &InVals) const override;
- EVT getSetCCResultType(LLVMContext &, EVT VT) const override;
+ EVT getSetCCResultType(const DataLayout &DL, LLVMContext &,
+ EVT VT) const override;
+
private:
unsigned Gen;
/// Each OpenCL kernel has nine implicit parameters that are stored in the
diff --git a/lib/Target/AMDGPU/SIFoldOperands.cpp b/lib/Target/AMDGPU/SIFoldOperands.cpp
index d14e37a64612..c2887255cc11 100644
--- a/lib/Target/AMDGPU/SIFoldOperands.cpp
+++ b/lib/Target/AMDGPU/SIFoldOperands.cpp
@@ -126,11 +126,42 @@ static bool updateOperand(FoldCandidate &Fold,
return false;
}
+static bool isUseMIInFoldList(const std::vector<FoldCandidate> &FoldList,
+ const MachineInstr *MI) {
+ for (auto Candidate : FoldList) {
+ if (Candidate.UseMI == MI)
+ return true;
+ }
+ return false;
+}
+
static bool tryAddToFoldList(std::vector<FoldCandidate> &FoldList,
MachineInstr *MI, unsigned OpNo,
MachineOperand *OpToFold,
const SIInstrInfo *TII) {
if (!TII->isOperandLegal(MI, OpNo, OpToFold)) {
+
+ // Special case for v_mac_f32_e64 if we are trying to fold into src2
+ unsigned Opc = MI->getOpcode();
+ if (Opc == AMDGPU::V_MAC_F32_e64 &&
+ (int)OpNo == AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src2)) {
+ // Check if changing this to a v_mad_f32 instruction will allow us to
+ // fold the operand.
+ MI->setDesc(TII->get(AMDGPU::V_MAD_F32));
+ bool FoldAsMAD = tryAddToFoldList(FoldList, MI, OpNo, OpToFold, TII);
+ if (FoldAsMAD) {
+ MI->untieRegOperand(OpNo);
+ return true;
+ }
+ MI->setDesc(TII->get(Opc));
+ }
+
+ // If we are already folding into another operand of MI, then
+ // we can't commute the instruction, otherwise we risk making the
+ // other fold illegal.
+ if (isUseMIInFoldList(FoldList, MI))
+ return false;
+
// Operand is not legal, so try to commute the instruction to
// see if this makes it possible to fold.
unsigned CommuteIdx0;
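
The new isUseMIInFoldList() guard above is just a linear scan over the pending folds; a self-contained sketch of the same check with stand-in types (not the real MachineInstr/FoldCandidate):

#include <vector>

struct Instr {};                                   // stand-in for MachineInstr
struct Candidate { const Instr *UseMI; unsigned OpNo; };

// True if a fold into MI is already queued. As the patch comment says,
// commuting MI after that point risks making the already-queued fold illegal.
static bool isUseInFoldList(const std::vector<Candidate> &FoldList,
                            const Instr *MI) {
  for (const Candidate &C : FoldList)
    if (C.UseMI == MI)
      return true;
  return false;
}

int main() {
  Instr A, B;
  std::vector<Candidate> FoldList{{&A, 2}};
  return (isUseInFoldList(FoldList, &A) && !isUseInFoldList(FoldList, &B)) ? 0 : 1;
}
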
diff --git a/lib/Target/AMDGPU/SIISelLowering.cpp b/lib/Target/AMDGPU/SIISelLowering.cpp
index ead1a3743473..dd818a9ba746 100644
--- a/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -254,8 +254,9 @@ bool SITargetLowering::isShuffleMaskLegal(const SmallVectorImpl<int> &,
return false;
}
-bool SITargetLowering::isLegalAddressingMode(const AddrMode &AM,
- Type *Ty, unsigned AS) const {
+bool SITargetLowering::isLegalAddressingMode(const DataLayout &DL,
+ const AddrMode &AM, Type *Ty,
+ unsigned AS) const {
// No global is ever allowed as a base.
if (AM.BaseGV)
return false;
@@ -416,7 +417,7 @@ static EVT toIntegerVT(EVT VT) {
SDValue SITargetLowering::LowerParameter(SelectionDAG &DAG, EVT VT, EVT MemVT,
SDLoc SL, SDValue Chain,
unsigned Offset, bool Signed) const {
- const DataLayout *DL = getDataLayout();
+ const DataLayout &DL = DAG.getDataLayout();
MachineFunction &MF = DAG.getMachineFunction();
const SIRegisterInfo *TRI =
static_cast<const SIRegisterInfo*>(Subtarget->getRegisterInfo());
@@ -425,16 +426,16 @@ SDValue SITargetLowering::LowerParameter(SelectionDAG &DAG, EVT VT, EVT MemVT,
Type *Ty = VT.getTypeForEVT(*DAG.getContext());
MachineRegisterInfo &MRI = DAG.getMachineFunction().getRegInfo();
- MVT PtrVT = getPointerTy(AMDGPUAS::CONSTANT_ADDRESS);
+ MVT PtrVT = getPointerTy(DL, AMDGPUAS::CONSTANT_ADDRESS);
PointerType *PtrTy = PointerType::get(Ty, AMDGPUAS::CONSTANT_ADDRESS);
SDValue BasePtr = DAG.getCopyFromReg(Chain, SL,
MRI.getLiveInVirtReg(InputPtrReg), PtrVT);
SDValue Ptr = DAG.getNode(ISD::ADD, SL, PtrVT, BasePtr,
DAG.getConstant(Offset, SL, PtrVT));
- SDValue PtrOffset = DAG.getUNDEF(getPointerTy(AMDGPUAS::CONSTANT_ADDRESS));
+ SDValue PtrOffset = DAG.getUNDEF(PtrVT);
MachinePointerInfo PtrInfo(UndefValue::get(PtrTy));
- unsigned Align = DL->getABITypeAlignment(Ty);
+ unsigned Align = DL.getABITypeAlignment(Ty);
if (VT != MemVT && VT.isFloatingPoint()) {
// Do an integer load and convert.
@@ -451,7 +452,12 @@ SDValue SITargetLowering::LowerParameter(SelectionDAG &DAG, EVT VT, EVT MemVT,
true, // isNonTemporal
true, // isInvariant
Align); // Alignment
- return DAG.getNode(ISD::FP16_TO_FP, SL, VT, Load);
+ SDValue Ops[] = {
+ DAG.getNode(ISD::FP16_TO_FP, SL, VT, Load),
+ Load.getValue(1)
+ };
+
+ return DAG.getMergeValues(Ops, SL);
}
ISD::LoadExtType ExtTy = Signed ? ISD::SEXTLOAD : ISD::ZEXTLOAD;
@@ -569,6 +575,8 @@ SDValue SITargetLowering::LowerFormalArguments(
AnalyzeFormalArguments(CCInfo, Splits);
+ SmallVector<SDValue, 16> Chains;
+
for (unsigned i = 0, e = Ins.size(), ArgIdx = 0; i != e; ++i) {
const ISD::InputArg &Arg = Ins[i];
@@ -587,8 +595,9 @@ SDValue SITargetLowering::LowerFormalArguments(
VA.getLocMemOffset();
// The first 36 bytes of the input buffer contain information about
// thread group and global sizes.
- SDValue Arg = LowerParameter(DAG, VT, MemVT, DL, DAG.getRoot(),
+ SDValue Arg = LowerParameter(DAG, VT, MemVT, DL, Chain,
Offset, Ins[i].Flags.isSExt());
+ Chains.push_back(Arg.getValue(1));
const PointerType *ParamTy =
dyn_cast<PointerType>(FType->getParamType(Ins[i].getOrigArgIndex()));
@@ -614,7 +623,8 @@ SDValue SITargetLowering::LowerFormalArguments(
Reg = TRI->getMatchingSuperReg(Reg, AMDGPU::sub0,
&AMDGPU::SReg_64RegClass);
Reg = MF.addLiveIn(Reg, &AMDGPU::SReg_64RegClass);
- InVals.push_back(DAG.getCopyFromReg(Chain, DL, Reg, VT));
+ SDValue Copy = DAG.getCopyFromReg(Chain, DL, Reg, VT);
+ InVals.push_back(Copy);
continue;
}
@@ -634,7 +644,9 @@ SDValue SITargetLowering::LowerFormalArguments(
for (unsigned j = 1; j != NumElements; ++j) {
Reg = ArgLocs[ArgIdx++].getLocReg();
Reg = MF.addLiveIn(Reg, RC);
- Regs.push_back(DAG.getCopyFromReg(Chain, DL, Reg, VT));
+
+ SDValue Copy = DAG.getCopyFromReg(Chain, DL, Reg, VT);
+ Regs.push_back(Copy);
}
// Fill up the missing vector elements
@@ -653,7 +665,11 @@ SDValue SITargetLowering::LowerFormalArguments(
AMDGPU::SGPR_32RegClass.begin(), AMDGPU::SGPR_32RegClass.getNumRegs()));
Info->ScratchOffsetReg = AMDGPU::SGPR_32RegClass.getRegister(ScratchIdx);
}
- return Chain;
+
+ if (Chains.empty())
+ return Chain;
+
+ return DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Chains);
}
MachineBasicBlock * SITargetLowering::EmitInstrWithCustomInserter(
@@ -695,14 +711,15 @@ bool SITargetLowering::enableAggressiveFMAFusion(EVT VT) const {
return true;
}
-EVT SITargetLowering::getSetCCResultType(LLVMContext &Ctx, EVT VT) const {
+EVT SITargetLowering::getSetCCResultType(const DataLayout &DL, LLVMContext &Ctx,
+ EVT VT) const {
if (!VT.isVector()) {
return MVT::i1;
}
return EVT::getVectorVT(Ctx, MVT::i1, VT.getVectorNumElements());
}
-MVT SITargetLowering::getScalarShiftAmountTy(EVT VT) const {
+MVT SITargetLowering::getScalarShiftAmountTy(const DataLayout &, EVT) const {
return MVT::i32;
}
@@ -888,7 +905,7 @@ SDValue SITargetLowering::LowerGlobalAddress(AMDGPUMachineFunction *MFI,
SDLoc DL(GSD);
const GlobalValue *GV = GSD->getGlobal();
- MVT PtrVT = getPointerTy(GSD->getAddressSpace());
+ MVT PtrVT = getPointerTy(DAG.getDataLayout(), GSD->getAddressSpace());
SDValue Ptr = DAG.getNode(AMDGPUISD::CONST_DATA_PTR, DL, PtrVT);
SDValue GA = DAG.getTargetGlobalAddress(GV, DL, MVT::i32);
@@ -926,6 +943,7 @@ SDValue SITargetLowering::copyToM0(SelectionDAG &DAG, SDValue Chain, SDLoc DL,
SDValue SITargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
SelectionDAG &DAG) const {
MachineFunction &MF = DAG.getMachineFunction();
+ auto MFI = MF.getInfo<SIMachineFunctionInfo>();
const SIRegisterInfo *TRI =
static_cast<const SIRegisterInfo *>(Subtarget->getRegisterInfo());
@@ -964,8 +982,7 @@ SDValue SITargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
case Intrinsic::AMDGPU_read_workdim:
return LowerParameter(DAG, VT, VT, DL, DAG.getEntryNode(),
- MF.getInfo<SIMachineFunctionInfo>()->ABIArgOffset,
- false);
+ getImplicitParameterOffset(MFI, GRID_DIM), false);
case Intrinsic::r600_read_tgid_x:
return CreateLiveInRegister(DAG, &AMDGPU::SReg_32RegClass,
@@ -1213,7 +1230,8 @@ SDValue SITargetLowering::LowerFDIV32(SDValue Op, SelectionDAG &DAG) const {
const SDValue One = DAG.getConstantFP(1.0, SL, MVT::f32);
- EVT SetCCVT = getSetCCResultType(*DAG.getContext(), MVT::f32);
+ EVT SetCCVT =
+ getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), MVT::f32);
SDValue r2 = DAG.getSetCC(SL, SetCCVT, r1, K0, ISD::SETOGT);
@@ -1411,7 +1429,7 @@ SDValue SITargetLowering::performUCharToFloatCombine(SDNode *N,
unsigned AS = Load->getAddressSpace();
unsigned Align = Load->getAlignment();
Type *Ty = LoadVT.getTypeForEVT(*DAG.getContext());
- unsigned ABIAlignment = getDataLayout()->getABITypeAlignment(Ty);
+ unsigned ABIAlignment = DAG.getDataLayout().getABITypeAlignment(Ty);
// Don't try to replace the load if we have to expand it due to alignment
// problems. Otherwise we will end up scalarizing the load, and trying to
@@ -2212,9 +2230,8 @@ SDValue SITargetLowering::CreateLiveInRegister(SelectionDAG &DAG,
std::pair<unsigned, const TargetRegisterClass *>
SITargetLowering::getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI,
- const std::string &Constraint_,
+ StringRef Constraint,
MVT VT) const {
- StringRef Constraint(Constraint_);
if (Constraint == "r") {
switch(VT.SimpleTy) {
default: llvm_unreachable("Unhandled type for 'r' inline asm constraint");
diff --git a/lib/Target/AMDGPU/SIISelLowering.h b/lib/Target/AMDGPU/SIISelLowering.h
index a956b013bdb1..635b4edc89de 100644
--- a/lib/Target/AMDGPU/SIISelLowering.h
+++ b/lib/Target/AMDGPU/SIISelLowering.h
@@ -62,8 +62,8 @@ public:
bool isShuffleMaskLegal(const SmallVectorImpl<int> &/*Mask*/,
EVT /*VT*/) const override;
- bool isLegalAddressingMode(const AddrMode &AM,
- Type *Ty, unsigned AS) const override;
+ bool isLegalAddressingMode(const DataLayout &DL, const AddrMode &AM, Type *Ty,
+ unsigned AS) const override;
bool allowsMisalignedMemoryAccesses(EVT VT, unsigned AS,
unsigned Align,
@@ -90,8 +90,9 @@ public:
MachineBasicBlock * EmitInstrWithCustomInserter(MachineInstr * MI,
MachineBasicBlock * BB) const override;
bool enableAggressiveFMAFusion(EVT VT) const override;
- EVT getSetCCResultType(LLVMContext &Context, EVT VT) const override;
- MVT getScalarShiftAmountTy(EVT VT) const override;
+ EVT getSetCCResultType(const DataLayout &DL, LLVMContext &Context,
+ EVT VT) const override;
+ MVT getScalarShiftAmountTy(const DataLayout &, EVT) const override;
bool isFMAFasterThanFMulAndFAdd(EVT VT) const override;
SDValue LowerOperation(SDValue Op, SelectionDAG &DAG) const override;
SDValue PerformDAGCombine(SDNode *N, DAGCombinerInfo &DCI) const override;
@@ -114,9 +115,9 @@ public:
SDLoc DL,
SDValue Ptr) const;
- std::pair<unsigned, const TargetRegisterClass *> getRegForInlineAsmConstraint(
- const TargetRegisterInfo *TRI,
- const std::string &Constraint, MVT VT) const override;
+ std::pair<unsigned, const TargetRegisterClass *>
+ getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI,
+ StringRef Constraint, MVT VT) const override;
SDValue copyToM0(SelectionDAG &DAG, SDValue Chain, SDLoc DL, SDValue V) const;
};
diff --git a/lib/Target/AMDGPU/SIInstrInfo.cpp b/lib/Target/AMDGPU/SIInstrInfo.cpp
index eb96bd0227b2..18910615bebe 100644
--- a/lib/Target/AMDGPU/SIInstrInfo.cpp
+++ b/lib/Target/AMDGPU/SIInstrInfo.cpp
@@ -227,9 +227,8 @@ bool SIInstrInfo::getMemOpBaseRegImmOfs(MachineInstr *LdSt, unsigned &BaseReg,
uint8_t Offset0 = Offset0Imm->getImm();
uint8_t Offset1 = Offset1Imm->getImm();
- assert(Offset1 > Offset0);
- if (Offset1 - Offset0 == 1) {
+ if (Offset1 > Offset0 && Offset1 - Offset0 == 1) {
// Each of these offsets is in element sized units, so we need to convert
// to bytes of the individual reads.
@@ -924,7 +923,7 @@ bool SIInstrInfo::FoldImmediate(MachineInstr *UseMI, MachineInstr *DefMI,
return false;
unsigned Opc = UseMI->getOpcode();
- if (Opc == AMDGPU::V_MAD_F32) {
+ if (Opc == AMDGPU::V_MAD_F32 || Opc == AMDGPU::V_MAC_F32_e64) {
// Don't fold if we are using source modifiers. The new VOP2 instructions
// don't have them.
if (hasModifiersSet(*UseMI, AMDGPU::OpName::src0_modifiers) ||
@@ -963,9 +962,9 @@ bool SIInstrInfo::FoldImmediate(MachineInstr *UseMI, MachineInstr *DefMI,
// instead of having to modify in place.
// Remove these first since they are at the end.
- UseMI->RemoveOperand(AMDGPU::getNamedOperandIdx(AMDGPU::V_MAD_F32,
+ UseMI->RemoveOperand(AMDGPU::getNamedOperandIdx(Opc,
AMDGPU::OpName::omod));
- UseMI->RemoveOperand(AMDGPU::getNamedOperandIdx(AMDGPU::V_MAD_F32,
+ UseMI->RemoveOperand(AMDGPU::getNamedOperandIdx(Opc,
AMDGPU::OpName::clamp));
unsigned Src1Reg = Src1->getReg();
@@ -980,6 +979,14 @@ bool SIInstrInfo::FoldImmediate(MachineInstr *UseMI, MachineInstr *DefMI,
Src1->setSubReg(Src2SubReg);
Src1->setIsKill(Src2->isKill());
+ if (Opc == AMDGPU::V_MAC_F32_e64) {
+ UseMI->untieRegOperand(
+ AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src2));
+ }
+
+ UseMI->RemoveOperand(AMDGPU::getNamedOperandIdx(Opc,
+ AMDGPU::OpName::src2));
+ // ChangeToImmediate adds Src2 back to the instruction.
Src2->ChangeToImmediate(Imm);
removeModOperands(*UseMI);
@@ -1010,11 +1017,17 @@ bool SIInstrInfo::FoldImmediate(MachineInstr *UseMI, MachineInstr *DefMI,
// instead of having to modify in place.
// Remove these first since they are at the end.
- UseMI->RemoveOperand(AMDGPU::getNamedOperandIdx(AMDGPU::V_MAD_F32,
+ UseMI->RemoveOperand(AMDGPU::getNamedOperandIdx(Opc,
AMDGPU::OpName::omod));
- UseMI->RemoveOperand(AMDGPU::getNamedOperandIdx(AMDGPU::V_MAD_F32,
+ UseMI->RemoveOperand(AMDGPU::getNamedOperandIdx(Opc,
AMDGPU::OpName::clamp));
+ if (Opc == AMDGPU::V_MAC_F32_e64) {
+ UseMI->untieRegOperand(
+ AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src2));
+ }
+
+ // ChangeToImmediate adds Src2 back to the instruction.
Src2->ChangeToImmediate(Imm);
// These come before src2.
@@ -1126,6 +1139,38 @@ bool SIInstrInfo::areMemAccessesTriviallyDisjoint(MachineInstr *MIa,
return false;
}
+MachineInstr *SIInstrInfo::convertToThreeAddress(MachineFunction::iterator &MBB,
+ MachineBasicBlock::iterator &MI,
+ LiveVariables *LV) const {
+
+ switch (MI->getOpcode()) {
+ default: return nullptr;
+ case AMDGPU::V_MAC_F32_e64: break;
+ case AMDGPU::V_MAC_F32_e32: {
+ const MachineOperand *Src0 = getNamedOperand(*MI, AMDGPU::OpName::src0);
+ if (Src0->isImm() && !isInlineConstant(*Src0, 4))
+ return nullptr;
+ break;
+ }
+ }
+
+ const MachineOperand *Dst = getNamedOperand(*MI, AMDGPU::OpName::dst);
+ const MachineOperand *Src0 = getNamedOperand(*MI, AMDGPU::OpName::src0);
+ const MachineOperand *Src1 = getNamedOperand(*MI, AMDGPU::OpName::src1);
+ const MachineOperand *Src2 = getNamedOperand(*MI, AMDGPU::OpName::src2);
+
+ return BuildMI(*MBB, MI, MI->getDebugLoc(), get(AMDGPU::V_MAD_F32))
+ .addOperand(*Dst)
+ .addImm(0) // Src0 mods
+ .addOperand(*Src0)
+ .addImm(0) // Src1 mods
+ .addOperand(*Src1)
+ .addImm(0) // Src2 mods
+ .addOperand(*Src2)
+ .addImm(0) // clamp
+ .addImm(0); // omod
+}
+
bool SIInstrInfo::isInlineConstant(const APInt &Imm) const {
int64_t SVal = Imm.getSExtValue();
if (SVal >= -16 && SVal <= 64)
@@ -1625,7 +1670,10 @@ bool SIInstrInfo::isOperandLegal(const MachineInstr *MI, unsigned OpIdx,
if (MO->isReg()) {
assert(DefinedRC);
- const TargetRegisterClass *RC = MRI.getRegClass(MO->getReg());
+ const TargetRegisterClass *RC =
+ TargetRegisterInfo::isVirtualRegister(MO->getReg()) ?
+ MRI.getRegClass(MO->getReg()) :
+ RI.getPhysRegClass(MO->getReg());
// In order to be legal, the common sub-class must be equal to the
// class of the current operand. For example:
diff --git a/lib/Target/AMDGPU/SIInstrInfo.h b/lib/Target/AMDGPU/SIInstrInfo.h
index 0382272068d2..015ea12d4598 100644
--- a/lib/Target/AMDGPU/SIInstrInfo.h
+++ b/lib/Target/AMDGPU/SIInstrInfo.h
@@ -144,6 +144,10 @@ public:
unsigned getMachineCSELookAheadLimit() const override { return 500; }
+ MachineInstr *convertToThreeAddress(MachineFunction::iterator &MBB,
+ MachineBasicBlock::iterator &MI,
+ LiveVariables *LV) const override;
+
bool isSALU(uint16_t Opcode) const {
return get(Opcode).TSFlags & SIInstrFlags::SALU;
}
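
Both the FoldImmediate changes and the new convertToThreeAddress() hook hinge on the same identity: v_mac_f32 accumulates into a tied src2/dst operand, while v_mad_f32 writes a destination separate from all three sources. A plain-C++ illustration of the two forms (purely arithmetic, no MachineInstr details):

#include <cstdio>

// v_mac-style: two-address form, the accumulator is both src2 and dst (tied).
static float mac(float Src0, float Src1, float &Acc) {
  Acc = Src0 * Src1 + Acc;
  return Acc;
}

// v_mad-style: three-address form; convertToThreeAddress() rewrites a MAC into
// this so src2 no longer has to live in the same register as the result.
static float mad(float Src0, float Src1, float Src2) {
  return Src0 * Src1 + Src2;
}

int main() {
  float Acc = 2.0f;
  float A = mac(3.0f, 4.0f, Acc);    // Acc becomes 14
  float B = mad(3.0f, 4.0f, 2.0f);   // same value, but the 2.0f input is untouched
  std::printf("%g %g\n", A, B);
}
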
diff --git a/lib/Target/AMDGPU/SIInstrInfo.td b/lib/Target/AMDGPU/SIInstrInfo.td
index fcb58d5da3b0..b39a78714640 100644
--- a/lib/Target/AMDGPU/SIInstrInfo.td
+++ b/lib/Target/AMDGPU/SIInstrInfo.td
@@ -529,9 +529,11 @@ def MUBUFOffset : ComplexPattern<i64, 6, "SelectMUBUFOffset">;
def MUBUFOffsetAtomic : ComplexPattern<i64, 4, "SelectMUBUFOffset">;
def VOP3Mods0 : ComplexPattern<untyped, 4, "SelectVOP3Mods0">;
+def VOP3NoMods0 : ComplexPattern<untyped, 4, "SelectVOP3NoMods0">;
def VOP3Mods0Clamp : ComplexPattern<untyped, 3, "SelectVOP3Mods0Clamp">;
def VOP3Mods0Clamp0OMod : ComplexPattern<untyped, 4, "SelectVOP3Mods0Clamp0OMod">;
def VOP3Mods : ComplexPattern<untyped, 2, "SelectVOP3Mods">;
+def VOP3NoMods : ComplexPattern<untyped, 2, "SelectVOP3NoMods">;
//===----------------------------------------------------------------------===//
// SI assembler operands
@@ -1113,6 +1115,13 @@ def VOP_MADK : VOPProfile <[f32, f32, f32, f32]> {
field dag Ins = (ins VCSrc_32:$src0, VGPR_32:$vsrc1, u32imm:$src2);
field string Asm = "$dst, $src0, $vsrc1, $src2";
}
+def VOP_MAC : VOPProfile <[f32, f32, f32, f32]> {
+ let Ins32 = (ins Src0RC32:$src0, Src1RC32:$src1, VGPR_32:$src2);
+ let Ins64 = getIns64<Src0RC64, Src1RC64, RegisterOperand<VGPR_32>, 3,
+ HasModifiers>.ret;
+ let Asm32 = getAsm32<2>.ret;
+ let Asm64 = getAsm64<2, HasModifiers>.ret;
+}
def VOP_F64_F64_F64_F64 : VOPProfile <[f64, f64, f64, f64]>;
def VOP_I32_I32_I32_I32 : VOPProfile <[i32, i32, i32, i32]>;
def VOP_I64_I32_I32_I64 : VOPProfile <[i64, i32, i32, i64]>;
diff --git a/lib/Target/AMDGPU/SIInstructions.td b/lib/Target/AMDGPU/SIInstructions.td
index 8c8d836776db..1ee63c675822 100644
--- a/lib/Target/AMDGPU/SIInstructions.td
+++ b/lib/Target/AMDGPU/SIInstructions.td
@@ -1488,7 +1488,10 @@ defm V_AND_B32 : VOP2Inst <vop2<0x1b, 0x13>, "v_and_b32", VOP_I32_I32_I32>;
defm V_OR_B32 : VOP2Inst <vop2<0x1c, 0x14>, "v_or_b32", VOP_I32_I32_I32>;
defm V_XOR_B32 : VOP2Inst <vop2<0x1d, 0x15>, "v_xor_b32", VOP_I32_I32_I32>;
-defm V_MAC_F32 : VOP2Inst <vop2<0x1f, 0x16>, "v_mac_f32", VOP_F32_F32_F32>;
+let Constraints = "$dst = $src2", DisableEncoding="$src2",
+ isConvertibleToThreeAddress = 1 in {
+defm V_MAC_F32 : VOP2Inst <vop2<0x1f, 0x16>, "v_mac_f32", VOP_MAC>;
+}
} // End isCommutable = 1
defm V_MADMK_F32 : VOP2MADK <vop2<0x20, 0x17>, "v_madmk_f32">;
@@ -2206,6 +2209,15 @@ def : Pat <
(V_CNDMASK_B32_e64 $src2, $src1, $src0)
>;
+// Pattern for V_MAC_F32
+def : Pat <
+ (fmad (VOP3NoMods0 f32:$src0, i32:$src0_modifiers, i1:$clamp, i32:$omod),
+ (VOP3NoMods f32:$src1, i32:$src1_modifiers),
+ (VOP3NoMods f32:$src2, i32:$src2_modifiers)),
+ (V_MAC_F32_e64 $src0_modifiers, $src0, $src1_modifiers, $src1,
+ $src2_modifiers, $src2, $clamp, $omod)
+>;
+
/********** ======================= **********/
/********** Image sampling patterns **********/
/********** ======================= **********/
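
The new fmad pattern above only fires through the VOP3NoMods/VOP3NoMods0 complex patterns, i.e. when every source-modifier, clamp and omod field is zero, presumably so the selected V_MAC_F32 can still be shrunk to the modifier-less VOP2 encoding later. A toy version of that predicate (stand-in types; the real check lives in the complex-pattern selectors):

struct Src { float Val; unsigned Mods; };

// Accept the MAC form only when nothing needs the VOP3-only modifier fields.
static bool canSelectMAC(const Src &S0, const Src &S1, const Src &S2,
                         unsigned Clamp, unsigned OMod) {
  return S0.Mods == 0 && S1.Mods == 0 && S2.Mods == 0 && Clamp == 0 && OMod == 0;
}

int main() {
  Src A{1.0f, 0}, B{2.0f, 0}, C{3.0f, 1};        // C carries a modifier
  return canSelectMAC(A, B, C, 0, 0) ? 1 : 0;    // 0: pattern rejected
}
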
diff --git a/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp b/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp
index 9b1d256dc5a8..1bdb1f0ee9f9 100644
--- a/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp
+++ b/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp
@@ -214,12 +214,11 @@ MachineBasicBlock::iterator SILoadStoreOptimizer::mergeRead2Pair(
// cases, like vectors of pointers.
const MachineOperand *AddrReg = TII->getNamedOperand(*I, AMDGPU::OpName::addr);
- unsigned DestReg0 = TII->getNamedOperand(*I, AMDGPU::OpName::vdst)->getReg();
- unsigned DestReg1
- = TII->getNamedOperand(*Paired, AMDGPU::OpName::vdst)->getReg();
+ const MachineOperand *Dest0 = TII->getNamedOperand(*I, AMDGPU::OpName::vdst);
+ const MachineOperand *Dest1 = TII->getNamedOperand(*Paired, AMDGPU::OpName::vdst);
unsigned Offset0
- = TII->getNamedOperand(*I, AMDGPU::OpName::offset)->getImm() & 0xffff;
+ = TII->getNamedOperand(*I, AMDGPU::OpName::offset)->getImm() & 0xffff;
unsigned Offset1
= TII->getNamedOperand(*Paired, AMDGPU::OpName::offset)->getImm() & 0xffff;
@@ -258,20 +257,43 @@ MachineBasicBlock::iterator SILoadStoreOptimizer::mergeRead2Pair(
unsigned SubRegIdx0 = (EltSize == 4) ? AMDGPU::sub0 : AMDGPU::sub0_sub1;
unsigned SubRegIdx1 = (EltSize == 4) ? AMDGPU::sub1 : AMDGPU::sub2_sub3;
- updateRegDefsUses(DestReg0, DestReg, SubRegIdx0);
- updateRegDefsUses(DestReg1, DestReg, SubRegIdx1);
- LIS->RemoveMachineInstrFromMaps(I);
- // Replacing Paired in the maps with Read2 allows us to avoid updating the
- // live range for the m0 register.
- LIS->ReplaceMachineInstrInMaps(Paired, Read2);
+ const MCInstrDesc &CopyDesc = TII->get(TargetOpcode::COPY);
+
+ // Copy to the old destination registers.
+ MachineInstr *Copy0 = BuildMI(*MBB, I, DL, CopyDesc)
+ .addOperand(*Dest0) // Copy to same destination including flags and sub reg.
+ .addReg(DestReg, 0, SubRegIdx0);
+ MachineInstr *Copy1 = BuildMI(*MBB, I, DL, CopyDesc)
+ .addOperand(*Dest1)
+ .addReg(DestReg, RegState::Kill, SubRegIdx1);
+
+ LIS->InsertMachineInstrInMaps(Read2);
+
+ // repairLiveintervalsInRange() doesn't handle physical registers, so we have
+ // to update the M0 range manually.
+ SlotIndex PairedIndex = LIS->getInstructionIndex(Paired);
+ LiveRange &M0Range = LIS->getRegUnit(*MCRegUnitIterator(AMDGPU::M0, TRI));
+ LiveRange::Segment *M0Segment = M0Range.getSegmentContaining(PairedIndex);
+ bool UpdateM0Range = M0Segment->end == PairedIndex.getRegSlot();
+
+ // The new write to the original destination register is now the copy. Steal
+ // the old SlotIndex.
+ LIS->ReplaceMachineInstrInMaps(I, Copy0);
+ LIS->ReplaceMachineInstrInMaps(Paired, Copy1);
+
I->eraseFromParent();
Paired->eraseFromParent();
LiveInterval &AddrRegLI = LIS->getInterval(AddrReg->getReg());
LIS->shrinkToUses(&AddrRegLI);
- LIS->getInterval(DestReg); // Create new LI
+ LIS->createAndComputeVirtRegInterval(DestReg);
+
+ if (UpdateM0Range) {
+ SlotIndex Read2Index = LIS->getInstructionIndex(Read2);
+ M0Segment->end = Read2Index.getRegSlot();
+ }
DEBUG(dbgs() << "Inserted read2: " << *Read2 << '\n');
return Read2.getInstr();
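
The rewritten mergeRead2Pair() now emits the wide read2 plus two COPYs back to the original destination registers, and moves the old instructions' slot indexes onto those copies. A toy model of just the data flow (no LiveIntervals bookkeeping):

#include <cstdio>

// Two element-sized LDS reads become one wide read; the original destinations
// turn into copies of the two sub-registers of the wide result.
struct Read2Result { int Sub0, Sub1; };

static Read2Result mergeRead2(const int *LDS, unsigned BaseOffset) {
  // The pass has already paired two reads off the same base register; the
  // wide read covers both element offsets at once.
  return {LDS[BaseOffset], LDS[BaseOffset + 1]};
}

int main() {
  int LDS[4] = {10, 20, 30, 40};
  Read2Result Wide = mergeRead2(LDS, 1);
  int Dest0 = Wide.Sub0;   // COPY of sub0 back to the first read's old vdst
  int Dest1 = Wide.Sub1;   // COPY of sub1 back to the second read's old vdst
  std::printf("%d %d\n", Dest0, Dest1);   // 20 30
}
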
diff --git a/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp b/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp
index 587ea63d6796..d23b92edef33 100644
--- a/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp
+++ b/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp
@@ -53,7 +53,6 @@ SIMachineFunctionInfo::SpilledReg SIMachineFunctionInfo::getSpilledReg(
if (!LaneVGPRs.count(LaneVGPRIdx)) {
unsigned LaneVGPR = TRI->findUnusedRegister(MRI, &AMDGPU::VGPR_32RegClass);
LaneVGPRs[LaneVGPRIdx] = LaneVGPR;
- MRI.setPhysRegUsed(LaneVGPR);
// Add this register as live-in to all blocks to avoid the machine verifier
// complaining about use of an undefined physical register.
diff --git a/lib/Target/AMDGPU/SIPrepareScratchRegs.cpp b/lib/Target/AMDGPU/SIPrepareScratchRegs.cpp
index 0a7f684552f0..b086d2ed6652 100644
--- a/lib/Target/AMDGPU/SIPrepareScratchRegs.cpp
+++ b/lib/Target/AMDGPU/SIPrepareScratchRegs.cpp
@@ -91,7 +91,6 @@ bool SIPrepareScratchRegs::runOnMachineFunction(MachineFunction &MF) {
if (ScratchOffsetReg != AMDGPU::NoRegister) {
// Found an SGPR to use
- MRI.setPhysRegUsed(ScratchOffsetReg);
BuildMI(*Entry, I, DL, TII->get(AMDGPU::S_MOV_B32), ScratchOffsetReg)
.addReg(ScratchOffsetPreloadReg);
} else {
diff --git a/lib/Target/AMDGPU/SIRegisterInfo.cpp b/lib/Target/AMDGPU/SIRegisterInfo.cpp
index db2ff0b1f952..ce4acafac9fa 100644
--- a/lib/Target/AMDGPU/SIRegisterInfo.cpp
+++ b/lib/Target/AMDGPU/SIRegisterInfo.cpp
@@ -499,7 +499,7 @@ unsigned SIRegisterInfo::findUnusedRegister(const MachineRegisterInfo &MRI,
for (TargetRegisterClass::iterator I = RC->begin(), E = RC->end();
I != E; ++I) {
- if (!MRI.isPhysRegUsed(*I))
+ if (MRI.reg_nodbg_empty(*I))
return *I;
}
return AMDGPU::NoRegister;
diff --git a/lib/Target/AMDGPU/SIShrinkInstructions.cpp b/lib/Target/AMDGPU/SIShrinkInstructions.cpp
index 51e72cdb5f9e..5d00bdd6a9bb 100644
--- a/lib/Target/AMDGPU/SIShrinkInstructions.cpp
+++ b/lib/Target/AMDGPU/SIShrinkInstructions.cpp
@@ -94,8 +94,20 @@ static bool canShrink(MachineInstr &MI, const SIInstrInfo *TII,
// is vcc. We should handle this the same way we handle vopc, by adding
// a register allocation hint pre-regalloc and then doing the shrinking
// post-regalloc.
- if (Src2)
- return false;
+ if (Src2) {
+ switch (MI.getOpcode()) {
+ default: return false;
+
+ case AMDGPU::V_MAC_F32_e64:
+ if (!isVGPR(Src2, TRI, MRI) ||
+ TII->hasModifiersSet(MI, AMDGPU::OpName::src2_modifiers))
+ return false;
+ break;
+
+ case AMDGPU::V_CNDMASK_B32_e64:
+ break;
+ }
+ }
const MachineOperand *Src1 = TII->getNamedOperand(MI, AMDGPU::OpName::src1);
const MachineOperand *Src1Mod =
@@ -149,7 +161,7 @@ static void foldImmediates(MachineInstr &MI, const SIInstrInfo *TII,
return;
// Try to fold Src0
- if (Src0.isReg()) {
+ if (Src0.isReg() && MRI.hasOneUse(Src0.getReg())) {
unsigned Reg = Src0.getReg();
MachineInstr *Def = MRI.getUniqueVRegDef(Reg);
if (Def && Def->isMoveImmediate()) {
@@ -243,6 +255,22 @@ bool SIShrinkInstructions::runOnMachineFunction(MachineFunction &MF) {
continue;
}
+ if (Op32 == AMDGPU::V_CNDMASK_B32_e32) {
+ // We shrink V_CNDMASK_B32_e64 using regalloc hints like we do for VOPC
+ // instructions.
+ const MachineOperand *Src2 =
+ TII->getNamedOperand(MI, AMDGPU::OpName::src2);
+ if (!Src2->isReg())
+ continue;
+ unsigned SReg = Src2->getReg();
+ if (TargetRegisterInfo::isVirtualRegister(SReg)) {
+ MRI.setRegAllocationHint(SReg, 0, AMDGPU::VCC);
+ continue;
+ }
+ if (SReg != AMDGPU::VCC)
+ continue;
+ }
+
// We can shrink this instruction
DEBUG(dbgs() << "Shrinking "; MI.dump(); dbgs() << '\n';);
@@ -259,6 +287,11 @@ bool SIShrinkInstructions::runOnMachineFunction(MachineFunction &MF) {
if (Src1)
Inst32.addOperand(*Src1);
+ const MachineOperand *Src2 =
+ TII->getNamedOperand(MI, AMDGPU::OpName::src2);
+ if (Src2)
+ Inst32.addOperand(*Src2);
+
++NumInstructionsShrunk;
MI.eraseFromParent();
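
foldImmediates() now also requires the src0 register to have a single use before rewriting it to an immediate, presumably so the defining V_MOV can actually be deleted instead of surviving alongside the folded constant. A toy model of the guard:

#include <vector>

struct ImmDef { int Imm; std::vector<int> Users; };   // stand-in for a V_MOV of an immediate

// Only fold the immediate into the use when it is the sole user of the def.
static bool tryFoldImm(const ImmDef &Def, int &UseOperand) {
  if (Def.Users.size() != 1)
    return false;
  UseOperand = Def.Imm;
  return true;
}

int main() {
  ImmDef Single{42, {7}}, Multi{42, {7, 9}};
  int Op = 0;
  bool FoldedSingle = tryFoldImm(Single, Op);   // true, Op == 42
  bool FoldedMulti = tryFoldImm(Multi, Op);     // false, other users still need the mov
  return (FoldedSingle && !FoldedMulti) ? 0 : 1;
}
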
diff --git a/lib/Target/ARM/ARM.td b/lib/Target/ARM/ARM.td
index 96b4742da2bb..ef609a66d032 100644
--- a/lib/Target/ARM/ARM.td
+++ b/lib/Target/ARM/ARM.td
@@ -150,6 +150,10 @@ def FeatureAClass : SubtargetFeature<"aclass", "ARMProcClass", "AClass",
def FeatureNaClTrap : SubtargetFeature<"nacl-trap", "UseNaClTrap", "true",
"NaCl trap">;
+def FeatureLongCalls : SubtargetFeature<"long-calls", "GenLongCalls", "true",
+ "Generate calls via indirect call "
+ "instructions">;
+
// ARM ISAs.
def HasV4TOps : SubtargetFeature<"v4t", "HasV4TOps", "true",
"Support ARM v4T instructions">;
diff --git a/lib/Target/ARM/ARMBaseInstrInfo.cpp b/lib/Target/ARM/ARMBaseInstrInfo.cpp
index b1a11d626bda..9f43e732bd73 100644
--- a/lib/Target/ARM/ARMBaseInstrInfo.cpp
+++ b/lib/Target/ARM/ARMBaseInstrInfo.cpp
@@ -1230,8 +1230,7 @@ ARMBaseInstrInfo::expandPostRAPseudo(MachineBasicBlock::iterator MI) const {
Reloc::Model RM = MF.getTarget().getRelocationModel();
if (MI->getOpcode() == TargetOpcode::LOAD_STACK_GUARD) {
- assert(getSubtarget().getTargetTriple().getObjectFormat() ==
- Triple::MachO &&
+ assert(getSubtarget().getTargetTriple().isOSBinFormatMachO() &&
"LOAD_STACK_GUARD currently supported only for MachO.");
expandLoadStackGuard(MI, RM);
MI->getParent()->erase(MI);
diff --git a/lib/Target/ARM/ARMBaseRegisterInfo.cpp b/lib/Target/ARM/ARMBaseRegisterInfo.cpp
index 3f79a9b53d70..e7d5be7753e4 100644
--- a/lib/Target/ARM/ARMBaseRegisterInfo.cpp
+++ b/lib/Target/ARM/ARMBaseRegisterInfo.cpp
@@ -127,7 +127,7 @@ ARMBaseRegisterInfo::getThisReturnPreservedMask(const MachineFunction &MF,
BitVector ARMBaseRegisterInfo::
getReservedRegs(const MachineFunction &MF) const {
const ARMSubtarget &STI = MF.getSubtarget<ARMSubtarget>();
- const TargetFrameLowering *TFI = STI.getFrameLowering();
+ const ARMFrameLowering *TFI = getFrameLowering(MF);
// FIXME: avoid re-calculating this every time.
BitVector Reserved(getNumRegs());
@@ -194,7 +194,7 @@ unsigned
ARMBaseRegisterInfo::getRegPressureLimit(const TargetRegisterClass *RC,
MachineFunction &MF) const {
const ARMSubtarget &STI = MF.getSubtarget<ARMSubtarget>();
- const TargetFrameLowering *TFI = STI.getFrameLowering();
+ const ARMFrameLowering *TFI = getFrameLowering(MF);
switch (RC->getID()) {
default:
@@ -302,7 +302,7 @@ ARMBaseRegisterInfo::updateRegAllocHint(unsigned Reg, unsigned NewReg,
bool ARMBaseRegisterInfo::hasBasePointer(const MachineFunction &MF) const {
const MachineFrameInfo *MFI = MF.getFrameInfo();
const ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>();
- const TargetFrameLowering *TFI = MF.getSubtarget().getFrameLowering();
+ const ARMFrameLowering *TFI = getFrameLowering(MF);
// When outgoing call frames are so large that we adjust the stack pointer
// around the call, we can no longer use the stack pointer to reach the
@@ -333,6 +333,7 @@ bool ARMBaseRegisterInfo::hasBasePointer(const MachineFunction &MF) const {
bool ARMBaseRegisterInfo::canRealignStack(const MachineFunction &MF) const {
const MachineRegisterInfo *MRI = &MF.getRegInfo();
const ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>();
+ const ARMFrameLowering *TFI = getFrameLowering(MF);
// We can't realign the stack if:
// 1. Dynamic stack realignment is explicitly disabled,
// 2. This is a Thumb1 function (it's not useful, so we don't bother), or
@@ -347,7 +348,7 @@ bool ARMBaseRegisterInfo::canRealignStack(const MachineFunction &MF) const {
return false;
// We may also need a base pointer if there are dynamic allocas or stack
// pointer adjustments around calls.
- if (MF.getSubtarget().getFrameLowering()->hasReservedCallFrame(MF))
+ if (TFI->hasReservedCallFrame(MF))
return true;
// A base pointer is required and allowed. Check that it isn't too late to
// reserve it.
@@ -357,9 +358,9 @@ bool ARMBaseRegisterInfo::canRealignStack(const MachineFunction &MF) const {
bool ARMBaseRegisterInfo::
needsStackRealignment(const MachineFunction &MF) const {
const MachineFrameInfo *MFI = MF.getFrameInfo();
+ const ARMFrameLowering *TFI = getFrameLowering(MF);
const Function *F = MF.getFunction();
- unsigned StackAlign =
- MF.getSubtarget().getFrameLowering()->getStackAlignment();
+ unsigned StackAlign = TFI->getStackAlignment();
bool requiresRealignment = ((MFI->getMaxAlignment() > StackAlign) ||
F->hasFnAttribute(Attribute::StackAlignment));
@@ -378,7 +379,7 @@ cannotEliminateFrame(const MachineFunction &MF) const {
unsigned
ARMBaseRegisterInfo::getFrameRegister(const MachineFunction &MF) const {
const ARMSubtarget &STI = MF.getSubtarget<ARMSubtarget>();
- const TargetFrameLowering *TFI = STI.getFrameLowering();
+ const ARMFrameLowering *TFI = getFrameLowering(MF);
if (TFI->hasFP(MF))
return getFramePointerReg(STI);
@@ -517,7 +518,7 @@ needsFrameBaseReg(MachineInstr *MI, int64_t Offset) const {
// Note that the incoming offset is based on the SP value at function entry,
// so it'll be negative.
MachineFunction &MF = *MI->getParent()->getParent();
- const TargetFrameLowering *TFI = MF.getSubtarget().getFrameLowering();
+ const ARMFrameLowering *TFI = getFrameLowering(MF);
MachineFrameInfo *MFI = MF.getFrameInfo();
ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>();
@@ -694,8 +695,7 @@ ARMBaseRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator II,
MachineFunction &MF = *MBB.getParent();
const ARMBaseInstrInfo &TII =
*static_cast<const ARMBaseInstrInfo *>(MF.getSubtarget().getInstrInfo());
- const ARMFrameLowering *TFI = static_cast<const ARMFrameLowering *>(
- MF.getSubtarget().getFrameLowering());
+ const ARMFrameLowering *TFI = getFrameLowering(MF);
ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>();
assert(!AFI->isThumb1OnlyFunction() &&
"This eliminateFrameIndex does not support Thumb1!");
diff --git a/lib/Target/ARM/ARMCallingConv.td b/lib/Target/ARM/ARMCallingConv.td
index 7dd21ecbe91b..27cf06b995a0 100644
--- a/lib/Target/ARM/ARMCallingConv.td
+++ b/lib/Target/ARM/ARMCallingConv.td
@@ -142,6 +142,9 @@ def CC_ARM_AAPCS : CallingConv<[
// Handles byval parameters.
CCIfByVal<CCPassByVal<4, 4>>,
+ // The 'nest' parameter, if any, is passed in R12.
+ CCIfNest<CCAssignToReg<[R12]>>,
+
// Handle all vector types as either f64 or v2f64.
CCIfType<[v1i64, v2i32, v4i16, v8i8, v2f32], CCBitConvertToType<f64>>,
CCIfType<[v2i64, v4i32, v8i16, v16i8, v4f32], CCBitConvertToType<v2f64>>,
diff --git a/lib/Target/ARM/ARMFastISel.cpp b/lib/Target/ARM/ARMFastISel.cpp
index 4175b4af86e6..fdd0763ea608 100644
--- a/lib/Target/ARM/ARMFastISel.cpp
+++ b/lib/Target/ARM/ARMFastISel.cpp
@@ -49,8 +49,6 @@
#include "llvm/Target/TargetOptions.h"
using namespace llvm;
-extern cl::opt<bool> EnableARMLongCalls;
-
namespace {
// All possible address modes, plus some.
@@ -685,7 +683,7 @@ unsigned ARMFastISel::ARMMaterializeGV(const GlobalValue *GV, MVT VT) {
}
unsigned ARMFastISel::fastMaterializeConstant(const Constant *C) {
- EVT CEVT = TLI.getValueType(C->getType(), true);
+ EVT CEVT = TLI.getValueType(DL, C->getType(), true);
// Only handle simple types.
if (!CEVT.isSimple()) return 0;
@@ -732,7 +730,7 @@ unsigned ARMFastISel::fastMaterializeAlloca(const AllocaInst *AI) {
}
bool ARMFastISel::isTypeLegal(Type *Ty, MVT &VT) {
- EVT evt = TLI.getValueType(Ty, true);
+ EVT evt = TLI.getValueType(DL, Ty, true);
// Only handle simple types.
if (evt == MVT::Other || !evt.isSimple()) return false;
@@ -786,12 +784,13 @@ bool ARMFastISel::ARMComputeAddress(const Value *Obj, Address &Addr) {
return ARMComputeAddress(U->getOperand(0), Addr);
case Instruction::IntToPtr:
// Look past no-op inttoptrs.
- if (TLI.getValueType(U->getOperand(0)->getType()) == TLI.getPointerTy())
+ if (TLI.getValueType(DL, U->getOperand(0)->getType()) ==
+ TLI.getPointerTy(DL))
return ARMComputeAddress(U->getOperand(0), Addr);
break;
case Instruction::PtrToInt:
// Look past no-op ptrtoints.
- if (TLI.getValueType(U->getType()) == TLI.getPointerTy())
+ if (TLI.getValueType(DL, U->getType()) == TLI.getPointerTy(DL))
return ARMComputeAddress(U->getOperand(0), Addr);
break;
case Instruction::GetElementPtr: {
@@ -1365,7 +1364,7 @@ bool ARMFastISel::SelectIndirectBr(const Instruction *I) {
bool ARMFastISel::ARMEmitCmp(const Value *Src1Value, const Value *Src2Value,
bool isZExt) {
Type *Ty = Src1Value->getType();
- EVT SrcEVT = TLI.getValueType(Ty, true);
+ EVT SrcEVT = TLI.getValueType(DL, Ty, true);
if (!SrcEVT.isSimple()) return false;
MVT SrcVT = SrcEVT.getSimpleVT();
@@ -1557,7 +1556,7 @@ bool ARMFastISel::SelectIToFP(const Instruction *I, bool isSigned) {
return false;
Value *Src = I->getOperand(0);
- EVT SrcEVT = TLI.getValueType(Src->getType(), true);
+ EVT SrcEVT = TLI.getValueType(DL, Src->getType(), true);
if (!SrcEVT.isSimple())
return false;
MVT SrcVT = SrcEVT.getSimpleVT();
@@ -1750,7 +1749,7 @@ bool ARMFastISel::SelectRem(const Instruction *I, bool isSigned) {
}
bool ARMFastISel::SelectBinaryIntOp(const Instruction *I, unsigned ISDOpcode) {
- EVT DestVT = TLI.getValueType(I->getType(), true);
+ EVT DestVT = TLI.getValueType(DL, I->getType(), true);
// We can get here in the case when we have a binary operation on a non-legal
// type and the target independent selector doesn't know how to handle it.
@@ -1790,7 +1789,7 @@ bool ARMFastISel::SelectBinaryIntOp(const Instruction *I, unsigned ISDOpcode) {
}
bool ARMFastISel::SelectBinaryFPOp(const Instruction *I, unsigned ISDOpcode) {
- EVT FPVT = TLI.getValueType(I->getType(), true);
+ EVT FPVT = TLI.getValueType(DL, I->getType(), true);
if (!FPVT.isSimple()) return false;
MVT VT = FPVT.getSimpleVT();
@@ -2095,7 +2094,7 @@ bool ARMFastISel::SelectRet(const Instruction *I) {
CallingConv::ID CC = F.getCallingConv();
if (Ret->getNumOperands() > 0) {
SmallVector<ISD::OutputArg, 4> Outs;
- GetReturnInfo(F.getReturnType(), F.getAttributes(), Outs, TLI);
+ GetReturnInfo(F.getReturnType(), F.getAttributes(), Outs, TLI, DL);
// Analyze operands of the call, assigning locations to each operand.
SmallVector<CCValAssign, 16> ValLocs;
@@ -2122,7 +2121,7 @@ bool ARMFastISel::SelectRet(const Instruction *I) {
return false;
unsigned SrcReg = Reg + VA.getValNo();
- EVT RVEVT = TLI.getValueType(RV->getType());
+ EVT RVEVT = TLI.getValueType(DL, RV->getType());
if (!RVEVT.isSimple()) return false;
MVT RVVT = RVEVT.getSimpleVT();
MVT DestVT = VA.getValVT();
@@ -2173,7 +2172,7 @@ unsigned ARMFastISel::ARMSelectCallOp(bool UseReg) {
unsigned ARMFastISel::getLibcallReg(const Twine &Name) {
// Manually compute the global's type to avoid building it when unnecessary.
Type *GVTy = Type::getInt32PtrTy(*Context, /*AS=*/0);
- EVT LCREVT = TLI.getValueType(GVTy);
+ EVT LCREVT = TLI.getValueType(DL, GVTy);
if (!LCREVT.isSimple()) return 0;
GlobalValue *GV = new GlobalVariable(M, Type::getInt32Ty(*Context), false,
@@ -2246,19 +2245,19 @@ bool ARMFastISel::ARMEmitLibcall(const Instruction *I, RTLIB::Libcall Call) {
return false;
unsigned CalleeReg = 0;
- if (EnableARMLongCalls) {
+ if (Subtarget->genLongCalls()) {
CalleeReg = getLibcallReg(TLI.getLibcallName(Call));
if (CalleeReg == 0) return false;
}
// Issue the call.
- unsigned CallOpc = ARMSelectCallOp(EnableARMLongCalls);
+ unsigned CallOpc = ARMSelectCallOp(Subtarget->genLongCalls());
MachineInstrBuilder MIB = BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt,
DbgLoc, TII.get(CallOpc));
// BL / BLX don't take a predicate, but tBL / tBLX do.
if (isThumb2)
AddDefaultPred(MIB);
- if (EnableARMLongCalls)
+ if (Subtarget->genLongCalls())
MIB.addReg(CalleeReg);
else
MIB.addExternalSymbol(TLI.getLibcallName(Call));
@@ -2380,7 +2379,7 @@ bool ARMFastISel::SelectCall(const Instruction *I,
bool UseReg = false;
const GlobalValue *GV = dyn_cast<GlobalValue>(Callee);
- if (!GV || EnableARMLongCalls) UseReg = true;
+ if (!GV || Subtarget->genLongCalls()) UseReg = true;
unsigned CalleeReg = 0;
if (UseReg) {
@@ -2576,8 +2575,8 @@ bool ARMFastISel::SelectTrunc(const Instruction *I) {
Value *Op = I->getOperand(0);
EVT SrcVT, DestVT;
- SrcVT = TLI.getValueType(Op->getType(), true);
- DestVT = TLI.getValueType(I->getType(), true);
+ SrcVT = TLI.getValueType(DL, Op->getType(), true);
+ DestVT = TLI.getValueType(DL, I->getType(), true);
if (SrcVT != MVT::i32 && SrcVT != MVT::i16 && SrcVT != MVT::i8)
return false;
@@ -2742,8 +2741,8 @@ bool ARMFastISel::SelectIntExt(const Instruction *I) {
if (!SrcReg) return false;
EVT SrcEVT, DestEVT;
- SrcEVT = TLI.getValueType(SrcTy, true);
- DestEVT = TLI.getValueType(DestTy, true);
+ SrcEVT = TLI.getValueType(DL, SrcTy, true);
+ DestEVT = TLI.getValueType(DL, DestTy, true);
if (!SrcEVT.isSimple()) return false;
if (!DestEVT.isSimple()) return false;
@@ -2763,7 +2762,7 @@ bool ARMFastISel::SelectShift(const Instruction *I,
return false;
// Only handle i32 now.
- EVT DestVT = TLI.getValueType(I->getType(), true);
+ EVT DestVT = TLI.getValueType(DL, I->getType(), true);
if (DestVT != MVT::i32)
return false;
@@ -3026,7 +3025,7 @@ bool ARMFastISel::fastLowerArguments() {
if (ArgTy->isStructTy() || ArgTy->isArrayTy() || ArgTy->isVectorTy())
return false;
- EVT ArgVT = TLI.getValueType(ArgTy);
+ EVT ArgVT = TLI.getValueType(DL, ArgTy);
if (!ArgVT.isSimple()) return false;
switch (ArgVT.getSimpleVT().SimpleTy) {
case MVT::i8:
diff --git a/lib/Target/ARM/ARMFrameLowering.cpp b/lib/Target/ARM/ARMFrameLowering.cpp
index a52e49780e27..6744000afe2b 100644
--- a/lib/Target/ARM/ARMFrameLowering.cpp
+++ b/lib/Target/ARM/ARMFrameLowering.cpp
@@ -800,7 +800,7 @@ void ARMFrameLowering::emitEpilogue(MachineFunction &MF,
// This is bad, if an interrupt is taken after the mov, sp is in an
// inconsistent state.
// Use the first callee-saved register as a scratch register.
- assert(MF.getRegInfo().isPhysRegUsed(ARM::R4) &&
+ assert(!MFI->getPristineRegs(MF).test(ARM::R4) &&
"No scratch register to restore SP from FP!");
emitT2RegPlusImmediate(MBB, MBBI, dl, ARM::R4, FramePtr, -NumBytes,
ARMCC::AL, 0, TII);
@@ -1470,7 +1470,8 @@ static unsigned estimateRSStackSizeLimit(MachineFunction &MF,
// callee-saved vector registers after realigning the stack. The vst1 and vld1
// instructions take alignment hints that can improve performance.
//
-static void checkNumAlignedDPRCS2Regs(MachineFunction &MF) {
+static void
+checkNumAlignedDPRCS2Regs(MachineFunction &MF, BitVector &SavedRegs) {
MF.getInfo<ARMFunctionInfo>()->setNumAlignedDPRCS2Regs(0);
if (!SpillAlignedNEONRegs)
return;
@@ -1497,10 +1498,9 @@ static void checkNumAlignedDPRCS2Regs(MachineFunction &MF) {
// callee-saved registers in order, but it can happen that there are holes in
// the range. Registers above the hole will be spilled to the standard DPRCS
// area.
- MachineRegisterInfo &MRI = MF.getRegInfo();
unsigned NumSpills = 0;
for (; NumSpills < 8; ++NumSpills)
- if (!MRI.isPhysRegUsed(ARM::D8 + NumSpills))
+ if (!SavedRegs.test(ARM::D8 + NumSpills))
break;
// Don't do this for just one d-register. It's not worth it.
@@ -1511,12 +1511,13 @@ static void checkNumAlignedDPRCS2Regs(MachineFunction &MF) {
MF.getInfo<ARMFunctionInfo>()->setNumAlignedDPRCS2Regs(NumSpills);
// A scratch register is required for the vst1 / vld1 instructions.
- MF.getRegInfo().setPhysRegUsed(ARM::R4);
+ SavedRegs.set(ARM::R4);
}
-void
-ARMFrameLowering::processFunctionBeforeCalleeSavedScan(MachineFunction &MF,
- RegScavenger *RS) const {
+void ARMFrameLowering::determineCalleeSaves(MachineFunction &MF,
+ BitVector &SavedRegs,
+ RegScavenger *RS) const {
+ TargetFrameLowering::determineCalleeSaves(MF, SavedRegs, RS);
// This tells PEI to spill the FP as if it is any other callee-save register
// to take advantage the eliminateFrameIndex machinery. This also ensures it
// is spilled in the order specified by getCalleeSavedRegs() to make it easier
@@ -1543,12 +1544,12 @@ ARMFrameLowering::processFunctionBeforeCalleeSavedScan(MachineFunction &MF,
// FIXME: It will be better just to find spare register here.
if (AFI->isThumb2Function() &&
(MFI->hasVarSizedObjects() || RegInfo->needsStackRealignment(MF)))
- MRI.setPhysRegUsed(ARM::R4);
+ SavedRegs.set(ARM::R4);
if (AFI->isThumb1OnlyFunction()) {
// Spill LR if Thumb1 function uses variable length argument lists.
if (AFI->getArgRegsSaveSize() > 0)
- MRI.setPhysRegUsed(ARM::LR);
+ SavedRegs.set(ARM::LR);
// Spill R4 if Thumb1 epilogue has to restore SP from FP. We don't know
// for sure what the stack size will be, but for this, an estimate is good
@@ -1558,23 +1559,23 @@ ARMFrameLowering::processFunctionBeforeCalleeSavedScan(MachineFunction &MF,
// FIXME: It will be better just to find spare register here.
unsigned StackSize = MFI->estimateStackSize(MF);
if (MFI->hasVarSizedObjects() || StackSize > 508)
- MRI.setPhysRegUsed(ARM::R4);
+ SavedRegs.set(ARM::R4);
}
// See if we can spill vector registers to aligned stack.
- checkNumAlignedDPRCS2Regs(MF);
+ checkNumAlignedDPRCS2Regs(MF, SavedRegs);
// Spill the BasePtr if it's used.
if (RegInfo->hasBasePointer(MF))
- MRI.setPhysRegUsed(RegInfo->getBaseRegister());
+ SavedRegs.set(RegInfo->getBaseRegister());
// Don't spill FP if the frame can be eliminated. This is determined
- // by scanning the callee-save registers to see if any is used.
+ // by scanning the callee-save registers to see if any is modified.
const MCPhysReg *CSRegs = RegInfo->getCalleeSavedRegs(&MF);
for (unsigned i = 0; CSRegs[i]; ++i) {
unsigned Reg = CSRegs[i];
bool Spilled = false;
- if (MRI.isPhysRegUsed(Reg)) {
+ if (SavedRegs.test(Reg)) {
Spilled = true;
CanEliminateFrame = false;
}
@@ -1668,7 +1669,7 @@ ARMFrameLowering::processFunctionBeforeCalleeSavedScan(MachineFunction &MF,
// If LR is not spilled, but at least one of R4, R5, R6, and R7 is spilled.
// Spill LR as well so we can fold BX_RET to the registers restore (LDM).
if (!LRSpilled && CS1Spilled) {
- MRI.setPhysRegUsed(ARM::LR);
+ SavedRegs.set(ARM::LR);
NumGPRSpills++;
SmallVectorImpl<unsigned>::iterator LRPos;
LRPos = std::find(UnspilledCS1GPRs.begin(), UnspilledCS1GPRs.end(),
@@ -1681,7 +1682,7 @@ ARMFrameLowering::processFunctionBeforeCalleeSavedScan(MachineFunction &MF,
}
if (hasFP(MF)) {
- MRI.setPhysRegUsed(FramePtr);
+ SavedRegs.set(FramePtr);
auto FPPos = std::find(UnspilledCS1GPRs.begin(), UnspilledCS1GPRs.end(),
FramePtr);
if (FPPos != UnspilledCS1GPRs.end())
@@ -1700,7 +1701,7 @@ ARMFrameLowering::processFunctionBeforeCalleeSavedScan(MachineFunction &MF,
// Don't spill high register if the function is thumb
if (!AFI->isThumbFunction() ||
isARMLowRegister(Reg) || Reg == ARM::LR) {
- MRI.setPhysRegUsed(Reg);
+ SavedRegs.set(Reg);
if (!MRI.isReserved(Reg))
ExtraCSSpill = true;
break;
@@ -1708,7 +1709,7 @@ ARMFrameLowering::processFunctionBeforeCalleeSavedScan(MachineFunction &MF,
}
} else if (!UnspilledCS2GPRs.empty() && !AFI->isThumb1OnlyFunction()) {
unsigned Reg = UnspilledCS2GPRs.front();
- MRI.setPhysRegUsed(Reg);
+ SavedRegs.set(Reg);
if (!MRI.isReserved(Reg))
ExtraCSSpill = true;
}
@@ -1747,7 +1748,7 @@ ARMFrameLowering::processFunctionBeforeCalleeSavedScan(MachineFunction &MF,
}
if (Extras.size() && NumExtras == 0) {
for (unsigned i = 0, e = Extras.size(); i != e; ++i) {
- MRI.setPhysRegUsed(Extras[i]);
+ SavedRegs.set(Extras[i]);
}
} else if (!AFI->isThumb1OnlyFunction()) {
// note: Thumb1 functions spill to R12, not the stack. Reserve a slot
@@ -1761,7 +1762,7 @@ ARMFrameLowering::processFunctionBeforeCalleeSavedScan(MachineFunction &MF,
}
if (ForceLRSpill) {
- MRI.setPhysRegUsed(ARM::LR);
+ SavedRegs.set(ARM::LR);
AFI->setLRIsSpilledForFarJump(true);
}
}
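
The processFunctionBeforeCalleeSavedScan to determineCalleeSaves rename changes the contract: instead of marking physical registers as used in MachineRegisterInfo, the hook sets bits in a caller-provided SavedRegs vector and the generic frame-lowering code does the spilling. A toy sketch of the new shape (the register numbering is illustrative):

#include <bitset>
#include <cstdio>

enum Reg { R4 = 4, R7 = 7, LR = 14, NumRegs = 16 };   // illustrative numbering

// The hook only records which registers must be saved; the caller owns the
// set and performs the actual callee-save spills.
static void determineCalleeSaves(std::bitset<NumRegs> &SavedRegs,
                                 bool NeedsScratch, bool HasFP) {
  if (NeedsScratch)
    SavedRegs.set(R4);   // scratch register, e.g. for aligned DPR spills
  if (HasFP)
    SavedRegs.set(R7);   // frame pointer spilled like any other callee-save
}

int main() {
  std::bitset<NumRegs> SavedRegs;
  determineCalleeSaves(SavedRegs, /*NeedsScratch=*/true, /*HasFP=*/true);
  std::printf("%zu registers to save\n", SavedRegs.count());
}
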
diff --git a/lib/Target/ARM/ARMFrameLowering.h b/lib/Target/ARM/ARMFrameLowering.h
index d763d17a506f..6fdc5eff5e47 100644
--- a/lib/Target/ARM/ARMFrameLowering.h
+++ b/lib/Target/ARM/ARMFrameLowering.h
@@ -54,8 +54,8 @@ public:
unsigned &FrameReg, int SPAdj) const;
int getFrameIndexOffset(const MachineFunction &MF, int FI) const override;
- void processFunctionBeforeCalleeSavedScan(MachineFunction &MF,
- RegScavenger *RS) const override;
+ void determineCalleeSaves(MachineFunction &MF, BitVector &SavedRegs,
+ RegScavenger *RS) const override;
void adjustForSegmentedStacks(MachineFunction &MF,
MachineBasicBlock &MBB) const override;
diff --git a/lib/Target/ARM/ARMISelDAGToDAG.cpp b/lib/Target/ARM/ARMISelDAGToDAG.cpp
index 50afb192b331..b110628a0a86 100644
--- a/lib/Target/ARM/ARMISelDAGToDAG.cpp
+++ b/lib/Target/ARM/ARMISelDAGToDAG.cpp
@@ -533,7 +533,8 @@ bool ARMDAGToDAGISel::SelectAddrModeImm12(SDValue N,
if (N.getOpcode() == ISD::FrameIndex) {
// Match frame index.
int FI = cast<FrameIndexSDNode>(N)->getIndex();
- Base = CurDAG->getTargetFrameIndex(FI, TLI->getPointerTy());
+ Base = CurDAG->getTargetFrameIndex(
+ FI, TLI->getPointerTy(CurDAG->getDataLayout()));
OffImm = CurDAG->getTargetConstant(0, SDLoc(N), MVT::i32);
return true;
}
@@ -556,7 +557,8 @@ bool ARMDAGToDAGISel::SelectAddrModeImm12(SDValue N,
Base = N.getOperand(0);
if (Base.getOpcode() == ISD::FrameIndex) {
int FI = cast<FrameIndexSDNode>(Base)->getIndex();
- Base = CurDAG->getTargetFrameIndex(FI, TLI->getPointerTy());
+ Base = CurDAG->getTargetFrameIndex(
+ FI, TLI->getPointerTy(CurDAG->getDataLayout()));
}
OffImm = CurDAG->getTargetConstant(RHSC, SDLoc(N), MVT::i32);
return true;
@@ -702,7 +704,8 @@ AddrMode2Type ARMDAGToDAGISel::SelectAddrMode2Worker(SDValue N,
Base = N;
if (N.getOpcode() == ISD::FrameIndex) {
int FI = cast<FrameIndexSDNode>(N)->getIndex();
- Base = CurDAG->getTargetFrameIndex(FI, TLI->getPointerTy());
+ Base = CurDAG->getTargetFrameIndex(
+ FI, TLI->getPointerTy(CurDAG->getDataLayout()));
} else if (N.getOpcode() == ARMISD::Wrapper &&
N.getOperand(0).getOpcode() != ISD::TargetGlobalAddress) {
Base = N.getOperand(0);
@@ -722,7 +725,8 @@ AddrMode2Type ARMDAGToDAGISel::SelectAddrMode2Worker(SDValue N,
Base = N.getOperand(0);
if (Base.getOpcode() == ISD::FrameIndex) {
int FI = cast<FrameIndexSDNode>(Base)->getIndex();
- Base = CurDAG->getTargetFrameIndex(FI, TLI->getPointerTy());
+ Base = CurDAG->getTargetFrameIndex(
+ FI, TLI->getPointerTy(CurDAG->getDataLayout()));
}
Offset = CurDAG->getRegister(0, MVT::i32);
@@ -900,7 +904,8 @@ bool ARMDAGToDAGISel::SelectAddrMode3(SDValue N,
Base = N;
if (N.getOpcode() == ISD::FrameIndex) {
int FI = cast<FrameIndexSDNode>(N)->getIndex();
- Base = CurDAG->getTargetFrameIndex(FI, TLI->getPointerTy());
+ Base = CurDAG->getTargetFrameIndex(
+ FI, TLI->getPointerTy(CurDAG->getDataLayout()));
}
Offset = CurDAG->getRegister(0, MVT::i32);
Opc = CurDAG->getTargetConstant(ARM_AM::getAM3Opc(ARM_AM::add, 0), SDLoc(N),
@@ -915,7 +920,8 @@ bool ARMDAGToDAGISel::SelectAddrMode3(SDValue N,
Base = N.getOperand(0);
if (Base.getOpcode() == ISD::FrameIndex) {
int FI = cast<FrameIndexSDNode>(Base)->getIndex();
- Base = CurDAG->getTargetFrameIndex(FI, TLI->getPointerTy());
+ Base = CurDAG->getTargetFrameIndex(
+ FI, TLI->getPointerTy(CurDAG->getDataLayout()));
}
Offset = CurDAG->getRegister(0, MVT::i32);
@@ -964,7 +970,8 @@ bool ARMDAGToDAGISel::SelectAddrMode5(SDValue N,
Base = N;
if (N.getOpcode() == ISD::FrameIndex) {
int FI = cast<FrameIndexSDNode>(N)->getIndex();
- Base = CurDAG->getTargetFrameIndex(FI, TLI->getPointerTy());
+ Base = CurDAG->getTargetFrameIndex(
+ FI, TLI->getPointerTy(CurDAG->getDataLayout()));
} else if (N.getOpcode() == ARMISD::Wrapper &&
N.getOperand(0).getOpcode() != ISD::TargetGlobalAddress) {
Base = N.getOperand(0);
@@ -981,7 +988,8 @@ bool ARMDAGToDAGISel::SelectAddrMode5(SDValue N,
Base = N.getOperand(0);
if (Base.getOpcode() == ISD::FrameIndex) {
int FI = cast<FrameIndexSDNode>(Base)->getIndex();
- Base = CurDAG->getTargetFrameIndex(FI, TLI->getPointerTy());
+ Base = CurDAG->getTargetFrameIndex(
+ FI, TLI->getPointerTy(CurDAG->getDataLayout()));
}
ARM_AM::AddrOpc AddSub = ARM_AM::add;
@@ -1215,7 +1223,8 @@ bool ARMDAGToDAGISel::SelectThumbAddrModeSP(SDValue N,
MachineFrameInfo *MFI = MF->getFrameInfo();
if (MFI->getObjectAlignment(FI) < 4)
MFI->setObjectAlignment(FI, 4);
- Base = CurDAG->getTargetFrameIndex(FI, TLI->getPointerTy());
+ Base = CurDAG->getTargetFrameIndex(
+ FI, TLI->getPointerTy(CurDAG->getDataLayout()));
OffImm = CurDAG->getTargetConstant(0, SDLoc(N), MVT::i32);
return true;
}
@@ -1237,7 +1246,8 @@ bool ARMDAGToDAGISel::SelectThumbAddrModeSP(SDValue N,
MachineFrameInfo *MFI = MF->getFrameInfo();
if (MFI->getObjectAlignment(FI) < 4)
MFI->setObjectAlignment(FI, 4);
- Base = CurDAG->getTargetFrameIndex(FI, TLI->getPointerTy());
+ Base = CurDAG->getTargetFrameIndex(
+ FI, TLI->getPointerTy(CurDAG->getDataLayout()));
}
OffImm = CurDAG->getTargetConstant(RHSC, SDLoc(N), MVT::i32);
return true;
@@ -1285,7 +1295,8 @@ bool ARMDAGToDAGISel::SelectT2AddrModeImm12(SDValue N,
if (N.getOpcode() == ISD::FrameIndex) {
// Match frame index.
int FI = cast<FrameIndexSDNode>(N)->getIndex();
- Base = CurDAG->getTargetFrameIndex(FI, TLI->getPointerTy());
+ Base = CurDAG->getTargetFrameIndex(
+ FI, TLI->getPointerTy(CurDAG->getDataLayout()));
OffImm = CurDAG->getTargetConstant(0, SDLoc(N), MVT::i32);
return true;
}
@@ -1314,7 +1325,8 @@ bool ARMDAGToDAGISel::SelectT2AddrModeImm12(SDValue N,
Base = N.getOperand(0);
if (Base.getOpcode() == ISD::FrameIndex) {
int FI = cast<FrameIndexSDNode>(Base)->getIndex();
- Base = CurDAG->getTargetFrameIndex(FI, TLI->getPointerTy());
+ Base = CurDAG->getTargetFrameIndex(
+ FI, TLI->getPointerTy(CurDAG->getDataLayout()));
}
OffImm = CurDAG->getTargetConstant(RHSC, SDLoc(N), MVT::i32);
return true;
@@ -1343,7 +1355,8 @@ bool ARMDAGToDAGISel::SelectT2AddrModeImm8(SDValue N,
Base = N.getOperand(0);
if (Base.getOpcode() == ISD::FrameIndex) {
int FI = cast<FrameIndexSDNode>(Base)->getIndex();
- Base = CurDAG->getTargetFrameIndex(FI, TLI->getPointerTy());
+ Base = CurDAG->getTargetFrameIndex(
+ FI, TLI->getPointerTy(CurDAG->getDataLayout()));
}
OffImm = CurDAG->getTargetConstant(RHSC, SDLoc(N), MVT::i32);
return true;
@@ -1438,7 +1451,8 @@ bool ARMDAGToDAGISel::SelectT2AddrModeExclusive(SDValue N, SDValue &Base,
Base = N.getOperand(0);
if (Base.getOpcode() == ISD::FrameIndex) {
int FI = cast<FrameIndexSDNode>(Base)->getIndex();
- Base = CurDAG->getTargetFrameIndex(FI, TLI->getPointerTy());
+ Base = CurDAG->getTargetFrameIndex(
+ FI, TLI->getPointerTy(CurDAG->getDataLayout()));
}
OffImm = CurDAG->getTargetConstant(RHSC/4, SDLoc(N), MVT::i32);
@@ -2510,7 +2524,7 @@ SDNode *ARMDAGToDAGISel::Select(SDNode *N) {
if (UseCP) {
SDValue CPIdx = CurDAG->getTargetConstantPool(
ConstantInt::get(Type::getInt32Ty(*CurDAG->getContext()), Val),
- TLI->getPointerTy());
+ TLI->getPointerTy(CurDAG->getDataLayout()));
SDNode *ResNode;
if (Subtarget->isThumb()) {
@@ -2540,7 +2554,8 @@ SDNode *ARMDAGToDAGISel::Select(SDNode *N) {
case ISD::FrameIndex: {
// Selects to ADDri FI, 0 which in turn will become ADDri SP, imm.
int FI = cast<FrameIndexSDNode>(N)->getIndex();
- SDValue TFI = CurDAG->getTargetFrameIndex(FI, TLI->getPointerTy());
+ SDValue TFI = CurDAG->getTargetFrameIndex(
+ FI, TLI->getPointerTy(CurDAG->getDataLayout()));
if (Subtarget->isThumb1Only()) {
// Set the alignment of the frame object to 4, to avoid having to generate
// more than one ADD
diff --git a/lib/Target/ARM/ARMISelLowering.cpp b/lib/Target/ARM/ARMISelLowering.cpp
index 4b2105b7442f..e335784f6d87 100644
--- a/lib/Target/ARM/ARMISelLowering.cpp
+++ b/lib/Target/ARM/ARMISelLowering.cpp
@@ -60,11 +60,6 @@ STATISTIC(NumTailCalls, "Number of tail calls");
STATISTIC(NumMovwMovt, "Number of GAs materialized with movw + movt");
STATISTIC(NumLoopByVals, "Number of loops generated for byval arguments");
-cl::opt<bool>
-EnableARMLongCalls("arm-long-calls", cl::Hidden,
- cl::desc("Generate calls via indirect call instructions"),
- cl::init(false));
-
static cl::opt<bool>
ARMInterworking("arm-interworking", cl::Hidden,
cl::desc("Enable / disable ARM interworking (for debugging only)"),
@@ -548,6 +543,27 @@ ARMTargetLowering::ARMTargetLowering(const TargetMachine &TM,
setOperationAction(ISD::CTPOP, MVT::v4i16, Custom);
setOperationAction(ISD::CTPOP, MVT::v8i16, Custom);
+ // NEON does not have single instruction CTTZ for vectors.
+ setOperationAction(ISD::CTTZ, MVT::v8i8, Custom);
+ setOperationAction(ISD::CTTZ, MVT::v4i16, Custom);
+ setOperationAction(ISD::CTTZ, MVT::v2i32, Custom);
+ setOperationAction(ISD::CTTZ, MVT::v1i64, Custom);
+
+ setOperationAction(ISD::CTTZ, MVT::v16i8, Custom);
+ setOperationAction(ISD::CTTZ, MVT::v8i16, Custom);
+ setOperationAction(ISD::CTTZ, MVT::v4i32, Custom);
+ setOperationAction(ISD::CTTZ, MVT::v2i64, Custom);
+
+ setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::v8i8, Custom);
+ setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::v4i16, Custom);
+ setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::v2i32, Custom);
+ setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::v1i64, Custom);
+
+ setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::v16i8, Custom);
+ setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::v8i16, Custom);
+ setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::v4i32, Custom);
+ setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::v2i64, Custom);
+
// NEON only has FMA instructions as of VFP4.
if (!Subtarget->hasVFP4()) {
setOperationAction(ISD::FMA, MVT::v2f32, Expand);
@@ -1149,8 +1165,10 @@ const char *ARMTargetLowering::getTargetNodeName(unsigned Opcode) const {
return nullptr;
}
-EVT ARMTargetLowering::getSetCCResultType(LLVMContext &, EVT VT) const {
- if (!VT.isVector()) return getPointerTy();
+EVT ARMTargetLowering::getSetCCResultType(const DataLayout &DL, LLVMContext &,
+ EVT VT) const {
+ if (!VT.isVector())
+ return getPointerTy(DL);
return VT.changeVectorElementTypeToInteger();
}
@@ -1429,7 +1447,8 @@ ARMTargetLowering::LowerMemOpCallTo(SDValue Chain,
ISD::ArgFlagsTy Flags) const {
unsigned LocMemOffset = VA.getLocMemOffset();
SDValue PtrOff = DAG.getIntPtrConstant(LocMemOffset, dl);
- PtrOff = DAG.getNode(ISD::ADD, dl, getPointerTy(), StackPtr, PtrOff);
+ PtrOff = DAG.getNode(ISD::ADD, dl, getPointerTy(DAG.getDataLayout()),
+ StackPtr, PtrOff);
return DAG.getStore(Chain, dl, Arg, PtrOff,
MachinePointerInfo::getStack(LocMemOffset),
false, false, 0);
@@ -1453,7 +1472,8 @@ void ARMTargetLowering::PassF64ArgInRegs(SDLoc dl, SelectionDAG &DAG,
else {
assert(NextVA.isMemLoc());
if (!StackPtr.getNode())
- StackPtr = DAG.getCopyFromReg(Chain, dl, ARM::SP, getPointerTy());
+ StackPtr = DAG.getCopyFromReg(Chain, dl, ARM::SP,
+ getPointerTy(DAG.getDataLayout()));
MemOpChains.push_back(LowerMemOpCallTo(Chain, StackPtr, fmrrd.getValue(1-id),
dl, DAG, NextVA,
@@ -1526,7 +1546,8 @@ ARMTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
Chain = DAG.getCALLSEQ_START(Chain,
DAG.getIntPtrConstant(NumBytes, dl, true), dl);
- SDValue StackPtr = DAG.getCopyFromReg(Chain, dl, ARM::SP, getPointerTy());
+ SDValue StackPtr =
+ DAG.getCopyFromReg(Chain, dl, ARM::SP, getPointerTy(DAG.getDataLayout()));
RegsToPassVector RegsToPass;
SmallVector<SDValue, 8> MemOpChains;
@@ -1607,7 +1628,8 @@ ARMTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
unsigned RegBegin, RegEnd;
CCInfo.getInRegsParamInfo(CurByValIdx, RegBegin, RegEnd);
- EVT PtrVT = DAG.getTargetLoweringInfo().getPointerTy();
+ EVT PtrVT =
+ DAG.getTargetLoweringInfo().getPointerTy(DAG.getDataLayout());
unsigned int i, j;
for (i = 0, j = RegBegin; j < RegEnd; i++, j++) {
SDValue Const = DAG.getConstant(4*i, dl, MVT::i32);
@@ -1628,12 +1650,12 @@ ARMTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
}
if (Flags.getByValSize() > 4*offset) {
+ auto PtrVT = getPointerTy(DAG.getDataLayout());
unsigned LocMemOffset = VA.getLocMemOffset();
SDValue StkPtrOff = DAG.getIntPtrConstant(LocMemOffset, dl);
- SDValue Dst = DAG.getNode(ISD::ADD, dl, getPointerTy(), StackPtr,
- StkPtrOff);
+ SDValue Dst = DAG.getNode(ISD::ADD, dl, PtrVT, StackPtr, StkPtrOff);
SDValue SrcOffset = DAG.getIntPtrConstant(4*offset, dl);
- SDValue Src = DAG.getNode(ISD::ADD, dl, getPointerTy(), Arg, SrcOffset);
+ SDValue Src = DAG.getNode(ISD::ADD, dl, PtrVT, Arg, SrcOffset);
SDValue SizeNode = DAG.getConstant(Flags.getByValSize() - 4*offset, dl,
MVT::i32);
SDValue AlignNode = DAG.getConstant(Flags.getByValAlign(), dl,
@@ -1693,8 +1715,9 @@ ARMTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
bool isARMFunc = false;
bool isLocalARMFunc = false;
ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>();
+ auto PtrVt = getPointerTy(DAG.getDataLayout());
- if (EnableARMLongCalls) {
+ if (Subtarget->genLongCalls()) {
assert((Subtarget->isTargetWindows() ||
getTargetMachine().getRelocationModel() == Reloc::Static) &&
"long-calls with non-static relocation model!");
@@ -1709,12 +1732,11 @@ ARMTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
ARMConstantPoolConstant::Create(GV, ARMPCLabelIndex, ARMCP::CPValue, 0);
// Get the address of the callee into a register
- SDValue CPAddr = DAG.getTargetConstantPool(CPV, getPointerTy(), 4);
+ SDValue CPAddr = DAG.getTargetConstantPool(CPV, PtrVt, 4);
CPAddr = DAG.getNode(ARMISD::Wrapper, dl, MVT::i32, CPAddr);
- Callee = DAG.getLoad(getPointerTy(), dl,
- DAG.getEntryNode(), CPAddr,
- MachinePointerInfo::getConstantPool(),
- false, false, false, 0);
+ Callee = DAG.getLoad(PtrVt, dl, DAG.getEntryNode(), CPAddr,
+ MachinePointerInfo::getConstantPool(), false, false,
+ false, 0);
} else if (ExternalSymbolSDNode *S=dyn_cast<ExternalSymbolSDNode>(Callee)) {
const char *Sym = S->getSymbol();
@@ -1724,29 +1746,28 @@ ARMTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
ARMConstantPoolSymbol::Create(*DAG.getContext(), Sym,
ARMPCLabelIndex, 0);
// Get the address of the callee into a register
- SDValue CPAddr = DAG.getTargetConstantPool(CPV, getPointerTy(), 4);
+ SDValue CPAddr = DAG.getTargetConstantPool(CPV, PtrVt, 4);
CPAddr = DAG.getNode(ARMISD::Wrapper, dl, MVT::i32, CPAddr);
- Callee = DAG.getLoad(getPointerTy(), dl,
- DAG.getEntryNode(), CPAddr,
- MachinePointerInfo::getConstantPool(),
- false, false, false, 0);
+ Callee = DAG.getLoad(PtrVt, dl, DAG.getEntryNode(), CPAddr,
+ MachinePointerInfo::getConstantPool(), false, false,
+ false, 0);
}
} else if (GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee)) {
const GlobalValue *GV = G->getGlobal();
isDirect = true;
- bool isExt = GV->isDeclaration() || GV->isWeakForLinker();
- bool isStub = (isExt && Subtarget->isTargetMachO()) &&
+ bool isDef = GV->isStrongDefinitionForLinker();
+ bool isStub = (!isDef && Subtarget->isTargetMachO()) &&
getTargetMachine().getRelocationModel() != Reloc::Static;
isARMFunc = !Subtarget->isThumb() || (isStub && !Subtarget->isMClass());
// ARM call to a local ARM function is predicable.
- isLocalARMFunc = !Subtarget->isThumb() && (!isExt || !ARMInterworking);
+ isLocalARMFunc = !Subtarget->isThumb() && (isDef || !ARMInterworking);
// tBX takes a register source operand.
if (isStub && Subtarget->isThumb1Only() && !Subtarget->hasV5TOps()) {
assert(Subtarget->isTargetMachO() && "WrapperPIC use on non-MachO?");
- Callee = DAG.getNode(ARMISD::WrapperPIC, dl, getPointerTy(),
- DAG.getTargetGlobalAddress(GV, dl, getPointerTy(),
- 0, ARMII::MO_NONLAZY));
- Callee = DAG.getLoad(getPointerTy(), dl, DAG.getEntryNode(), Callee,
+ Callee = DAG.getNode(
+ ARMISD::WrapperPIC, dl, PtrVt,
+ DAG.getTargetGlobalAddress(GV, dl, PtrVt, 0, ARMII::MO_NONLAZY));
+ Callee = DAG.getLoad(PtrVt, dl, DAG.getEntryNode(), Callee,
MachinePointerInfo::getGOT(), false, false, true, 0);
} else if (Subtarget->isTargetCOFF()) {
assert(Subtarget->isTargetWindows() &&
@@ -1754,20 +1775,20 @@ ARMTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
unsigned TargetFlags = GV->hasDLLImportStorageClass()
? ARMII::MO_DLLIMPORT
: ARMII::MO_NO_FLAG;
- Callee = DAG.getTargetGlobalAddress(GV, dl, getPointerTy(), /*Offset=*/0,
- TargetFlags);
+ Callee =
+ DAG.getTargetGlobalAddress(GV, dl, PtrVt, /*Offset=*/0, TargetFlags);
if (GV->hasDLLImportStorageClass())
- Callee = DAG.getLoad(getPointerTy(), dl, DAG.getEntryNode(),
- DAG.getNode(ARMISD::Wrapper, dl, getPointerTy(),
- Callee), MachinePointerInfo::getGOT(),
- false, false, false, 0);
+ Callee =
+ DAG.getLoad(PtrVt, dl, DAG.getEntryNode(),
+ DAG.getNode(ARMISD::Wrapper, dl, PtrVt, Callee),
+ MachinePointerInfo::getGOT(), false, false, false, 0);
} else {
// On ELF targets for PIC code, direct calls should go through the PLT
unsigned OpFlags = 0;
if (Subtarget->isTargetELF() &&
getTargetMachine().getRelocationModel() == Reloc::PIC_)
OpFlags = ARMII::MO_PLT;
- Callee = DAG.getTargetGlobalAddress(GV, dl, getPointerTy(), 0, OpFlags);
+ Callee = DAG.getTargetGlobalAddress(GV, dl, PtrVt, 0, OpFlags);
}
} else if (ExternalSymbolSDNode *S = dyn_cast<ExternalSymbolSDNode>(Callee)) {
isDirect = true;
@@ -1781,22 +1802,20 @@ ARMTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
ARMConstantPoolValue *CPV =
ARMConstantPoolSymbol::Create(*DAG.getContext(), Sym,
ARMPCLabelIndex, 4);
- SDValue CPAddr = DAG.getTargetConstantPool(CPV, getPointerTy(), 4);
+ SDValue CPAddr = DAG.getTargetConstantPool(CPV, PtrVt, 4);
CPAddr = DAG.getNode(ARMISD::Wrapper, dl, MVT::i32, CPAddr);
- Callee = DAG.getLoad(getPointerTy(), dl,
- DAG.getEntryNode(), CPAddr,
- MachinePointerInfo::getConstantPool(),
- false, false, false, 0);
+ Callee = DAG.getLoad(PtrVt, dl, DAG.getEntryNode(), CPAddr,
+ MachinePointerInfo::getConstantPool(), false, false,
+ false, 0);
SDValue PICLabel = DAG.getConstant(ARMPCLabelIndex, dl, MVT::i32);
- Callee = DAG.getNode(ARMISD::PIC_ADD, dl,
- getPointerTy(), Callee, PICLabel);
+ Callee = DAG.getNode(ARMISD::PIC_ADD, dl, PtrVt, Callee, PICLabel);
} else {
unsigned OpFlags = 0;
// On ELF targets for PIC code, direct calls should go through the PLT
if (Subtarget->isTargetELF() &&
getTargetMachine().getRelocationModel() == Reloc::PIC_)
OpFlags = ARMII::MO_PLT;
- Callee = DAG.getTargetExternalSymbol(Sym, getPointerTy(), OpFlags);
+ Callee = DAG.getTargetExternalSymbol(Sym, PtrVt, OpFlags);
}
}
@@ -2433,7 +2452,7 @@ SDValue ARMTargetLowering::LowerBlockAddress(SDValue Op,
ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>();
unsigned ARMPCLabelIndex = 0;
SDLoc DL(Op);
- EVT PtrVT = getPointerTy();
+ EVT PtrVT = getPointerTy(DAG.getDataLayout());
const BlockAddress *BA = cast<BlockAddressSDNode>(Op)->getBlockAddress();
Reloc::Model RelocM = getTargetMachine().getRelocationModel();
SDValue CPAddr;
@@ -2462,7 +2481,7 @@ SDValue
ARMTargetLowering::LowerToTLSGeneralDynamicModel(GlobalAddressSDNode *GA,
SelectionDAG &DAG) const {
SDLoc dl(GA);
- EVT PtrVT = getPointerTy();
+ EVT PtrVT = getPointerTy(DAG.getDataLayout());
unsigned char PCAdj = Subtarget->isThumb() ? 4 : 8;
MachineFunction &MF = DAG.getMachineFunction();
ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>();
@@ -2508,7 +2527,7 @@ ARMTargetLowering::LowerToTLSExecModels(GlobalAddressSDNode *GA,
SDLoc dl(GA);
SDValue Offset;
SDValue Chain = DAG.getEntryNode();
- EVT PtrVT = getPointerTy();
+ EVT PtrVT = getPointerTy(DAG.getDataLayout());
// Get the Thread Pointer
SDValue ThreadPointer = DAG.getNode(ARMISD::THREAD_POINTER, dl, PtrVT);
@@ -2574,7 +2593,7 @@ ARMTargetLowering::LowerGlobalTLSAddress(SDValue Op, SelectionDAG &DAG) const {
SDValue ARMTargetLowering::LowerGlobalAddressELF(SDValue Op,
SelectionDAG &DAG) const {
- EVT PtrVT = getPointerTy();
+ EVT PtrVT = getPointerTy(DAG.getDataLayout());
SDLoc dl(Op);
const GlobalValue *GV = cast<GlobalAddressSDNode>(Op)->getGlobal();
if (getTargetMachine().getRelocationModel() == Reloc::PIC_) {
@@ -2617,7 +2636,7 @@ SDValue ARMTargetLowering::LowerGlobalAddressELF(SDValue Op,
SDValue ARMTargetLowering::LowerGlobalAddressDarwin(SDValue Op,
SelectionDAG &DAG) const {
- EVT PtrVT = getPointerTy();
+ EVT PtrVT = getPointerTy(DAG.getDataLayout());
SDLoc dl(Op);
const GlobalValue *GV = cast<GlobalAddressSDNode>(Op)->getGlobal();
Reloc::Model RelocM = getTargetMachine().getRelocationModel();
@@ -2648,7 +2667,7 @@ SDValue ARMTargetLowering::LowerGlobalAddressWindows(SDValue Op,
const GlobalValue *GV = cast<GlobalAddressSDNode>(Op)->getGlobal();
const ARMII::TOF TargetFlags =
(GV->hasDLLImportStorageClass() ? ARMII::MO_DLLIMPORT : ARMII::MO_NO_FLAG);
- EVT PtrVT = getPointerTy();
+ EVT PtrVT = getPointerTy(DAG.getDataLayout());
SDValue Result;
SDLoc DL(Op);
@@ -2672,7 +2691,7 @@ SDValue ARMTargetLowering::LowerGLOBAL_OFFSET_TABLE(SDValue Op,
MachineFunction &MF = DAG.getMachineFunction();
ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>();
unsigned ARMPCLabelIndex = AFI->createPICLabelUId();
- EVT PtrVT = getPointerTy();
+ EVT PtrVT = getPointerTy(DAG.getDataLayout());
SDLoc dl(Op);
unsigned PCAdj = Subtarget->isThumb() ? 4 : 8;
ARMConstantPoolValue *CPV =
@@ -2716,14 +2735,14 @@ ARMTargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op, SelectionDAG &DAG,
return DAG.getNode(ARMISD::RBIT, dl, MVT::i32, Op.getOperand(1));
}
case Intrinsic::arm_thread_pointer: {
- EVT PtrVT = DAG.getTargetLoweringInfo().getPointerTy();
+ EVT PtrVT = getPointerTy(DAG.getDataLayout());
return DAG.getNode(ARMISD::THREAD_POINTER, dl, PtrVT);
}
case Intrinsic::eh_sjlj_lsda: {
MachineFunction &MF = DAG.getMachineFunction();
ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>();
unsigned ARMPCLabelIndex = AFI->createPICLabelUId();
- EVT PtrVT = getPointerTy();
+ EVT PtrVT = getPointerTy(DAG.getDataLayout());
Reloc::Model RelocM = getTargetMachine().getRelocationModel();
SDValue CPAddr;
unsigned PCAdj = (RelocM != Reloc::PIC_)
@@ -2820,7 +2839,7 @@ static SDValue LowerVASTART(SDValue Op, SelectionDAG &DAG) {
// vastart just stores the address of the VarArgsFrameIndex slot into the
// memory location argument.
SDLoc dl(Op);
- EVT PtrVT = DAG.getTargetLoweringInfo().getPointerTy();
+ EVT PtrVT = DAG.getTargetLoweringInfo().getPointerTy(DAG.getDataLayout());
SDValue FR = DAG.getFrameIndex(FuncInfo->getVarArgsFrameIndex(), PtrVT);
const Value *SV = cast<SrcValueSDNode>(Op.getOperand(2))->getValue();
return DAG.getStore(Op.getOperand(0), dl, FR, Op.getOperand(1),
@@ -2850,7 +2869,7 @@ ARMTargetLowering::GetF64FormalArgument(CCValAssign &VA, CCValAssign &NextVA,
int FI = MFI->CreateFixedObject(4, NextVA.getLocMemOffset(), true);
// Create load node to retrieve arguments from the stack.
- SDValue FIN = DAG.getFrameIndex(FI, getPointerTy());
+ SDValue FIN = DAG.getFrameIndex(FI, getPointerTy(DAG.getDataLayout()));
ArgValue2 = DAG.getLoad(MVT::i32, dl, Root, FIN,
MachinePointerInfo::getFixedStack(FI),
false, false, false, 0);
@@ -2904,8 +2923,9 @@ ARMTargetLowering::StoreByValRegs(CCState &CCInfo, SelectionDAG &DAG,
if (REnd != RBegin)
ArgOffset = -4 * (ARM::R4 - RBegin);
+ auto PtrVT = getPointerTy(DAG.getDataLayout());
int FrameIndex = MFI->CreateFixedObject(ArgSize, ArgOffset, false);
- SDValue FIN = DAG.getFrameIndex(FrameIndex, getPointerTy());
+ SDValue FIN = DAG.getFrameIndex(FrameIndex, PtrVT);
SmallVector<SDValue, 4> MemOps;
const TargetRegisterClass *RC =
@@ -2918,8 +2938,7 @@ ARMTargetLowering::StoreByValRegs(CCState &CCInfo, SelectionDAG &DAG,
DAG.getStore(Val.getValue(1), dl, Val, FIN,
MachinePointerInfo(OrigArg, 4 * i), false, false, 0);
MemOps.push_back(Store);
- FIN = DAG.getNode(ISD::ADD, dl, getPointerTy(), FIN,
- DAG.getConstant(4, dl, getPointerTy()));
+ FIN = DAG.getNode(ISD::ADD, dl, PtrVT, FIN, DAG.getConstant(4, dl, PtrVT));
}
if (!MemOps.empty())
@@ -3013,6 +3032,7 @@ ARMTargetLowering::LowerFormalArguments(SDValue Chain,
unsigned TotalArgRegsSaveSize = 4 * (ARM::R4 - ArgRegBegin);
AFI->setArgRegsSaveSize(TotalArgRegsSaveSize);
+ auto PtrVT = getPointerTy(DAG.getDataLayout());
for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
CCValAssign &VA = ArgLocs[i];
@@ -3035,7 +3055,7 @@ ARMTargetLowering::LowerFormalArguments(SDValue Chain,
SDValue ArgValue2;
if (VA.isMemLoc()) {
int FI = MFI->CreateFixedObject(8, VA.getLocMemOffset(), true);
- SDValue FIN = DAG.getFrameIndex(FI, getPointerTy());
+ SDValue FIN = DAG.getFrameIndex(FI, PtrVT);
ArgValue2 = DAG.getLoad(MVT::f64, dl, Chain, FIN,
MachinePointerInfo::getFixedStack(FI),
false, false, false, 0);
@@ -3122,7 +3142,7 @@ ARMTargetLowering::LowerFormalArguments(SDValue Chain,
int FrameIndex = StoreByValRegs(CCInfo, DAG, dl, Chain, CurOrigArg,
CurByValIndex, VA.getLocMemOffset(),
Flags.getByValSize());
- InVals.push_back(DAG.getFrameIndex(FrameIndex, getPointerTy()));
+ InVals.push_back(DAG.getFrameIndex(FrameIndex, PtrVT));
CCInfo.nextInRegsParam();
} else {
unsigned FIOffset = VA.getLocMemOffset();
@@ -3130,7 +3150,7 @@ ARMTargetLowering::LowerFormalArguments(SDValue Chain,
FIOffset, true);
// Create load nodes to retrieve arguments from the stack.
- SDValue FIN = DAG.getFrameIndex(FI, getPointerTy());
+ SDValue FIN = DAG.getFrameIndex(FI, PtrVT);
InVals.push_back(DAG.getLoad(VA.getValVT(), dl, Chain, FIN,
MachinePointerInfo::getFixedStack(FI),
false, false, false, 0));
@@ -3855,7 +3875,7 @@ SDValue ARMTargetLowering::LowerBR_JT(SDValue Op, SelectionDAG &DAG) const {
SDValue Index = Op.getOperand(2);
SDLoc dl(Op);
- EVT PTy = getPointerTy();
+ EVT PTy = getPointerTy(DAG.getDataLayout());
JumpTableSDNode *JT = cast<JumpTableSDNode>(Table);
SDValue JTI = DAG.getTargetJumpTable(JT->getIndex(), PTy);
Table = DAG.getNode(ARMISD::WrapperJT, dl, MVT::i32, JTI);
@@ -4102,8 +4122,8 @@ SDValue ARMTargetLowering::LowerFRAMEADDR(SDValue Op, SelectionDAG &DAG) const {
// FIXME? Maybe this could be a TableGen attribute on some registers and
// this table could be generated automatically from RegInfo.
-unsigned ARMTargetLowering::getRegisterByName(const char* RegName,
- EVT VT) const {
+unsigned ARMTargetLowering::getRegisterByName(const char* RegName, EVT VT,
+ SelectionDAG &DAG) const {
unsigned Reg = StringSwitch<unsigned>(RegName)
.Case("sp", ARM::SP)
.Default(0);
@@ -4163,7 +4183,7 @@ static SDValue ExpandBITCAST(SDNode *N, SelectionDAG &DAG) {
// Turn f64->i64 into VMOVRRD.
if (DstVT == MVT::i64 && TLI.isTypeLegal(SrcVT)) {
SDValue Cvt;
- if (TLI.isBigEndian() && SrcVT.isVector() &&
+ if (DAG.getDataLayout().isBigEndian() && SrcVT.isVector() &&
SrcVT.getVectorNumElements() > 1)
Cvt = DAG.getNode(ARMISD::VMOVRRD, dl,
DAG.getVTList(MVT::i32, MVT::i32),
@@ -4283,8 +4303,82 @@ SDValue ARMTargetLowering::LowerFLT_ROUNDS_(SDValue Op,
static SDValue LowerCTTZ(SDNode *N, SelectionDAG &DAG,
const ARMSubtarget *ST) {
- EVT VT = N->getValueType(0);
SDLoc dl(N);
+ EVT VT = N->getValueType(0);
+ if (VT.isVector()) {
+ assert(ST->hasNEON());
+
+ // Compute the least significant set bit: LSB = X & -X
+ SDValue X = N->getOperand(0);
+ SDValue NX = DAG.getNode(ISD::SUB, dl, VT, getZeroVector(VT, DAG, dl), X);
+ SDValue LSB = DAG.getNode(ISD::AND, dl, VT, X, NX);
+
+ EVT ElemTy = VT.getVectorElementType();
+
+ if (ElemTy == MVT::i8) {
+ // Compute with: cttz(x) = ctpop(lsb - 1)
+ SDValue One = DAG.getNode(ARMISD::VMOVIMM, dl, VT,
+ DAG.getTargetConstant(1, dl, ElemTy));
+ SDValue Bits = DAG.getNode(ISD::SUB, dl, VT, LSB, One);
+ return DAG.getNode(ISD::CTPOP, dl, VT, Bits);
+ }
+
+ if ((ElemTy == MVT::i16 || ElemTy == MVT::i32) &&
+ (N->getOpcode() == ISD::CTTZ_ZERO_UNDEF)) {
+ // Compute with: cttz(x) = (width - 1) - ctlz(lsb), if x != 0
+ unsigned NumBits = ElemTy.getSizeInBits();
+ SDValue WidthMinus1 =
+ DAG.getNode(ARMISD::VMOVIMM, dl, VT,
+ DAG.getTargetConstant(NumBits - 1, dl, ElemTy));
+ SDValue CTLZ = DAG.getNode(ISD::CTLZ, dl, VT, LSB);
+ return DAG.getNode(ISD::SUB, dl, VT, WidthMinus1, CTLZ);
+ }
+
+ // Compute with: cttz(x) = ctpop(lsb - 1)
+
+ // Since we can only compute the number of bits in a byte with vcnt.8, we
+ // have to gather the result with pairwise addition (vpaddl) for i16, i32,
+ // and i64.
+
+ // Compute LSB - 1.
+ SDValue Bits;
+ if (ElemTy == MVT::i64) {
+ // Load constant 0xffff'ffff'ffff'ffff to register.
+ SDValue FF = DAG.getNode(ARMISD::VMOVIMM, dl, VT,
+ DAG.getTargetConstant(0x1eff, dl, MVT::i32));
+ Bits = DAG.getNode(ISD::ADD, dl, VT, LSB, FF);
+ } else {
+ SDValue One = DAG.getNode(ARMISD::VMOVIMM, dl, VT,
+ DAG.getTargetConstant(1, dl, ElemTy));
+ Bits = DAG.getNode(ISD::SUB, dl, VT, LSB, One);
+ }
+
+ // Count #bits with vcnt.8.
+ EVT VT8Bit = VT.is64BitVector() ? MVT::v8i8 : MVT::v16i8;
+ SDValue BitsVT8 = DAG.getNode(ISD::BITCAST, dl, VT8Bit, Bits);
+ SDValue Cnt8 = DAG.getNode(ISD::CTPOP, dl, VT8Bit, BitsVT8);
+
+ // Gather the #bits with vpaddl (pairwise add.)
+ EVT VT16Bit = VT.is64BitVector() ? MVT::v4i16 : MVT::v8i16;
+ SDValue Cnt16 = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT16Bit,
+ DAG.getTargetConstant(Intrinsic::arm_neon_vpaddlu, dl, MVT::i32),
+ Cnt8);
+ if (ElemTy == MVT::i16)
+ return Cnt16;
+
+ EVT VT32Bit = VT.is64BitVector() ? MVT::v2i32 : MVT::v4i32;
+ SDValue Cnt32 = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT32Bit,
+ DAG.getTargetConstant(Intrinsic::arm_neon_vpaddlu, dl, MVT::i32),
+ Cnt16);
+ if (ElemTy == MVT::i32)
+ return Cnt32;
+
+ assert(ElemTy == MVT::i64);
+ SDValue Cnt64 = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT,
+ DAG.getTargetConstant(Intrinsic::arm_neon_vpaddlu, dl, MVT::i32),
+ Cnt32);
+ return Cnt64;
+ }
if (!ST->hasV6T2Ops())
return SDValue();
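
The vector CTTZ lowering added in this hunk leans on bit identities rather than a dedicated count-trailing-zeros instruction: cttz(x) = ctpop((x & -x) - 1), and, when x is known nonzero, cttz(x) = (width - 1) - ctlz(x & -x); for elements wider than a byte the byte-wise popcount is then gathered with pairwise widening adds (vcnt.8 followed by vpaddl). A small scalar sketch of those identities in standard C++20, separate from the LLVM code above; all helper names are illustrative:

    #include <bit>
    #include <cassert>
    #include <cstdint>
    #include <cstdio>

    static unsigned cttz_via_ctpop(uint32_t X) {
      uint32_t LSB = X & (0u - X);     // isolate lowest set bit (0 if X == 0)
      return std::popcount(LSB - 1u);  // (LSB - 1) has exactly cttz(X) bits set
    }

    static unsigned cttz_via_ctlz(uint32_t X) {
      assert(X != 0 && "zero is undefined here, as with CTTZ_ZERO_UNDEF");
      uint32_t LSB = X & (0u - X);
      return 31u - std::countl_zero(LSB);  // LSB is a power of two
    }

    // Byte-wise popcount followed by pairwise widening adds, mimicking the
    // vcnt.8 -> vpaddl.u8 -> vpaddl.u16 chain for a single i32 lane.
    static unsigned popcount_bytes_then_pairwise(uint32_t X) {
      unsigned B0 = std::popcount(X & 0xffu);
      unsigned B1 = std::popcount((X >> 8) & 0xffu);
      unsigned B2 = std::popcount((X >> 16) & 0xffu);
      unsigned B3 = std::popcount((X >> 24) & 0xffu);
      unsigned H0 = B0 + B1, H1 = B2 + B3;  // pairwise i8 sums -> i16
      return H0 + H1;                       // pairwise i16 sum  -> i32
    }

    int main() {
      for (uint32_t X : {1u, 0x8u, 0x80000000u, 0xf0f0u}) {
        assert(cttz_via_ctpop(X) == static_cast<unsigned>(std::countr_zero(X)));
        assert(cttz_via_ctlz(X) == static_cast<unsigned>(std::countr_zero(X)));
        assert(popcount_bytes_then_pairwise(X) ==
               static_cast<unsigned>(std::popcount(X)));
      }
      // For X == 0, LSB - 1 wraps to all-ones, so the ctpop form yields 32,
      // matching the semantics of ISD::CTTZ.
      std::printf("cttz(0) via ctpop: %u\n", cttz_via_ctpop(0));
      return 0;
    }
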
@@ -4730,7 +4824,7 @@ static SDValue isNEONModifiedImm(uint64_t SplatBits, uint64_t SplatUndef,
ImmMask <<= 1;
}
- if (DAG.getTargetLoweringInfo().isBigEndian())
+ if (DAG.getDataLayout().isBigEndian())
// swap higher and lower 32 bit word
Imm = ((Imm & 0xf) << 4) | ((Imm & 0xf0) >> 4);
@@ -5868,7 +5962,7 @@ static bool isExtendedBUILD_VECTOR(SDNode *N, SelectionDAG &DAG,
if (BVN->getValueType(0) != MVT::v4i32 ||
BVN->getOpcode() != ISD::BUILD_VECTOR)
return false;
- unsigned LoElt = DAG.getTargetLoweringInfo().isBigEndian() ? 1 : 0;
+ unsigned LoElt = DAG.getDataLayout().isBigEndian() ? 1 : 0;
unsigned HiElt = 1 - LoElt;
ConstantSDNode *Lo0 = dyn_cast<ConstantSDNode>(BVN->getOperand(LoElt));
ConstantSDNode *Hi0 = dyn_cast<ConstantSDNode>(BVN->getOperand(HiElt));
@@ -6013,7 +6107,7 @@ static SDValue SkipExtensionForVMULL(SDNode *N, SelectionDAG &DAG) {
SDNode *BVN = N->getOperand(0).getNode();
assert(BVN->getOpcode() == ISD::BUILD_VECTOR &&
BVN->getValueType(0) == MVT::v4i32 && "expected v4i32 BUILD_VECTOR");
- unsigned LowElt = DAG.getTargetLoweringInfo().isBigEndian() ? 1 : 0;
+ unsigned LowElt = DAG.getDataLayout().isBigEndian() ? 1 : 0;
return DAG.getNode(ISD::BUILD_VECTOR, SDLoc(N), MVT::v2i32,
BVN->getOperand(LowElt), BVN->getOperand(LowElt+2));
}
@@ -6342,18 +6436,19 @@ SDValue ARMTargetLowering::LowerFSINCOS(SDValue Op, SelectionDAG &DAG) const {
SDValue Arg = Op.getOperand(0);
EVT ArgVT = Arg.getValueType();
Type *ArgTy = ArgVT.getTypeForEVT(*DAG.getContext());
+ auto PtrVT = getPointerTy(DAG.getDataLayout());
MachineFrameInfo *FrameInfo = DAG.getMachineFunction().getFrameInfo();
- const TargetLowering &TLI = DAG.getTargetLoweringInfo();
// Pair of floats / doubles used to pass the result.
StructType *RetTy = StructType::get(ArgTy, ArgTy, nullptr);
// Create stack object for sret.
- const uint64_t ByteSize = TLI.getDataLayout()->getTypeAllocSize(RetTy);
- const unsigned StackAlign = TLI.getDataLayout()->getPrefTypeAlignment(RetTy);
+ auto &DL = DAG.getDataLayout();
+ const uint64_t ByteSize = DL.getTypeAllocSize(RetTy);
+ const unsigned StackAlign = DL.getPrefTypeAlignment(RetTy);
int FrameIdx = FrameInfo->CreateStackObject(ByteSize, StackAlign, false);
- SDValue SRet = DAG.getFrameIndex(FrameIdx, TLI.getPointerTy());
+ SDValue SRet = DAG.getFrameIndex(FrameIdx, getPointerTy(DL));
ArgListTy Args;
ArgListEntry Entry;
@@ -6373,7 +6468,7 @@ SDValue ARMTargetLowering::LowerFSINCOS(SDValue Op, SelectionDAG &DAG) const {
const char *LibcallName = (ArgVT == MVT::f64)
? "__sincos_stret" : "__sincosf_stret";
- SDValue Callee = DAG.getExternalSymbol(LibcallName, getPointerTy());
+ SDValue Callee = DAG.getExternalSymbol(LibcallName, getPointerTy(DL));
TargetLowering::CallLoweringInfo CLI(DAG);
CLI.setDebugLoc(dl).setChain(DAG.getEntryNode())
@@ -6387,7 +6482,7 @@ SDValue ARMTargetLowering::LowerFSINCOS(SDValue Op, SelectionDAG &DAG) const {
MachinePointerInfo(), false, false, false, 0);
// Address of cos field.
- SDValue Add = DAG.getNode(ISD::ADD, dl, getPointerTy(), SRet,
+ SDValue Add = DAG.getNode(ISD::ADD, dl, PtrVT, SRet,
DAG.getIntPtrConstant(ArgVT.getStoreSize(), dl));
SDValue LoadCos = DAG.getLoad(ArgVT, dl, LoadSin.getValue(1), Add,
MachinePointerInfo(), false, false, false, 0);
@@ -6487,7 +6582,8 @@ SDValue ARMTargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
case ISD::SHL_PARTS: return LowerShiftLeftParts(Op, DAG);
case ISD::SRL_PARTS:
case ISD::SRA_PARTS: return LowerShiftRightParts(Op, DAG);
- case ISD::CTTZ: return LowerCTTZ(Op.getNode(), DAG, Subtarget);
+ case ISD::CTTZ:
+ case ISD::CTTZ_ZERO_UNDEF: return LowerCTTZ(Op.getNode(), DAG, Subtarget);
case ISD::CTPOP: return LowerCTPOP(Op.getNode(), DAG, Subtarget);
case ISD::SETCC: return LowerVSETCC(Op, DAG);
case ISD::ConstantFP: return LowerConstantFP(Op, DAG, Subtarget);
@@ -6845,9 +6941,9 @@ void ARMTargetLowering::EmitSjLjDispatchBlock(MachineInstr *MI,
const Constant *C = ConstantInt::get(Int32Ty, NumLPads);
// MachineConstantPool wants an explicit alignment.
- unsigned Align = getDataLayout()->getPrefTypeAlignment(Int32Ty);
+ unsigned Align = MF->getDataLayout().getPrefTypeAlignment(Int32Ty);
if (Align == 0)
- Align = getDataLayout()->getTypeAllocSize(C->getType());
+ Align = MF->getDataLayout().getTypeAllocSize(C->getType());
unsigned Idx = ConstantPool->getConstantPoolIndex(C, Align);
unsigned VReg1 = MRI->createVirtualRegister(TRC);
@@ -6935,9 +7031,9 @@ void ARMTargetLowering::EmitSjLjDispatchBlock(MachineInstr *MI,
const Constant *C = ConstantInt::get(Int32Ty, NumLPads);
// MachineConstantPool wants an explicit alignment.
- unsigned Align = getDataLayout()->getPrefTypeAlignment(Int32Ty);
+ unsigned Align = MF->getDataLayout().getPrefTypeAlignment(Int32Ty);
if (Align == 0)
- Align = getDataLayout()->getTypeAllocSize(C->getType());
+ Align = MF->getDataLayout().getTypeAllocSize(C->getType());
unsigned Idx = ConstantPool->getConstantPoolIndex(C, Align);
unsigned VReg1 = MRI->createVirtualRegister(TRC);
@@ -7313,9 +7409,9 @@ ARMTargetLowering::EmitStructByval(MachineInstr *MI,
const Constant *C = ConstantInt::get(Int32Ty, LoopSize);
// MachineConstantPool wants an explicit alignment.
- unsigned Align = getDataLayout()->getPrefTypeAlignment(Int32Ty);
+ unsigned Align = MF->getDataLayout().getPrefTypeAlignment(Int32Ty);
if (Align == 0)
- Align = getDataLayout()->getTypeAllocSize(C->getType());
+ Align = MF->getDataLayout().getTypeAllocSize(C->getType());
unsigned Idx = ConstantPool->getConstantPoolIndex(C, Align);
if (IsThumb1)
@@ -8001,7 +8097,7 @@ static SDValue AddCombineToVPADDL(SDNode *N, SDValue N0, SDValue N1,
// Build operand list.
SmallVector<SDValue, 8> Ops;
Ops.push_back(DAG.getConstant(Intrinsic::arm_neon_vpaddls, dl,
- TLI.getPointerTy()));
+ TLI.getPointerTy(DAG.getDataLayout())));
// Input is the vector.
Ops.push_back(Vec);
@@ -8681,7 +8777,7 @@ static SDValue PerformVMOVRRDCombine(SDNode *N,
std::min(4U, LD->getAlignment() / 2));
DAG.ReplaceAllUsesOfValueWith(SDValue(LD, 1), NewLD2.getValue(1));
- if (DCI.DAG.getTargetLoweringInfo().isBigEndian())
+ if (DCI.DAG.getDataLayout().isBigEndian())
std::swap (NewLD1, NewLD2);
SDValue Result = DCI.CombineTo(N, NewLD1, NewLD2);
return Result;
@@ -9312,7 +9408,9 @@ static SDValue PerformSTORECombine(SDNode *N,
SDValue WideVec = DAG.getNode(ISD::BITCAST, DL, WideVecVT, StVal);
SmallVector<int, 8> ShuffleVec(NumElems * SizeRatio, -1);
for (unsigned i = 0; i < NumElems; ++i)
- ShuffleVec[i] = TLI.isBigEndian() ? (i+1) * SizeRatio - 1 : i * SizeRatio;
+ ShuffleVec[i] = DAG.getDataLayout().isBigEndian()
+ ? (i + 1) * SizeRatio - 1
+ : i * SizeRatio;
// Can't shuffle using an illegal type.
if (!TLI.isTypeLegal(WideVecVT)) return SDValue();
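
The shuffle mask built above picks, for each original (wider) element, the sub-element of the bitcast vector that holds its low-order, truncated bits; that sub-element sits at a different position depending on endianness, which is why the index rule differs. A small host-side sketch of the rule, independent of LLVM (names are illustrative):

    #include <cstdint>
    #include <cstdio>
    #include <cstring>

    int main() {
      const uint32_t Elems[4] = {0x11110001u, 0x22220002u, 0x33330003u,
                                 0x44440004u};
      uint16_t Lanes[8];
      std::memcpy(Lanes, Elems, sizeof(Elems)); // the "bitcast" v4i32 -> v8i16

      const unsigned SizeRatio = sizeof(uint32_t) / sizeof(uint16_t); // 2
      uint16_t Probe = 1;
      bool IsLittleEndian = *reinterpret_cast<unsigned char *>(&Probe) == 1;

      for (unsigned i = 0; i != 4; ++i) {
        // Little-endian: the low half of element i is lane i*SizeRatio.
        // Big-endian: it is lane (i+1)*SizeRatio - 1, the last sub-lane.
        unsigned Idx = IsLittleEndian ? i * SizeRatio : (i + 1) * SizeRatio - 1;
        std::printf("element %u: truncated value 0x%04x at lane %u\n", i,
                    static_cast<unsigned>(Lanes[Idx]), Idx);
      }
      return 0;
    }
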
@@ -9339,8 +9437,8 @@ static SDValue PerformSTORECombine(SDNode *N,
assert(StoreVecVT.getSizeInBits() == VT.getSizeInBits());
SDValue ShuffWide = DAG.getNode(ISD::BITCAST, DL, StoreVecVT, Shuff);
SmallVector<SDValue, 8> Chains;
- SDValue Increment = DAG.getConstant(StoreType.getSizeInBits()/8, DL,
- TLI.getPointerTy());
+ SDValue Increment = DAG.getConstant(StoreType.getSizeInBits() / 8, DL,
+ TLI.getPointerTy(DAG.getDataLayout()));
SDValue BasePtr = St->getBasePtr();
// Perform one or more big stores into memory.
@@ -9367,7 +9465,7 @@ static SDValue PerformSTORECombine(SDNode *N,
if (StVal.getNode()->getOpcode() == ARMISD::VMOVDRR &&
StVal.getNode()->hasOneUse()) {
SelectionDAG &DAG = DCI.DAG;
- bool isBigEndian = DAG.getTargetLoweringInfo().isBigEndian();
+ bool isBigEndian = DAG.getDataLayout().isBigEndian();
SDLoc DL(St);
SDValue BasePtr = St->getBasePtr();
SDValue NewST1 = DAG.getStore(St->getChain(), DL,
@@ -10078,7 +10176,7 @@ bool ARMTargetLowering::allowsMisalignedMemoryAccesses(EVT VT,
// For any little-endian targets with neon, we can support unaligned ld/st
// of D and Q (e.g. {D0,D1}) registers by using vld1.i8/vst1.i8.
// A big-endian target may also explicitly support unaligned accesses
- if (Subtarget->hasNEON() && (AllowsUnaligned || isLittleEndian())) {
+ if (Subtarget->hasNEON() && (AllowsUnaligned || Subtarget->isLittle())) {
if (Fast)
*Fast = true;
return true;
@@ -10317,10 +10415,10 @@ bool ARMTargetLowering::isLegalT2ScaledAddressingMode(const AddrMode &AM,
/// isLegalAddressingMode - Return true if the addressing mode represented
/// by AM is legal for this target, for a load/store of the specified type.
-bool ARMTargetLowering::isLegalAddressingMode(const AddrMode &AM,
- Type *Ty,
+bool ARMTargetLowering::isLegalAddressingMode(const DataLayout &DL,
+ const AddrMode &AM, Type *Ty,
unsigned AS) const {
- EVT VT = getValueType(Ty, true);
+ EVT VT = getValueType(DL, Ty, true);
if (!isLegalAddressImmediate(AM.BaseOffs, VT, Subtarget))
return false;
@@ -10664,7 +10762,7 @@ bool ARMTargetLowering::ExpandInlineAsm(CallInst *CI) const {
/// getConstraintType - Given a constraint letter, return the type of
/// constraint it is for this target.
ARMTargetLowering::ConstraintType
-ARMTargetLowering::getConstraintType(const std::string &Constraint) const {
+ARMTargetLowering::getConstraintType(StringRef Constraint) const {
if (Constraint.size() == 1) {
switch (Constraint[0]) {
default: break;
@@ -10723,10 +10821,8 @@ ARMTargetLowering::getSingleConstraintMatchWeight(
}
typedef std::pair<unsigned, const TargetRegisterClass*> RCPair;
-RCPair
-ARMTargetLowering::getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI,
- const std::string &Constraint,
- MVT VT) const {
+RCPair ARMTargetLowering::getRegForInlineAsmConstraint(
+ const TargetRegisterInfo *TRI, StringRef Constraint, MVT VT) const {
if (Constraint.size() == 1) {
// GCC ARM Constraint Letters
switch (Constraint[0]) {
@@ -10974,7 +11070,7 @@ SDValue ARMTargetLowering::LowerDivRem(SDValue Op, SelectionDAG &DAG) const {
}
SDValue Callee = DAG.getExternalSymbol(getLibcallName(LC),
- getPointerTy());
+ getPointerTy(DAG.getDataLayout()));
Type *RetTy = (Type*)StructType::get(Ty, Ty, nullptr);
@@ -11083,7 +11179,8 @@ bool ARMTargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info,
case Intrinsic::arm_neon_vld4lane: {
Info.opc = ISD::INTRINSIC_W_CHAIN;
// Conservatively set memVT to the entire set of vectors loaded.
- uint64_t NumElts = getDataLayout()->getTypeAllocSize(I.getType()) / 8;
+ auto &DL = I.getCalledFunction()->getParent()->getDataLayout();
+ uint64_t NumElts = DL.getTypeAllocSize(I.getType()) / 8;
Info.memVT = EVT::getVectorVT(I.getType()->getContext(), MVT::i64, NumElts);
Info.ptrVal = I.getArgOperand(0);
Info.offset = 0;
@@ -11103,12 +11200,13 @@ bool ARMTargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info,
case Intrinsic::arm_neon_vst4lane: {
Info.opc = ISD::INTRINSIC_VOID;
// Conservatively set memVT to the entire set of vectors stored.
+ auto &DL = I.getCalledFunction()->getParent()->getDataLayout();
unsigned NumElts = 0;
for (unsigned ArgI = 1, ArgE = I.getNumArgOperands(); ArgI < ArgE; ++ArgI) {
Type *ArgTy = I.getArgOperand(ArgI)->getType();
if (!ArgTy->isVectorTy())
break;
- NumElts += getDataLayout()->getTypeAllocSize(ArgTy) / 8;
+ NumElts += DL.getTypeAllocSize(ArgTy) / 8;
}
Info.memVT = EVT::getVectorVT(I.getType()->getContext(), MVT::i64, NumElts);
Info.ptrVal = I.getArgOperand(0);
@@ -11122,12 +11220,13 @@ bool ARMTargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info,
}
case Intrinsic::arm_ldaex:
case Intrinsic::arm_ldrex: {
+ auto &DL = I.getCalledFunction()->getParent()->getDataLayout();
PointerType *PtrTy = cast<PointerType>(I.getArgOperand(0)->getType());
Info.opc = ISD::INTRINSIC_W_CHAIN;
Info.memVT = MVT::getVT(PtrTy->getElementType());
Info.ptrVal = I.getArgOperand(0);
Info.offset = 0;
- Info.align = getDataLayout()->getABITypeAlignment(PtrTy->getElementType());
+ Info.align = DL.getABITypeAlignment(PtrTy->getElementType());
Info.vol = true;
Info.readMem = true;
Info.writeMem = false;
@@ -11135,12 +11234,13 @@ bool ARMTargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info,
}
case Intrinsic::arm_stlex:
case Intrinsic::arm_strex: {
+ auto &DL = I.getCalledFunction()->getParent()->getDataLayout();
PointerType *PtrTy = cast<PointerType>(I.getArgOperand(1)->getType());
Info.opc = ISD::INTRINSIC_W_CHAIN;
Info.memVT = MVT::getVT(PtrTy->getElementType());
Info.ptrVal = I.getArgOperand(1);
Info.offset = 0;
- Info.align = getDataLayout()->getABITypeAlignment(PtrTy->getElementType());
+ Info.align = DL.getABITypeAlignment(PtrTy->getElementType());
Info.vol = true;
Info.readMem = false;
Info.writeMem = true;
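
With the DataLayout no longer cached on TargetLowering, getTgtMemIntrinsic reaches it through the intrinsic call's callee and its parent module, as the hunks above show. A hedged sketch of that access path, assuming the call has a direct (intrinsic) callee; the helper name is hypothetical:

    // Illustrative only; compiles in-tree against LLVM headers, not standalone.
    #include "llvm/IR/DataLayout.h"
    #include "llvm/IR/DerivedTypes.h"
    #include "llvm/IR/Function.h"
    #include "llvm/IR/Instructions.h"
    #include "llvm/IR/Module.h"

    using namespace llvm;

    static unsigned abiAlignmentOfPointee(const CallInst &I, unsigned ArgIdx) {
      // Call -> callee Function -> parent Module -> DataLayout.
      const DataLayout &DL =
          I.getCalledFunction()->getParent()->getDataLayout();
      auto *PtrTy = cast<PointerType>(I.getArgOperand(ArgIdx)->getType());
      return DL.getABITypeAlignment(PtrTy->getElementType());
    }
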
@@ -11427,9 +11527,9 @@ bool ARMTargetLowering::lowerInterleavedLoad(
VectorType *VecTy = Shuffles[0]->getType();
Type *EltTy = VecTy->getVectorElementType();
- const DataLayout *DL = getDataLayout();
- unsigned VecSize = DL->getTypeAllocSizeInBits(VecTy);
- bool EltIs64Bits = DL->getTypeAllocSizeInBits(EltTy) == 64;
+ const DataLayout &DL = LI->getModule()->getDataLayout();
+ unsigned VecSize = DL.getTypeAllocSizeInBits(VecTy);
+ bool EltIs64Bits = DL.getTypeAllocSizeInBits(EltTy) == 64;
// Skip illegal vector types and vector types of i64/f64 element (vldN doesn't
// support i64/f64 element).
@@ -11439,8 +11539,8 @@ bool ARMTargetLowering::lowerInterleavedLoad(
// A pointer vector can not be the return type of the ldN intrinsics. Need to
// load integer vectors first and then convert to pointer vectors.
if (EltTy->isPointerTy())
- VecTy = VectorType::get(DL->getIntPtrType(EltTy),
- VecTy->getVectorNumElements());
+ VecTy =
+ VectorType::get(DL.getIntPtrType(EltTy), VecTy->getVectorNumElements());
static const Intrinsic::ID LoadInts[3] = {Intrinsic::arm_neon_vld2,
Intrinsic::arm_neon_vld3,
@@ -11517,9 +11617,9 @@ bool ARMTargetLowering::lowerInterleavedStore(StoreInst *SI,
Type *EltTy = VecTy->getVectorElementType();
VectorType *SubVecTy = VectorType::get(EltTy, NumSubElts);
- const DataLayout *DL = getDataLayout();
- unsigned SubVecSize = DL->getTypeAllocSizeInBits(SubVecTy);
- bool EltIs64Bits = DL->getTypeAllocSizeInBits(EltTy) == 64;
+ const DataLayout &DL = SI->getModule()->getDataLayout();
+ unsigned SubVecSize = DL.getTypeAllocSizeInBits(SubVecTy);
+ bool EltIs64Bits = DL.getTypeAllocSizeInBits(EltTy) == 64;
// Skip illegal sub vector types and vector types of i64/f64 element (vstN
// doesn't support i64/f64 element).
@@ -11533,7 +11633,7 @@ bool ARMTargetLowering::lowerInterleavedStore(StoreInst *SI,
// StN intrinsics don't support pointer vectors as arguments. Convert pointer
// vectors to integer vectors.
if (EltTy->isPointerTy()) {
- Type *IntTy = DL->getIntPtrType(EltTy);
+ Type *IntTy = DL.getIntPtrType(EltTy);
// Convert to the corresponding integer vector.
Type *IntVecTy =
diff --git a/lib/Target/ARM/ARMISelLowering.h b/lib/Target/ARM/ARMISelLowering.h
index 74396392f8e3..efc9020c193a 100644
--- a/lib/Target/ARM/ARMISelLowering.h
+++ b/lib/Target/ARM/ARMISelLowering.h
@@ -249,7 +249,8 @@ namespace llvm {
}
/// getSetCCResultType - Return the value type to use for ISD::SETCC.
- EVT getSetCCResultType(LLVMContext &Context, EVT VT) const override;
+ EVT getSetCCResultType(const DataLayout &DL, LLVMContext &Context,
+ EVT VT) const override;
MachineBasicBlock *
EmitInstrWithCustomInserter(MachineInstr *MI,
@@ -286,8 +287,8 @@ namespace llvm {
/// isLegalAddressingMode - Return true if the addressing mode represented
/// by AM is legal for this target, for a load/store of the specified type.
- bool isLegalAddressingMode(const AddrMode &AM, Type *Ty,
- unsigned AS) const override;
+ bool isLegalAddressingMode(const DataLayout &DL, const AddrMode &AM,
+ Type *Ty, unsigned AS) const override;
bool isLegalT2ScaledAddressingMode(const AddrMode &AM, EVT VT) const;
/// isLegalICmpImmediate - Return true if the specified immediate is legal
@@ -324,8 +325,7 @@ namespace llvm {
bool ExpandInlineAsm(CallInst *CI) const override;
- ConstraintType
- getConstraintType(const std::string &Constraint) const override;
+ ConstraintType getConstraintType(StringRef Constraint) const override;
/// Examine constraint string and operand type and determine a weight value.
/// The operand object must already have been set up with the operand type.
@@ -334,8 +334,7 @@ namespace llvm {
std::pair<unsigned, const TargetRegisterClass *>
getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI,
- const std::string &Constraint,
- MVT VT) const override;
+ StringRef Constraint, MVT VT) const override;
/// LowerAsmOperandForConstraint - Lower the specified operand into the Ops
/// vector. If it is invalid, don't add anything to Ops. If hasMemory is
@@ -345,8 +344,8 @@ namespace llvm {
std::vector<SDValue> &Ops,
SelectionDAG &DAG) const override;
- unsigned getInlineAsmMemConstraint(
- const std::string &ConstraintCode) const override {
+ unsigned
+ getInlineAsmMemConstraint(StringRef ConstraintCode) const override {
if (ConstraintCode == "Q")
return InlineAsm::Constraint_Q;
else if (ConstraintCode.size() == 2) {
@@ -533,7 +532,8 @@ namespace llvm {
SDValue LowerFP_TO_INT(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerINT_TO_FP(SDValue Op, SelectionDAG &DAG) const;
- unsigned getRegisterByName(const char* RegName, EVT VT) const override;
+ unsigned getRegisterByName(const char* RegName, EVT VT,
+ SelectionDAG &DAG) const override;
/// isFMAFasterThanFMulAndFAdd - Return true if an FMA operation is faster
/// than a pair of fmul and fadd instructions. fmuladd intrinsics will be
diff --git a/lib/Target/ARM/ARMInstrInfo.td b/lib/Target/ARM/ARMInstrInfo.td
index b8cac135baf6..61c45af26fe1 100644
--- a/lib/Target/ARM/ARMInstrInfo.td
+++ b/lib/Target/ARM/ARMInstrInfo.td
@@ -306,8 +306,8 @@ def HasSlowVDUP32 : Predicate<"Subtarget->isSwift()">;
def UseVMOVSR : Predicate<"Subtarget->isCortexA9() || !Subtarget->useNEONForSinglePrecisionFP()">;
def DontUseVMOVSR : Predicate<"!Subtarget->isCortexA9() && Subtarget->useNEONForSinglePrecisionFP()">;
-def IsLE : Predicate<"getTargetLowering()->isLittleEndian()">;
-def IsBE : Predicate<"getTargetLowering()->isBigEndian()">;
+def IsLE : Predicate<"MF->getDataLayout().isLittleEndian()">;
+def IsBE : Predicate<"MF->getDataLayout().isBigEndian()">;
//===----------------------------------------------------------------------===//
// ARM Flag Definitions.
diff --git a/lib/Target/ARM/ARMLoadStoreOptimizer.cpp b/lib/Target/ARM/ARMLoadStoreOptimizer.cpp
index 245c9e869bf6..37352810c99f 100644
--- a/lib/Target/ARM/ARMLoadStoreOptimizer.cpp
+++ b/lib/Target/ARM/ARMLoadStoreOptimizer.cpp
@@ -31,11 +31,13 @@
#include "llvm/CodeGen/MachineInstr.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
-#include "llvm/CodeGen/RegisterScavenging.h"
+#include "llvm/CodeGen/RegisterClassInfo.h"
#include "llvm/CodeGen/SelectionDAGNodes.h"
+#include "llvm/CodeGen/LivePhysRegs.h"
#include "llvm/IR/DataLayout.h"
#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/Function.h"
+#include "llvm/Support/Allocator.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/raw_ostream.h"
@@ -65,12 +67,18 @@ namespace {
static char ID;
ARMLoadStoreOpt() : MachineFunctionPass(ID) {}
+ const MachineFunction *MF;
const TargetInstrInfo *TII;
const TargetRegisterInfo *TRI;
+ const MachineRegisterInfo *MRI;
const ARMSubtarget *STI;
const TargetLowering *TL;
ARMFunctionInfo *AFI;
- RegScavenger *RS;
+ LivePhysRegs LiveRegs;
+ RegisterClassInfo RegClassInfo;
+ MachineBasicBlock::const_iterator LiveRegPos;
+ bool LiveRegsValid;
+ bool RegClassInfoValid;
bool isThumb1, isThumb2;
bool runOnMachineFunction(MachineFunction &Fn) override;
@@ -80,64 +88,60 @@ namespace {
}
private:
+ /// A set of load/store MachineInstrs with same base register sorted by
+ /// offset.
struct MemOpQueueEntry {
- int Offset;
- unsigned Reg;
- bool isKill;
- unsigned Position;
- MachineBasicBlock::iterator MBBI;
- bool Merged;
- MemOpQueueEntry(int o, unsigned r, bool k, unsigned p,
- MachineBasicBlock::iterator i)
- : Offset(o), Reg(r), isKill(k), Position(p), MBBI(i), Merged(false) {}
+ MachineInstr *MI;
+ int Offset; ///< Load/Store offset.
+ unsigned Position; ///< Position as counted from end of basic block.
+ MemOpQueueEntry(MachineInstr *MI, int Offset, unsigned Position)
+ : MI(MI), Offset(Offset), Position(Position) {}
};
typedef SmallVector<MemOpQueueEntry,8> MemOpQueue;
- typedef MemOpQueue::iterator MemOpQueueIter;
- void findUsesOfImpDef(SmallVectorImpl<MachineOperand *> &UsesOfImpDefs,
- const MemOpQueue &MemOps, unsigned DefReg,
- unsigned RangeBegin, unsigned RangeEnd);
+ /// A set of MachineInstrs that fulfill (nearly all) conditions to get
+ /// merged into a LDM/STM.
+ struct MergeCandidate {
+ /// List of instructions ordered by load/store offset.
+ SmallVector<MachineInstr*, 4> Instrs;
+ /// Index in Instrs of the instruction being latest in the schedule.
+ unsigned LatestMIIdx;
+ /// Index in Instrs of the instruction being earliest in the schedule.
+ unsigned EarliestMIIdx;
+ /// Index into the basic block where the merged instruction will be
+ /// inserted. (See MemOpQueueEntry.Position)
+ unsigned InsertPos;
+ /// Whether the instructions can be merged into a ldm/stm instruction.
+ bool CanMergeToLSMulti;
+ /// Whether the instructions can be merged into a ldrd/strd instruction.
+ bool CanMergeToLSDouble;
+ };
+ SpecificBumpPtrAllocator<MergeCandidate> Allocator;
+ SmallVector<const MergeCandidate*,4> Candidates;
+ SmallVector<MachineInstr*,4> MergeBaseCandidates;
+
+ void moveLiveRegsBefore(const MachineBasicBlock &MBB,
+ MachineBasicBlock::const_iterator Before);
+ unsigned findFreeReg(const TargetRegisterClass &RegClass);
void UpdateBaseRegUses(MachineBasicBlock &MBB,
MachineBasicBlock::iterator MBBI,
- DebugLoc dl, unsigned Base, unsigned WordOffset,
+ DebugLoc DL, unsigned Base, unsigned WordOffset,
ARMCC::CondCodes Pred, unsigned PredReg);
- bool MergeOps(MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI,
- int Offset, unsigned Base, bool BaseKill, unsigned Opcode,
- ARMCC::CondCodes Pred, unsigned PredReg, unsigned Scratch,
- DebugLoc dl,
- ArrayRef<std::pair<unsigned, bool> > Regs,
- ArrayRef<unsigned> ImpDefs);
- void MergeOpsUpdate(MachineBasicBlock &MBB,
- MemOpQueue &MemOps,
- unsigned memOpsBegin,
- unsigned memOpsEnd,
- unsigned insertAfter,
- int Offset,
- unsigned Base,
- bool BaseKill,
- unsigned Opcode,
- ARMCC::CondCodes Pred,
- unsigned PredReg,
- unsigned Scratch,
- DebugLoc dl,
- SmallVectorImpl<MachineBasicBlock::iterator> &Merges);
- void MergeLDR_STR(MachineBasicBlock &MBB, unsigned SIndex, unsigned Base,
- unsigned Opcode, unsigned Size,
- ARMCC::CondCodes Pred, unsigned PredReg,
- unsigned Scratch, MemOpQueue &MemOps,
- SmallVectorImpl<MachineBasicBlock::iterator> &Merges);
- void AdvanceRS(MachineBasicBlock &MBB, MemOpQueue &MemOps);
+ MachineInstr *CreateLoadStoreMulti(MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator InsertBefore, int Offset, unsigned Base,
+ bool BaseKill, unsigned Opcode, ARMCC::CondCodes Pred, unsigned PredReg,
+ DebugLoc DL, ArrayRef<std::pair<unsigned, bool>> Regs);
+ MachineInstr *CreateLoadStoreDouble(MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator InsertBefore, int Offset, unsigned Base,
+ bool BaseKill, unsigned Opcode, ARMCC::CondCodes Pred, unsigned PredReg,
+ DebugLoc DL, ArrayRef<std::pair<unsigned, bool>> Regs) const;
+ void FormCandidates(const MemOpQueue &MemOps);
+ MachineInstr *MergeOpsUpdate(const MergeCandidate &Cand);
bool FixInvalidRegPairOp(MachineBasicBlock &MBB,
MachineBasicBlock::iterator &MBBI);
- bool MergeBaseUpdateLoadStore(MachineBasicBlock &MBB,
- MachineBasicBlock::iterator MBBI,
- const TargetInstrInfo *TII,
- bool &Advance,
- MachineBasicBlock::iterator &I);
- bool MergeBaseUpdateLSMultiple(MachineBasicBlock &MBB,
- MachineBasicBlock::iterator MBBI,
- bool &Advance,
- MachineBasicBlock::iterator &I);
+ bool MergeBaseUpdateLoadStore(MachineInstr *MI);
+ bool MergeBaseUpdateLSMultiple(MachineInstr *MI);
+ bool MergeBaseUpdateLSDouble(MachineInstr &MI) const;
bool LoadStoreMultipleOpti(MachineBasicBlock &MBB);
bool MergeReturnIntoLDM(MachineBasicBlock &MBB);
};
@@ -185,6 +189,14 @@ static int getMemoryOpOffset(const MachineInstr *MI) {
return Offset;
}
+static const MachineOperand &getLoadStoreBaseOp(const MachineInstr &MI) {
+ return MI.getOperand(1);
+}
+
+static const MachineOperand &getLoadStoreRegOp(const MachineInstr &MI) {
+ return MI.getOperand(0);
+}
+
static int getLoadStoreMultipleOpcode(unsigned Opcode, ARM_AM::AMSubMode Mode) {
switch (Opcode) {
default: llvm_unreachable("Unhandled opcode!");
@@ -348,6 +360,10 @@ static bool isi32Store(unsigned Opc) {
return Opc == ARM::STRi12 || isT1i32Store(Opc) || isT2i32Store(Opc);
}
+static bool isLoadSingle(unsigned Opc) {
+ return isi32Load(Opc) || Opc == ARM::VLDRS || Opc == ARM::VLDRD;
+}
+
static unsigned getImmScale(unsigned Opc) {
switch (Opc) {
default: llvm_unreachable("Unhandled opcode!");
@@ -365,12 +381,55 @@ static unsigned getImmScale(unsigned Opc) {
}
}
+static unsigned getLSMultipleTransferSize(const MachineInstr *MI) {
+ switch (MI->getOpcode()) {
+ default: return 0;
+ case ARM::LDRi12:
+ case ARM::STRi12:
+ case ARM::tLDRi:
+ case ARM::tSTRi:
+ case ARM::tLDRspi:
+ case ARM::tSTRspi:
+ case ARM::t2LDRi8:
+ case ARM::t2LDRi12:
+ case ARM::t2STRi8:
+ case ARM::t2STRi12:
+ case ARM::VLDRS:
+ case ARM::VSTRS:
+ return 4;
+ case ARM::VLDRD:
+ case ARM::VSTRD:
+ return 8;
+ case ARM::LDMIA:
+ case ARM::LDMDA:
+ case ARM::LDMDB:
+ case ARM::LDMIB:
+ case ARM::STMIA:
+ case ARM::STMDA:
+ case ARM::STMDB:
+ case ARM::STMIB:
+ case ARM::tLDMIA:
+ case ARM::tLDMIA_UPD:
+ case ARM::tSTMIA_UPD:
+ case ARM::t2LDMIA:
+ case ARM::t2LDMDB:
+ case ARM::t2STMIA:
+ case ARM::t2STMDB:
+ case ARM::VLDMSIA:
+ case ARM::VSTMSIA:
+ return (MI->getNumOperands() - MI->getDesc().getNumOperands() + 1) * 4;
+ case ARM::VLDMDIA:
+ case ARM::VSTMDIA:
+ return (MI->getNumOperands() - MI->getDesc().getNumOperands() + 1) * 8;
+ }
+}
+
/// Update future uses of the base register with the offset introduced
/// due to writeback. This function only works on Thumb1.
void
ARMLoadStoreOpt::UpdateBaseRegUses(MachineBasicBlock &MBB,
MachineBasicBlock::iterator MBBI,
- DebugLoc dl, unsigned Base,
+ DebugLoc DL, unsigned Base,
unsigned WordOffset,
ARMCC::CondCodes Pred, unsigned PredReg) {
assert(isThumb1 && "Can only update base register uses for Thumb1!");
@@ -398,7 +457,7 @@ ARMLoadStoreOpt::UpdateBaseRegUses(MachineBasicBlock &MBB,
Offset = MO.getImm() - WordOffset * getImmScale(Opc);
// If storing the base register, it needs to be reset first.
- unsigned InstrSrcReg = MBBI->getOperand(0).getReg();
+ unsigned InstrSrcReg = getLoadStoreRegOp(*MBBI).getReg();
if (Offset >= 0 && !(IsStore && InstrSrcReg == Base))
MO.setImm(Offset);
@@ -439,7 +498,7 @@ ARMLoadStoreOpt::UpdateBaseRegUses(MachineBasicBlock &MBB,
if (InsertSub) {
// An instruction above couldn't be updated, so insert a sub.
- AddDefaultT1CC(BuildMI(MBB, MBBI, dl, TII->get(ARM::tSUBi8), Base), true)
+ AddDefaultT1CC(BuildMI(MBB, MBBI, DL, TII->get(ARM::tSUBi8), Base), true)
.addReg(Base).addImm(WordOffset * 4).addImm(Pred).addReg(PredReg);
return;
}
@@ -457,31 +516,65 @@ ARMLoadStoreOpt::UpdateBaseRegUses(MachineBasicBlock &MBB,
// See PR21029.
if (MBBI != MBB.end()) --MBBI;
AddDefaultT1CC(
- BuildMI(MBB, MBBI, dl, TII->get(ARM::tSUBi8), Base), true)
+ BuildMI(MBB, MBBI, DL, TII->get(ARM::tSUBi8), Base), true)
.addReg(Base).addImm(WordOffset * 4).addImm(Pred).addReg(PredReg);
}
}
+/// Return the first register of class \p RegClass that is not in \p Regs.
+unsigned ARMLoadStoreOpt::findFreeReg(const TargetRegisterClass &RegClass) {
+ if (!RegClassInfoValid) {
+ RegClassInfo.runOnMachineFunction(*MF);
+ RegClassInfoValid = true;
+ }
+
+ for (unsigned Reg : RegClassInfo.getOrder(&RegClass))
+ if (!LiveRegs.contains(Reg))
+ return Reg;
+ return 0;
+}
+
+/// Compute live registers just before instruction \p Before (in normal schedule
+/// direction). Computes backwards so multiple queries in the same block must
+/// come in reverse order.
+void ARMLoadStoreOpt::moveLiveRegsBefore(const MachineBasicBlock &MBB,
+ MachineBasicBlock::const_iterator Before) {
+ // Initialize if we never queried in this block.
+ if (!LiveRegsValid) {
+ LiveRegs.init(TRI);
+ LiveRegs.addLiveOuts(&MBB, true);
+ LiveRegPos = MBB.end();
+ LiveRegsValid = true;
+ }
+ // Move backward just before the "Before" position.
+ while (LiveRegPos != Before) {
+ --LiveRegPos;
+ LiveRegs.stepBackward(*LiveRegPos);
+ }
+}
+
+static bool ContainsReg(const ArrayRef<std::pair<unsigned, bool>> &Regs,
+ unsigned Reg) {
+ for (const std::pair<unsigned, bool> &R : Regs)
+ if (R.first == Reg)
+ return true;
+ return false;
+}
+
/// Create and insert a LDM or STM with Base as base register and registers in
/// Regs as the register operands that would be loaded / stored. It returns
/// true if the transformation is done.
-bool
-ARMLoadStoreOpt::MergeOps(MachineBasicBlock &MBB,
- MachineBasicBlock::iterator MBBI,
- int Offset, unsigned Base, bool BaseKill,
- unsigned Opcode, ARMCC::CondCodes Pred,
- unsigned PredReg, unsigned Scratch, DebugLoc dl,
- ArrayRef<std::pair<unsigned, bool> > Regs,
- ArrayRef<unsigned> ImpDefs) {
- // Only a single register to load / store. Don't bother.
+MachineInstr *ARMLoadStoreOpt::CreateLoadStoreMulti(MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator InsertBefore, int Offset, unsigned Base,
+ bool BaseKill, unsigned Opcode, ARMCC::CondCodes Pred, unsigned PredReg,
+ DebugLoc DL, ArrayRef<std::pair<unsigned, bool>> Regs) {
unsigned NumRegs = Regs.size();
- if (NumRegs <= 1)
- return false;
+ assert(NumRegs > 1);
// For Thumb1 targets, it might be necessary to clobber the CPSR to merge.
// Compute liveness information for that register to make the decision.
bool SafeToClobberCPSR = !isThumb1 ||
- (MBB.computeRegisterLiveness(TRI, ARM::CPSR, std::prev(MBBI), 15) ==
+ (MBB.computeRegisterLiveness(TRI, ARM::CPSR, InsertBefore, 20) ==
MachineBasicBlock::LQR_Dead);
bool Writeback = isThumb1; // Thumb1 LDM/STM have base reg writeback.
@@ -489,17 +582,14 @@ ARMLoadStoreOpt::MergeOps(MachineBasicBlock &MBB,
// Exception: If the base register is in the input reglist, Thumb1 LDM is
// non-writeback.
// It's also not possible to merge an STR of the base register in Thumb1.
- if (isThumb1)
- for (const std::pair<unsigned, bool> &R : Regs)
- if (Base == R.first) {
- assert(Base != ARM::SP && "Thumb1 does not allow SP in register list");
- if (Opcode == ARM::tLDRi) {
- Writeback = false;
- break;
- } else if (Opcode == ARM::tSTRi) {
- return false;
- }
- }
+ if (isThumb1 && isi32Load(Opcode) && ContainsReg(Regs, Base)) {
+ assert(Base != ARM::SP && "Thumb1 does not allow SP in register list");
+ if (Opcode == ARM::tLDRi) {
+ Writeback = false;
+ } else if (Opcode == ARM::tSTRi) {
+ return nullptr;
+ }
+ }
ARM_AM::AMSubMode Mode = ARM_AM::ia;
// VFP and Thumb2 do not support IB or DA modes. Thumb1 only supports IA.
@@ -516,18 +606,18 @@ ARMLoadStoreOpt::MergeOps(MachineBasicBlock &MBB,
} else if (Offset != 0 || Opcode == ARM::tLDRspi || Opcode == ARM::tSTRspi) {
// Check if this is a supported opcode before inserting instructions to
// calculate a new base register.
- if (!getLoadStoreMultipleOpcode(Opcode, Mode)) return false;
+ if (!getLoadStoreMultipleOpcode(Opcode, Mode)) return nullptr;
// If starting offset isn't zero, insert a MI to materialize a new base.
// But only do so if it is cost effective, i.e. merging more than two
// loads / stores.
if (NumRegs <= 2)
- return false;
+ return nullptr;
// On Thumb1, it's not worth materializing a new base register without
// clobbering the CPSR (i.e. not using ADDS/SUBS).
if (!SafeToClobberCPSR)
- return false;
+ return nullptr;
unsigned NewBase;
if (isi32Load(Opcode)) {
@@ -535,10 +625,17 @@ ARMLoadStoreOpt::MergeOps(MachineBasicBlock &MBB,
// use as the new base.
NewBase = Regs[NumRegs-1].first;
} else {
- // Use the scratch register to use as a new base.
- NewBase = Scratch;
+ // Find a free register that we can use as scratch register.
+ moveLiveRegsBefore(MBB, InsertBefore);
+ // The merged instruction does not exist yet but will use several Regs if
+ // it is a Store.
+ if (!isLoadSingle(Opcode))
+ for (const std::pair<unsigned, bool> &R : Regs)
+ LiveRegs.addReg(R.first);
+
+ NewBase = findFreeReg(isThumb1 ? ARM::tGPRRegClass : ARM::GPRRegClass);
if (NewBase == 0)
- return false;
+ return nullptr;
}
int BaseOpc =
@@ -557,7 +654,12 @@ ARMLoadStoreOpt::MergeOps(MachineBasicBlock &MBB,
if (!TL->isLegalAddImmediate(Offset))
// FIXME: Try add with register operand?
- return false; // Probably not worth it then.
+ return nullptr; // Probably not worth it then.
+
+ // We can only append a kill flag to the add/sub input if the value is not
+ // used in the register list of the stm as well.
+ bool KillOldBase = BaseKill &&
+ (!isi32Store(Opcode) || !ContainsReg(Regs, Base));
if (isThumb1) {
// Thumb1: depending on immediate size, use either
@@ -572,43 +674,44 @@ ARMLoadStoreOpt::MergeOps(MachineBasicBlock &MBB,
!STI->hasV6Ops()) {
// thumbv4t doesn't have lo->lo copies, and we can't predicate tMOVSr
if (Pred != ARMCC::AL)
- return false;
- BuildMI(MBB, MBBI, dl, TII->get(ARM::tMOVSr), NewBase)
- .addReg(Base, getKillRegState(BaseKill));
+ return nullptr;
+ BuildMI(MBB, InsertBefore, DL, TII->get(ARM::tMOVSr), NewBase)
+ .addReg(Base, getKillRegState(KillOldBase));
} else
- BuildMI(MBB, MBBI, dl, TII->get(ARM::tMOVr), NewBase)
- .addReg(Base, getKillRegState(BaseKill))
+ BuildMI(MBB, InsertBefore, DL, TII->get(ARM::tMOVr), NewBase)
+ .addReg(Base, getKillRegState(KillOldBase))
.addImm(Pred).addReg(PredReg);
- // Set up BaseKill and Base correctly to insert the ADDS/SUBS below.
+ // The following ADDS/SUBS becomes an update.
Base = NewBase;
- BaseKill = false;
+ KillOldBase = true;
}
if (BaseOpc == ARM::tADDrSPi) {
assert(Offset % 4 == 0 && "tADDrSPi offset is scaled by 4");
- BuildMI(MBB, MBBI, dl, TII->get(BaseOpc), NewBase)
- .addReg(Base, getKillRegState(BaseKill)).addImm(Offset/4)
+ BuildMI(MBB, InsertBefore, DL, TII->get(BaseOpc), NewBase)
+ .addReg(Base, getKillRegState(KillOldBase)).addImm(Offset/4)
.addImm(Pred).addReg(PredReg);
} else
- AddDefaultT1CC(BuildMI(MBB, MBBI, dl, TII->get(BaseOpc), NewBase), true)
- .addReg(Base, getKillRegState(BaseKill)).addImm(Offset)
+ AddDefaultT1CC(
+ BuildMI(MBB, InsertBefore, DL, TII->get(BaseOpc), NewBase), true)
+ .addReg(Base, getKillRegState(KillOldBase)).addImm(Offset)
.addImm(Pred).addReg(PredReg);
} else {
- BuildMI(MBB, MBBI, dl, TII->get(BaseOpc), NewBase)
- .addReg(Base, getKillRegState(BaseKill)).addImm(Offset)
+ BuildMI(MBB, InsertBefore, DL, TII->get(BaseOpc), NewBase)
+ .addReg(Base, getKillRegState(KillOldBase)).addImm(Offset)
.addImm(Pred).addReg(PredReg).addReg(0);
}
Base = NewBase;
BaseKill = true; // New base is always killed straight away.
}
- bool isDef = (isi32Load(Opcode) || Opcode == ARM::VLDRS ||
- Opcode == ARM::VLDRD);
+ bool isDef = isLoadSingle(Opcode);
// Get LS multiple opcode. Note that for Thumb1 this might be an opcode with
// base register writeback.
Opcode = getLoadStoreMultipleOpcode(Opcode, Mode);
- if (!Opcode) return false;
+ if (!Opcode)
+ return nullptr;
// Check if a Thumb1 LDM/STM merge is safe. This is the case if:
// - There is no writeback (LDM of base register),
@@ -619,7 +722,7 @@ ARMLoadStoreOpt::MergeOps(MachineBasicBlock &MBB,
// It's safe to return here since the code to materialize a new base register
// above is also conditional on SafeToClobberCPSR.
if (isThumb1 && !SafeToClobberCPSR && Writeback && !BaseKill)
- return false;
+ return nullptr;
MachineInstrBuilder MIB;
@@ -628,7 +731,7 @@ ARMLoadStoreOpt::MergeOps(MachineBasicBlock &MBB,
// Update tLDMIA with writeback if necessary.
Opcode = ARM::tLDMIA_UPD;
- MIB = BuildMI(MBB, MBBI, dl, TII->get(Opcode));
+ MIB = BuildMI(MBB, InsertBefore, DL, TII->get(Opcode));
// Thumb1: we might need to set base writeback when building the MI.
MIB.addReg(Base, getDefRegState(true))
@@ -637,381 +740,257 @@ ARMLoadStoreOpt::MergeOps(MachineBasicBlock &MBB,
// The base isn't dead after a merged instruction with writeback.
// Insert a sub instruction after the newly formed instruction to reset.
if (!BaseKill)
- UpdateBaseRegUses(MBB, MBBI, dl, Base, NumRegs, Pred, PredReg);
+ UpdateBaseRegUses(MBB, InsertBefore, DL, Base, NumRegs, Pred, PredReg);
} else {
// No writeback, simply build the MachineInstr.
- MIB = BuildMI(MBB, MBBI, dl, TII->get(Opcode));
+ MIB = BuildMI(MBB, InsertBefore, DL, TII->get(Opcode));
MIB.addReg(Base, getKillRegState(BaseKill));
}
MIB.addImm(Pred).addReg(PredReg);
for (const std::pair<unsigned, bool> &R : Regs)
- MIB = MIB.addReg(R.first, getDefRegState(isDef)
- | getKillRegState(R.second));
+ MIB.addReg(R.first, getDefRegState(isDef) | getKillRegState(R.second));
- // Add implicit defs for super-registers.
- for (unsigned ImpDef : ImpDefs)
- MIB.addReg(ImpDef, RegState::ImplicitDefine);
-
- return true;
+ return MIB.getInstr();
}
-/// Find all instructions using a given imp-def within a range.
-///
-/// We are trying to combine a range of instructions, one of which (located at
-/// position RangeBegin) implicitly defines a register. The final LDM/STM will
-/// be placed at RangeEnd, and so any uses of this definition between RangeStart
-/// and RangeEnd must be modified to use an undefined value.
-///
-/// The live range continues until we find a second definition or one of the
-/// uses we find is a kill. Unfortunately MemOps is not sorted by Position, so
-/// we must consider all uses and decide which are relevant in a second pass.
-void ARMLoadStoreOpt::findUsesOfImpDef(
- SmallVectorImpl<MachineOperand *> &UsesOfImpDefs, const MemOpQueue &MemOps,
- unsigned DefReg, unsigned RangeBegin, unsigned RangeEnd) {
- std::map<unsigned, MachineOperand *> Uses;
- unsigned LastLivePos = RangeEnd;
-
- // First we find all uses of this register with Position between RangeBegin
- // and RangeEnd, any or all of these could be uses of a definition at
- // RangeBegin. We also record the latest position a definition at RangeBegin
- // would be considered live.
- for (unsigned i = 0; i < MemOps.size(); ++i) {
- MachineInstr &MI = *MemOps[i].MBBI;
- unsigned MIPosition = MemOps[i].Position;
- if (MIPosition <= RangeBegin || MIPosition > RangeEnd)
- continue;
-
- // If this instruction defines the register, then any later use will be of
- // that definition rather than ours.
- if (MI.definesRegister(DefReg))
- LastLivePos = std::min(LastLivePos, MIPosition);
-
- MachineOperand *UseOp = MI.findRegisterUseOperand(DefReg);
- if (!UseOp)
- continue;
-
- // If this instruction kills the register then (assuming liveness is
- // correct when we start) we don't need to think about anything after here.
- if (UseOp->isKill())
- LastLivePos = std::min(LastLivePos, MIPosition);
-
- Uses[MIPosition] = UseOp;
- }
-
- // Now we traverse the list of all uses, and append the ones that actually use
- // our definition to the requested list.
- for (std::map<unsigned, MachineOperand *>::iterator I = Uses.begin(),
- E = Uses.end();
- I != E; ++I) {
- // List is sorted by position so once we've found one out of range there
- // will be no more to consider.
- if (I->first > LastLivePos)
- break;
- UsesOfImpDefs.push_back(I->second);
+MachineInstr *ARMLoadStoreOpt::CreateLoadStoreDouble(MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator InsertBefore, int Offset, unsigned Base,
+ bool BaseKill, unsigned Opcode, ARMCC::CondCodes Pred, unsigned PredReg,
+ DebugLoc DL, ArrayRef<std::pair<unsigned, bool>> Regs) const {
+ bool IsLoad = isi32Load(Opcode);
+ assert((IsLoad || isi32Store(Opcode)) && "Must have integer load or store");
+ unsigned LoadStoreOpcode = IsLoad ? ARM::t2LDRDi8 : ARM::t2STRDi8;
+
+ assert(Regs.size() == 2);
+ MachineInstrBuilder MIB = BuildMI(MBB, InsertBefore, DL,
+ TII->get(LoadStoreOpcode));
+ if (IsLoad) {
+ MIB.addReg(Regs[0].first, RegState::Define)
+ .addReg(Regs[1].first, RegState::Define);
+ } else {
+ MIB.addReg(Regs[0].first, getKillRegState(Regs[0].second))
+ .addReg(Regs[1].first, getKillRegState(Regs[1].second));
}
+ MIB.addReg(Base).addImm(Offset).addImm(Pred).addReg(PredReg);
+ return MIB.getInstr();
}
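// For illustration only: with example registers Regs = {r4, r5} and Offset ==
// 8 on a load, the code above emits "t2LDRDi8 r4, r5, [Base, #8]" followed by
// the predicate operands.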
/// Call MergeOps and update MemOps and merges accordingly on success.
-void ARMLoadStoreOpt::MergeOpsUpdate(MachineBasicBlock &MBB,
- MemOpQueue &memOps,
- unsigned memOpsBegin, unsigned memOpsEnd,
- unsigned insertAfter, int Offset,
- unsigned Base, bool BaseKill,
- unsigned Opcode,
- ARMCC::CondCodes Pred, unsigned PredReg,
- unsigned Scratch,
- DebugLoc dl,
- SmallVectorImpl<MachineBasicBlock::iterator> &Merges) {
- // First calculate which of the registers should be killed by the merged
- // instruction.
- const unsigned insertPos = memOps[insertAfter].Position;
- SmallSet<unsigned, 4> KilledRegs;
- DenseMap<unsigned, unsigned> Killer;
- for (unsigned i = 0, e = memOps.size(); i != e; ++i) {
- if (i == memOpsBegin) {
- i = memOpsEnd;
- if (i == e)
- break;
- }
- if (memOps[i].Position < insertPos && memOps[i].isKill) {
- unsigned Reg = memOps[i].Reg;
+MachineInstr *ARMLoadStoreOpt::MergeOpsUpdate(const MergeCandidate &Cand) {
+ const MachineInstr *First = Cand.Instrs.front();
+ unsigned Opcode = First->getOpcode();
+ bool IsLoad = isLoadSingle(Opcode);
+ SmallVector<std::pair<unsigned, bool>, 8> Regs;
+ SmallVector<unsigned, 4> ImpDefs;
+ DenseSet<unsigned> KilledRegs;
+ // Determine list of registers and list of implicit super-register defs.
+ for (const MachineInstr *MI : Cand.Instrs) {
+ const MachineOperand &MO = getLoadStoreRegOp(*MI);
+ unsigned Reg = MO.getReg();
+ bool IsKill = MO.isKill();
+ if (IsKill)
KilledRegs.insert(Reg);
- Killer[Reg] = i;
+ Regs.push_back(std::make_pair(Reg, IsKill));
+
+ if (IsLoad) {
+ // Collect any implicit defs of super-registers; after merging we can no
+ // longer be sure that we properly preserved these live ranges and must
+ // remove these implicit operands.
+ for (const MachineOperand &MO : MI->implicit_operands()) {
+ if (!MO.isReg() || !MO.isDef() || MO.isDead())
+ continue;
+ assert(MO.isImplicit());
+ unsigned DefReg = MO.getReg();
+
+ if (std::find(ImpDefs.begin(), ImpDefs.end(), DefReg) != ImpDefs.end())
+ continue;
+ // We can ignore cases where the super-reg is read and written.
+ if (MI->readsRegister(DefReg))
+ continue;
+ ImpDefs.push_back(DefReg);
+ }
}
}
- for (unsigned i = memOpsBegin; i < memOpsEnd; ++i) {
- MachineOperand &TransferOp = memOps[i].MBBI->getOperand(0);
- if (TransferOp.isUse() && TransferOp.getReg() == Base)
- BaseKill = false;
+ // Attempt the merge.
+ typedef MachineBasicBlock::iterator iterator;
+ MachineInstr *LatestMI = Cand.Instrs[Cand.LatestMIIdx];
+ iterator InsertBefore = std::next(iterator(LatestMI));
+ MachineBasicBlock &MBB = *LatestMI->getParent();
+ unsigned Offset = getMemoryOpOffset(First);
+ unsigned Base = getLoadStoreBaseOp(*First).getReg();
+ bool BaseKill = LatestMI->killsRegister(Base);
+ unsigned PredReg = 0;
+ ARMCC::CondCodes Pred = getInstrPredicate(First, PredReg);
+ DebugLoc DL = First->getDebugLoc();
+ MachineInstr *Merged = nullptr;
+ if (Cand.CanMergeToLSDouble)
+ Merged = CreateLoadStoreDouble(MBB, InsertBefore, Offset, Base, BaseKill,
+ Opcode, Pred, PredReg, DL, Regs);
+ if (!Merged && Cand.CanMergeToLSMulti)
+ Merged = CreateLoadStoreMulti(MBB, InsertBefore, Offset, Base, BaseKill,
+ Opcode, Pred, PredReg, DL, Regs);
+ if (!Merged)
+ return nullptr;
+
+ // Determine the earliest instruction that will get removed. We then keep an
+ // iterator just above it so the following erases don't invalidate it.
+ iterator EarliestI(Cand.Instrs[Cand.EarliestMIIdx]);
+ bool EarliestAtBegin = false;
+ if (EarliestI == MBB.begin()) {
+ EarliestAtBegin = true;
+ } else {
+ EarliestI = std::prev(EarliestI);
}
- SmallVector<std::pair<unsigned, bool>, 8> Regs;
- SmallVector<unsigned, 8> ImpDefs;
- SmallVector<MachineOperand *, 8> UsesOfImpDefs;
- for (unsigned i = memOpsBegin; i < memOpsEnd; ++i) {
- unsigned Reg = memOps[i].Reg;
- // If we are inserting the merged operation after an operation that
- // uses the same register, make sure to transfer any kill flag.
- bool isKill = memOps[i].isKill || KilledRegs.count(Reg);
- Regs.push_back(std::make_pair(Reg, isKill));
-
- // Collect any implicit defs of super-registers. They must be preserved.
- for (const MachineOperand &MO : memOps[i].MBBI->operands()) {
- if (!MO.isReg() || !MO.isDef() || !MO.isImplicit() || MO.isDead())
- continue;
- unsigned DefReg = MO.getReg();
- if (std::find(ImpDefs.begin(), ImpDefs.end(), DefReg) == ImpDefs.end())
- ImpDefs.push_back(DefReg);
-
- // There may be other uses of the definition between this instruction and
- // the eventual LDM/STM position. These should be marked undef if the
- // merge takes place.
- findUsesOfImpDef(UsesOfImpDefs, memOps, DefReg, memOps[i].Position,
- insertPos);
+ // Remove instructions which have been merged.
+ for (MachineInstr *MI : Cand.Instrs)
+ MBB.erase(MI);
+
+ // Determine range between the earliest removed instruction and the new one.
+ if (EarliestAtBegin)
+ EarliestI = MBB.begin();
+ else
+ EarliestI = std::next(EarliestI);
+ auto FixupRange = make_range(EarliestI, iterator(Merged));
+
+ if (isLoadSingle(Opcode)) {
+ // If the previous loads defined a super-reg, then we have to mark earlier
+ // operands undef; replicate the super-reg def on the merged instruction.
+ for (MachineInstr &MI : FixupRange) {
+ for (unsigned &ImpDefReg : ImpDefs) {
+ for (MachineOperand &MO : MI.implicit_operands()) {
+ if (!MO.isReg() || MO.getReg() != ImpDefReg)
+ continue;
+ if (MO.readsReg())
+ MO.setIsUndef();
+ else if (MO.isDef())
+ ImpDefReg = 0;
+ }
+ }
}
- }
- // Try to do the merge.
- MachineBasicBlock::iterator Loc = memOps[insertAfter].MBBI;
- ++Loc;
- if (!MergeOps(MBB, Loc, Offset, Base, BaseKill, Opcode,
- Pred, PredReg, Scratch, dl, Regs, ImpDefs))
- return;
-
- // Merge succeeded, update records.
- Merges.push_back(std::prev(Loc));
-
- // In gathering loads together, we may have moved the imp-def of a register
- // past one of its uses. This is OK, since we know better than the rest of
- // LLVM what's OK with ARM loads and stores; but we still have to adjust the
- // affected uses.
- for (SmallVectorImpl<MachineOperand *>::iterator I = UsesOfImpDefs.begin(),
- E = UsesOfImpDefs.end();
- I != E; ++I)
- (*I)->setIsUndef();
-
- for (unsigned i = memOpsBegin; i < memOpsEnd; ++i) {
- // Remove kill flags from any memops that come before insertPos.
- if (Regs[i-memOpsBegin].second) {
- unsigned Reg = Regs[i-memOpsBegin].first;
- if (KilledRegs.count(Reg)) {
- unsigned j = Killer[Reg];
- int Idx = memOps[j].MBBI->findRegisterUseOperandIdx(Reg, true);
- assert(Idx >= 0 && "Cannot find killing operand");
- memOps[j].MBBI->getOperand(Idx).setIsKill(false);
- memOps[j].isKill = false;
+ MachineInstrBuilder MIB(*Merged->getParent()->getParent(), Merged);
+ for (unsigned ImpDef : ImpDefs)
+ MIB.addReg(ImpDef, RegState::ImplicitDefine);
+ } else {
+ // Remove kill flags; the values may now be stored at a later point.
+ assert(isi32Store(Opcode) || Opcode == ARM::VSTRS || Opcode == ARM::VSTRD);
+ for (MachineInstr &MI : FixupRange) {
+ for (MachineOperand &MO : MI.uses()) {
+ if (!MO.isReg() || !MO.isKill())
+ continue;
+ if (KilledRegs.count(MO.getReg()))
+ MO.setIsKill(false);
}
- memOps[i].isKill = true;
}
- MBB.erase(memOps[i].MBBI);
- // Update this memop to refer to the merged instruction.
- // We may need to move kill flags again.
- memOps[i].Merged = true;
- memOps[i].MBBI = Merges.back();
- memOps[i].Position = insertPos;
+ assert(ImpDefs.empty());
}
- // Update memOps offsets, since they may have been modified by MergeOps.
- for (auto &MemOp : memOps) {
- MemOp.Offset = getMemoryOpOffset(MemOp.MBBI);
- }
+ return Merged;
}
-/// Merge a number of load / store instructions into one or more load / store
-/// multiple instructions.
-void
-ARMLoadStoreOpt::MergeLDR_STR(MachineBasicBlock &MBB, unsigned SIndex,
- unsigned Base, unsigned Opcode, unsigned Size,
- ARMCC::CondCodes Pred, unsigned PredReg,
- unsigned Scratch, MemOpQueue &MemOps,
- SmallVectorImpl<MachineBasicBlock::iterator> &Merges) {
- bool isNotVFP = isi32Load(Opcode) || isi32Store(Opcode);
- int Offset = MemOps[SIndex].Offset;
- int SOffset = Offset;
- unsigned insertAfter = SIndex;
- MachineBasicBlock::iterator Loc = MemOps[SIndex].MBBI;
- DebugLoc dl = Loc->getDebugLoc();
- const MachineOperand &PMO = Loc->getOperand(0);
- unsigned PReg = PMO.getReg();
- unsigned PRegNum = PMO.isUndef() ? UINT_MAX : TRI->getEncodingValue(PReg);
- unsigned Count = 1;
- unsigned Limit = ~0U;
- bool BaseKill = false;
- // vldm / vstm limit are 32 for S variants, 16 for D variants.
+static bool isValidLSDoubleOffset(int Offset) {
+ unsigned Value = abs(Offset);
+ // t2LDRDi8/t2STRDi8 support an 8-bit immediate which is internally
+ // multiplied by 4.
+ return (Value % 4) == 0 && Value < 1024;
+}
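// For illustration only: under the check above, offsets such as -1020, 0 and
// 1020 are accepted (multiples of 4 with magnitude below 1024), while 1024 or
// 6 are rejected.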
- switch (Opcode) {
- default: break;
- case ARM::VSTRS:
- Limit = 32;
- break;
- case ARM::VSTRD:
- Limit = 16;
- break;
- case ARM::VLDRD:
- Limit = 16;
- break;
- case ARM::VLDRS:
- Limit = 32;
- break;
- }
+/// Find candidates for load/store multiple merge in list of MemOpQueueEntries.
+void ARMLoadStoreOpt::FormCandidates(const MemOpQueue &MemOps) {
+ const MachineInstr *FirstMI = MemOps[0].MI;
+ unsigned Opcode = FirstMI->getOpcode();
+ bool isNotVFP = isi32Load(Opcode) || isi32Store(Opcode);
+ unsigned Size = getLSMultipleTransferSize(FirstMI);
+
+ unsigned SIndex = 0;
+ unsigned EIndex = MemOps.size();
+ do {
+ // Look at the first instruction.
+ const MachineInstr *MI = MemOps[SIndex].MI;
+ int Offset = MemOps[SIndex].Offset;
+ const MachineOperand &PMO = getLoadStoreRegOp(*MI);
+ unsigned PReg = PMO.getReg();
+ unsigned PRegNum = PMO.isUndef() ? UINT_MAX : TRI->getEncodingValue(PReg);
+ unsigned Latest = SIndex;
+ unsigned Earliest = SIndex;
+ unsigned Count = 1;
+ bool CanMergeToLSDouble =
+ STI->isThumb2() && isNotVFP && isValidLSDoubleOffset(Offset);
+ // ARM erratum 602117: LDRD with base in list may result in incorrect base
+ // register when interrupted or faulted.
+ if (STI->isCortexM3() && isi32Load(Opcode) &&
+ PReg == getLoadStoreBaseOp(*MI).getReg())
+ CanMergeToLSDouble = false;
+
+ bool CanMergeToLSMulti = true;
+ // On Swift, avoid vldm/vstm starting with an odd register number, as that
+ // needs more uops than single vldrs.
+ if (STI->isSwift() && !isNotVFP && (PRegNum % 2) == 1)
+ CanMergeToLSMulti = false;
+
+ // Merge following instructions where possible.
+ for (unsigned I = SIndex+1; I < EIndex; ++I, ++Count) {
+ int NewOffset = MemOps[I].Offset;
+ if (NewOffset != Offset + (int)Size)
+ break;
+ const MachineOperand &MO = getLoadStoreRegOp(*MemOps[I].MI);
+ unsigned Reg = MO.getReg();
+ unsigned RegNum = MO.isUndef() ? UINT_MAX : TRI->getEncodingValue(Reg);
+
+ // See if the current load/store may be part of a multi load/store.
+ bool PartOfLSMulti = CanMergeToLSMulti;
+ if (PartOfLSMulti) {
+ // Cannot load from SP
+ if (Reg == ARM::SP)
+ PartOfLSMulti = false;
+ // Register numbers must be in ascending order.
+ else if (RegNum <= PRegNum)
+ PartOfLSMulti = false;
+ // For VFP / NEON load/store multiples, the registers must be
+ // consecutive and within the limit on the number of registers per
+ // instruction.
+ else if (!isNotVFP && RegNum != PRegNum+1)
+ PartOfLSMulti = false;
+ }
+ // See if the current load/store may be part of a double load/store.
+ bool PartOfLSDouble = CanMergeToLSDouble && Count <= 1;
- for (unsigned i = SIndex+1, e = MemOps.size(); i != e; ++i) {
- int NewOffset = MemOps[i].Offset;
- const MachineOperand &MO = MemOps[i].MBBI->getOperand(0);
- unsigned Reg = MO.getReg();
- unsigned RegNum = MO.isUndef() ? UINT_MAX : TRI->getEncodingValue(Reg);
- // Register numbers must be in ascending order. For VFP / NEON load and
- // store multiples, the registers must also be consecutive and within the
- // limit on the number of registers per instruction.
- if (Reg != ARM::SP &&
- NewOffset == Offset + (int)Size &&
- ((isNotVFP && RegNum > PRegNum) ||
- ((Count < Limit) && RegNum == PRegNum+1)) &&
- // On Swift we don't want vldm/vstm to start with a odd register num
- // because Q register unaligned vldm/vstm need more uops.
- (!STI->isSwift() || isNotVFP || Count != 1 || !(PRegNum & 0x1))) {
+ if (!PartOfLSMulti && !PartOfLSDouble)
+ break;
+ CanMergeToLSMulti &= PartOfLSMulti;
+ CanMergeToLSDouble &= PartOfLSDouble;
+ // Track MemOp with latest and earliest position (Positions are
+ // counted in reverse).
+ unsigned Position = MemOps[I].Position;
+ if (Position < MemOps[Latest].Position)
+ Latest = I;
+ else if (Position > MemOps[Earliest].Position)
+ Earliest = I;
+ // Prepare for next MemOp.
Offset += Size;
PRegNum = RegNum;
- ++Count;
- } else {
- // Can't merge this in. Try merge the earlier ones first.
- // We need to compute BaseKill here because the MemOps may have been
- // reordered.
- BaseKill = Loc->killsRegister(Base);
-
- MergeOpsUpdate(MBB, MemOps, SIndex, i, insertAfter, SOffset, Base,
- BaseKill, Opcode, Pred, PredReg, Scratch, dl, Merges);
- MergeLDR_STR(MBB, i, Base, Opcode, Size, Pred, PredReg, Scratch,
- MemOps, Merges);
- return;
}
- if (MemOps[i].Position > MemOps[insertAfter].Position) {
- insertAfter = i;
- Loc = MemOps[i].MBBI;
- }
- }
-
- BaseKill = Loc->killsRegister(Base);
- MergeOpsUpdate(MBB, MemOps, SIndex, MemOps.size(), insertAfter, SOffset,
- Base, BaseKill, Opcode, Pred, PredReg, Scratch, dl, Merges);
-}
-
-static bool isMatchingDecrement(MachineInstr *MI, unsigned Base,
- unsigned Bytes, unsigned Limit,
- ARMCC::CondCodes Pred, unsigned PredReg) {
- unsigned MyPredReg = 0;
- if (!MI)
- return false;
-
- bool CheckCPSRDef = false;
- switch (MI->getOpcode()) {
- default: return false;
- case ARM::tSUBi8:
- case ARM::t2SUBri:
- case ARM::SUBri:
- CheckCPSRDef = true;
- break;
- case ARM::tSUBspi:
- break;
- }
-
- // Make sure the offset fits in 8 bits.
- if (Bytes == 0 || (Limit && Bytes >= Limit))
- return false;
-
- unsigned Scale = (MI->getOpcode() == ARM::tSUBspi ||
- MI->getOpcode() == ARM::tSUBi8) ? 4 : 1; // FIXME
- if (!(MI->getOperand(0).getReg() == Base &&
- MI->getOperand(1).getReg() == Base &&
- (MI->getOperand(2).getImm() * Scale) == Bytes &&
- getInstrPredicate(MI, MyPredReg) == Pred &&
- MyPredReg == PredReg))
- return false;
-
- return CheckCPSRDef ? !definesCPSR(MI) : true;
-}
-
-static bool isMatchingIncrement(MachineInstr *MI, unsigned Base,
- unsigned Bytes, unsigned Limit,
- ARMCC::CondCodes Pred, unsigned PredReg) {
- unsigned MyPredReg = 0;
- if (!MI)
- return false;
-
- bool CheckCPSRDef = false;
- switch (MI->getOpcode()) {
- default: return false;
- case ARM::tADDi8:
- case ARM::t2ADDri:
- case ARM::ADDri:
- CheckCPSRDef = true;
- break;
- case ARM::tADDspi:
- break;
- }
-
- if (Bytes == 0 || (Limit && Bytes >= Limit))
- // Make sure the offset fits in 8 bits.
- return false;
-
- unsigned Scale = (MI->getOpcode() == ARM::tADDspi ||
- MI->getOpcode() == ARM::tADDi8) ? 4 : 1; // FIXME
- if (!(MI->getOperand(0).getReg() == Base &&
- MI->getOperand(1).getReg() == Base &&
- (MI->getOperand(2).getImm() * Scale) == Bytes &&
- getInstrPredicate(MI, MyPredReg) == Pred &&
- MyPredReg == PredReg))
- return false;
-
- return CheckCPSRDef ? !definesCPSR(MI) : true;
-}
-
-static inline unsigned getLSMultipleTransferSize(MachineInstr *MI) {
- switch (MI->getOpcode()) {
- default: return 0;
- case ARM::LDRi12:
- case ARM::STRi12:
- case ARM::tLDRi:
- case ARM::tSTRi:
- case ARM::tLDRspi:
- case ARM::tSTRspi:
- case ARM::t2LDRi8:
- case ARM::t2LDRi12:
- case ARM::t2STRi8:
- case ARM::t2STRi12:
- case ARM::VLDRS:
- case ARM::VSTRS:
- return 4;
- case ARM::VLDRD:
- case ARM::VSTRD:
- return 8;
- case ARM::LDMIA:
- case ARM::LDMDA:
- case ARM::LDMDB:
- case ARM::LDMIB:
- case ARM::STMIA:
- case ARM::STMDA:
- case ARM::STMDB:
- case ARM::STMIB:
- case ARM::tLDMIA:
- case ARM::tLDMIA_UPD:
- case ARM::tSTMIA_UPD:
- case ARM::t2LDMIA:
- case ARM::t2LDMDB:
- case ARM::t2STMIA:
- case ARM::t2STMDB:
- case ARM::VLDMSIA:
- case ARM::VSTMSIA:
- return (MI->getNumOperands() - MI->getDesc().getNumOperands() + 1) * 4;
- case ARM::VLDMDIA:
- case ARM::VSTMDIA:
- return (MI->getNumOperands() - MI->getDesc().getNumOperands() + 1) * 8;
- }
+ // Form a candidate from the Ops collected so far.
+ MergeCandidate *Candidate = new(Allocator.Allocate()) MergeCandidate;
+ for (unsigned C = SIndex, CE = SIndex + Count; C < CE; ++C)
+ Candidate->Instrs.push_back(MemOps[C].MI);
+ Candidate->LatestMIIdx = Latest - SIndex;
+ Candidate->EarliestMIIdx = Earliest - SIndex;
+ Candidate->InsertPos = MemOps[Latest].Position;
+ if (Count == 1)
+ CanMergeToLSMulti = CanMergeToLSDouble = false;
+ Candidate->CanMergeToLSMulti = CanMergeToLSMulti;
+ Candidate->CanMergeToLSDouble = CanMergeToLSDouble;
+ Candidates.push_back(Candidate);
+ // Continue after the chain.
+ SIndex += Count;
+ } while (SIndex < EIndex);
}
static unsigned getUpdatingLSMultipleOpcode(unsigned Opc,
@@ -1081,6 +1060,75 @@ static unsigned getUpdatingLSMultipleOpcode(unsigned Opc,
}
}
+/// Check if the given instruction increments or decrements a register and
+/// return the amount it is incremented/decremented. Returns 0 if the CPSR flags
+/// generated by the instruction are possibly read as well.
+static int isIncrementOrDecrement(const MachineInstr &MI, unsigned Reg,
+ ARMCC::CondCodes Pred, unsigned PredReg) {
+ bool CheckCPSRDef;
+ int Scale;
+ switch (MI.getOpcode()) {
+ case ARM::tADDi8: Scale = 4; CheckCPSRDef = true; break;
+ case ARM::tSUBi8: Scale = -4; CheckCPSRDef = true; break;
+ case ARM::t2SUBri:
+ case ARM::SUBri: Scale = -1; CheckCPSRDef = true; break;
+ case ARM::t2ADDri:
+ case ARM::ADDri: Scale = 1; CheckCPSRDef = true; break;
+ case ARM::tADDspi: Scale = 4; CheckCPSRDef = false; break;
+ case ARM::tSUBspi: Scale = -4; CheckCPSRDef = false; break;
+ default: return 0;
+ }
+
+ unsigned MIPredReg;
+ if (MI.getOperand(0).getReg() != Reg ||
+ MI.getOperand(1).getReg() != Reg ||
+ getInstrPredicate(&MI, MIPredReg) != Pred ||
+ MIPredReg != PredReg)
+ return 0;
+
+ if (CheckCPSRDef && definesCPSR(&MI))
+ return 0;
+ return MI.getOperand(2).getImm() * Scale;
+}
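// For illustration only: a matching "tSUBspi sp, sp, #2" yields 2 * -4 == -8,
// a matching "ADDri r1, r1, #12" yields 12, and any opcode outside the switch,
// or an ADD/SUB form that defines CPSR, yields 0.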
+
+/// Searches for an increment or decrement of \p Reg before \p MBBI.
+static MachineBasicBlock::iterator
+findIncDecBefore(MachineBasicBlock::iterator MBBI, unsigned Reg,
+ ARMCC::CondCodes Pred, unsigned PredReg, int &Offset) {
+ Offset = 0;
+ MachineBasicBlock &MBB = *MBBI->getParent();
+ MachineBasicBlock::iterator BeginMBBI = MBB.begin();
+ MachineBasicBlock::iterator EndMBBI = MBB.end();
+ if (MBBI == BeginMBBI)
+ return EndMBBI;
+
+ // Skip debug values.
+ MachineBasicBlock::iterator PrevMBBI = std::prev(MBBI);
+ while (PrevMBBI->isDebugValue() && PrevMBBI != BeginMBBI)
+ --PrevMBBI;
+
+ Offset = isIncrementOrDecrement(*PrevMBBI, Reg, Pred, PredReg);
+ return Offset == 0 ? EndMBBI : PrevMBBI;
+}
+
+/// Searches for an increment or decrement of \p Reg after \p MBBI.
+static MachineBasicBlock::iterator
+findIncDecAfter(MachineBasicBlock::iterator MBBI, unsigned Reg,
+ ARMCC::CondCodes Pred, unsigned PredReg, int &Offset) {
+ Offset = 0;
+ MachineBasicBlock &MBB = *MBBI->getParent();
+ MachineBasicBlock::iterator EndMBBI = MBB.end();
+ MachineBasicBlock::iterator NextMBBI = std::next(MBBI);
+ // Skip debug values.
+ while (NextMBBI != EndMBBI && NextMBBI->isDebugValue())
+ ++NextMBBI;
+ if (NextMBBI == EndMBBI)
+ return EndMBBI;
+
+ Offset = isIncrementOrDecrement(*NextMBBI, Reg, Pred, PredReg);
+ return Offset == 0 ? EndMBBI : NextMBBI;
+}
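// Both helpers leave Offset at 0 and return MBB.end() when no suitable
// increment or decrement is found, so callers may test either result.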
+
/// Fold preceding/trailing inc/dec of base register into the
/// LDM/STM/VLDM{D|S}/VSTM{D|S} op when possible:
///
@@ -1093,21 +1141,17 @@ static unsigned getUpdatingLSMultipleOpcode(unsigned Opc,
/// ldmia rn, <ra, rb, rc>
/// =>
/// ldmdb rn!, <ra, rb, rc>
-bool ARMLoadStoreOpt::MergeBaseUpdateLSMultiple(MachineBasicBlock &MBB,
- MachineBasicBlock::iterator MBBI,
- bool &Advance,
- MachineBasicBlock::iterator &I) {
+bool ARMLoadStoreOpt::MergeBaseUpdateLSMultiple(MachineInstr *MI) {
// Thumb1 is already using updating loads/stores.
if (isThumb1) return false;
- MachineInstr *MI = MBBI;
- unsigned Base = MI->getOperand(0).getReg();
- bool BaseKill = MI->getOperand(0).isKill();
- unsigned Bytes = getLSMultipleTransferSize(MI);
+ const MachineOperand &BaseOP = MI->getOperand(0);
+ unsigned Base = BaseOP.getReg();
+ bool BaseKill = BaseOP.isKill();
unsigned PredReg = 0;
ARMCC::CondCodes Pred = getInstrPredicate(MI, PredReg);
unsigned Opcode = MI->getOpcode();
- DebugLoc dl = MI->getDebugLoc();
+ DebugLoc DL = MI->getDebugLoc();
// Can't use an updating ld/st if the base register is also a dest
// register. e.g. ldmdb r0!, {r0, r1, r2}. The behavior is undefined.
@@ -1115,55 +1159,27 @@ bool ARMLoadStoreOpt::MergeBaseUpdateLSMultiple(MachineBasicBlock &MBB,
if (MI->getOperand(i).getReg() == Base)
return false;
- bool DoMerge = false;
+ int Bytes = getLSMultipleTransferSize(MI);
+ MachineBasicBlock &MBB = *MI->getParent();
+ MachineBasicBlock::iterator MBBI(MI);
+ int Offset;
+ MachineBasicBlock::iterator MergeInstr
+ = findIncDecBefore(MBBI, Base, Pred, PredReg, Offset);
ARM_AM::AMSubMode Mode = getLoadStoreMultipleSubMode(Opcode);
-
- // Try merging with the previous instruction.
- MachineBasicBlock::iterator BeginMBBI = MBB.begin();
- if (MBBI != BeginMBBI) {
- MachineBasicBlock::iterator PrevMBBI = std::prev(MBBI);
- while (PrevMBBI != BeginMBBI && PrevMBBI->isDebugValue())
- --PrevMBBI;
- if (Mode == ARM_AM::ia &&
- isMatchingDecrement(PrevMBBI, Base, Bytes, 0, Pred, PredReg)) {
- Mode = ARM_AM::db;
- DoMerge = true;
- } else if (Mode == ARM_AM::ib &&
- isMatchingDecrement(PrevMBBI, Base, Bytes, 0, Pred, PredReg)) {
- Mode = ARM_AM::da;
- DoMerge = true;
- }
- if (DoMerge)
- MBB.erase(PrevMBBI);
- }
-
- // Try merging with the next instruction.
- MachineBasicBlock::iterator EndMBBI = MBB.end();
- if (!DoMerge && MBBI != EndMBBI) {
- MachineBasicBlock::iterator NextMBBI = std::next(MBBI);
- while (NextMBBI != EndMBBI && NextMBBI->isDebugValue())
- ++NextMBBI;
- if ((Mode == ARM_AM::ia || Mode == ARM_AM::ib) &&
- isMatchingIncrement(NextMBBI, Base, Bytes, 0, Pred, PredReg)) {
- DoMerge = true;
- } else if ((Mode == ARM_AM::da || Mode == ARM_AM::db) &&
- isMatchingDecrement(NextMBBI, Base, Bytes, 0, Pred, PredReg)) {
- DoMerge = true;
- }
- if (DoMerge) {
- if (NextMBBI == I) {
- Advance = true;
- ++I;
- }
- MBB.erase(NextMBBI);
- }
+ if (Mode == ARM_AM::ia && Offset == -Bytes) {
+ Mode = ARM_AM::db;
+ } else if (Mode == ARM_AM::ib && Offset == -Bytes) {
+ Mode = ARM_AM::da;
+ } else {
+ MergeInstr = findIncDecAfter(MBBI, Base, Pred, PredReg, Offset);
+ if (((Mode != ARM_AM::ia && Mode != ARM_AM::ib) || Offset != Bytes) &&
+ ((Mode != ARM_AM::da && Mode != ARM_AM::db) || Offset != -Bytes))
+ return false;
}
-
- if (!DoMerge)
- return false;
+ MBB.erase(MergeInstr);
unsigned NewOpc = getUpdatingLSMultipleOpcode(Opcode, Mode);
- MachineInstrBuilder MIB = BuildMI(MBB, MBBI, dl, TII->get(NewOpc))
+ MachineInstrBuilder MIB = BuildMI(MBB, MBBI, DL, TII->get(NewOpc))
.addReg(Base, getDefRegState(true)) // WB base register
.addReg(Base, getKillRegState(BaseKill))
.addImm(Pred).addReg(PredReg);
@@ -1231,21 +1247,15 @@ static unsigned getPostIndexedLoadStoreOpcode(unsigned Opc,
/// Fold preceding/trailing inc/dec of base register into the
/// LDR/STR/FLD{D|S}/FST{D|S} op when possible:
-bool ARMLoadStoreOpt::MergeBaseUpdateLoadStore(MachineBasicBlock &MBB,
- MachineBasicBlock::iterator MBBI,
- const TargetInstrInfo *TII,
- bool &Advance,
- MachineBasicBlock::iterator &I) {
+bool ARMLoadStoreOpt::MergeBaseUpdateLoadStore(MachineInstr *MI) {
// Thumb1 doesn't have updating LDR/STR.
// FIXME: Use LDM/STM with single register instead.
if (isThumb1) return false;
- MachineInstr *MI = MBBI;
- unsigned Base = MI->getOperand(1).getReg();
- bool BaseKill = MI->getOperand(1).isKill();
- unsigned Bytes = getLSMultipleTransferSize(MI);
+ unsigned Base = getLoadStoreBaseOp(*MI).getReg();
+ bool BaseKill = getLoadStoreBaseOp(*MI).isKill();
unsigned Opcode = MI->getOpcode();
- DebugLoc dl = MI->getDebugLoc();
+ DebugLoc DL = MI->getDebugLoc();
bool isAM5 = (Opcode == ARM::VLDRD || Opcode == ARM::VLDRS ||
Opcode == ARM::VSTRD || Opcode == ARM::VSTRS);
bool isAM2 = (Opcode == ARM::LDRi12 || Opcode == ARM::STRi12);
@@ -1255,7 +1265,6 @@ bool ARMLoadStoreOpt::MergeBaseUpdateLoadStore(MachineBasicBlock &MBB,
if (isAM5 && ARM_AM::getAM5Offset(MI->getOperand(2).getImm()) != 0)
return false;
- bool isLd = isi32Load(Opcode) || Opcode == ARM::VLDRS || Opcode == ARM::VLDRD;
// Can't do the merge if the destination register is the same as the would-be
// writeback register.
if (MI->getOperand(0).getReg() == Base)
@@ -1263,64 +1272,38 @@ bool ARMLoadStoreOpt::MergeBaseUpdateLoadStore(MachineBasicBlock &MBB,
unsigned PredReg = 0;
ARMCC::CondCodes Pred = getInstrPredicate(MI, PredReg);
- bool DoMerge = false;
- ARM_AM::AddrOpc AddSub = ARM_AM::add;
- unsigned NewOpc = 0;
- // AM2 - 12 bits, thumb2 - 8 bits.
- unsigned Limit = isAM5 ? 0 : (isAM2 ? 0x1000 : 0x100);
-
- // Try merging with the previous instruction.
- MachineBasicBlock::iterator BeginMBBI = MBB.begin();
- if (MBBI != BeginMBBI) {
- MachineBasicBlock::iterator PrevMBBI = std::prev(MBBI);
- while (PrevMBBI != BeginMBBI && PrevMBBI->isDebugValue())
- --PrevMBBI;
- if (isMatchingDecrement(PrevMBBI, Base, Bytes, Limit, Pred, PredReg)) {
- DoMerge = true;
- AddSub = ARM_AM::sub;
- } else if (!isAM5 &&
- isMatchingIncrement(PrevMBBI, Base, Bytes, Limit,Pred,PredReg)) {
- DoMerge = true;
- }
- if (DoMerge) {
- NewOpc = getPreIndexedLoadStoreOpcode(Opcode, AddSub);
- MBB.erase(PrevMBBI);
- }
- }
-
- // Try merging with the next instruction.
- MachineBasicBlock::iterator EndMBBI = MBB.end();
- if (!DoMerge && MBBI != EndMBBI) {
- MachineBasicBlock::iterator NextMBBI = std::next(MBBI);
- while (NextMBBI != EndMBBI && NextMBBI->isDebugValue())
- ++NextMBBI;
- if (!isAM5 &&
- isMatchingDecrement(NextMBBI, Base, Bytes, Limit, Pred, PredReg)) {
- DoMerge = true;
- AddSub = ARM_AM::sub;
- } else if (isMatchingIncrement(NextMBBI, Base, Bytes, Limit,Pred,PredReg)) {
- DoMerge = true;
- }
- if (DoMerge) {
- NewOpc = getPostIndexedLoadStoreOpcode(Opcode, AddSub);
- if (NextMBBI == I) {
- Advance = true;
- ++I;
- }
- MBB.erase(NextMBBI);
- }
+ int Bytes = getLSMultipleTransferSize(MI);
+ MachineBasicBlock &MBB = *MI->getParent();
+ MachineBasicBlock::iterator MBBI(MI);
+ int Offset;
+ MachineBasicBlock::iterator MergeInstr
+ = findIncDecBefore(MBBI, Base, Pred, PredReg, Offset);
+ unsigned NewOpc;
+ if (!isAM5 && Offset == Bytes) {
+ NewOpc = getPreIndexedLoadStoreOpcode(Opcode, ARM_AM::add);
+ } else if (Offset == -Bytes) {
+ NewOpc = getPreIndexedLoadStoreOpcode(Opcode, ARM_AM::sub);
+ } else {
+ MergeInstr = findIncDecAfter(MBBI, Base, Pred, PredReg, Offset);
+ if (Offset == Bytes) {
+ NewOpc = getPostIndexedLoadStoreOpcode(Opcode, ARM_AM::add);
+ } else if (!isAM5 && Offset == -Bytes) {
+ NewOpc = getPostIndexedLoadStoreOpcode(Opcode, ARM_AM::sub);
+ } else
+ return false;
}
+ MBB.erase(MergeInstr);
- if (!DoMerge)
- return false;
+ ARM_AM::AddrOpc AddSub = Offset < 0 ? ARM_AM::sub : ARM_AM::add;
+ bool isLd = isLoadSingle(Opcode);
if (isAM5) {
// VLDM[SD]_UPD, VSTM[SD]_UPD
// (There are no base-updating versions of VLDR/VSTR instructions, but the
// updating load/store-multiple instructions can be used with only one
// register.)
MachineOperand &MO = MI->getOperand(0);
- BuildMI(MBB, MBBI, dl, TII->get(NewOpc))
+ BuildMI(MBB, MBBI, DL, TII->get(NewOpc))
.addReg(Base, getDefRegState(true)) // WB base register
.addReg(Base, getKillRegState(isLd ? BaseKill : false))
.addImm(Pred).addReg(PredReg)
@@ -1330,20 +1313,18 @@ bool ARMLoadStoreOpt::MergeBaseUpdateLoadStore(MachineBasicBlock &MBB,
if (isAM2) {
// LDR_PRE, LDR_POST
if (NewOpc == ARM::LDR_PRE_IMM || NewOpc == ARM::LDRB_PRE_IMM) {
- int Offset = AddSub == ARM_AM::sub ? -Bytes : Bytes;
- BuildMI(MBB, MBBI, dl, TII->get(NewOpc), MI->getOperand(0).getReg())
+ BuildMI(MBB, MBBI, DL, TII->get(NewOpc), MI->getOperand(0).getReg())
.addReg(Base, RegState::Define)
.addReg(Base).addImm(Offset).addImm(Pred).addReg(PredReg);
} else {
- int Offset = ARM_AM::getAM2Opc(AddSub, Bytes, ARM_AM::no_shift);
- BuildMI(MBB, MBBI, dl, TII->get(NewOpc), MI->getOperand(0).getReg())
+ int Imm = ARM_AM::getAM2Opc(AddSub, Bytes, ARM_AM::no_shift);
+ BuildMI(MBB, MBBI, DL, TII->get(NewOpc), MI->getOperand(0).getReg())
.addReg(Base, RegState::Define)
- .addReg(Base).addReg(0).addImm(Offset).addImm(Pred).addReg(PredReg);
+ .addReg(Base).addReg(0).addImm(Imm).addImm(Pred).addReg(PredReg);
}
} else {
- int Offset = AddSub == ARM_AM::sub ? -Bytes : Bytes;
// t2LDR_PRE, t2LDR_POST
- BuildMI(MBB, MBBI, dl, TII->get(NewOpc), MI->getOperand(0).getReg())
+ BuildMI(MBB, MBBI, DL, TII->get(NewOpc), MI->getOperand(0).getReg())
.addReg(Base, RegState::Define)
.addReg(Base).addImm(Offset).addImm(Pred).addReg(PredReg);
}
@@ -1353,15 +1334,14 @@ bool ARMLoadStoreOpt::MergeBaseUpdateLoadStore(MachineBasicBlock &MBB,
// the vestigial zero-reg offset register. When that's fixed, this clause
// can be removed entirely.
if (isAM2 && NewOpc == ARM::STR_POST_IMM) {
- int Offset = ARM_AM::getAM2Opc(AddSub, Bytes, ARM_AM::no_shift);
+ int Imm = ARM_AM::getAM2Opc(AddSub, Bytes, ARM_AM::no_shift);
// STR_PRE, STR_POST
- BuildMI(MBB, MBBI, dl, TII->get(NewOpc), Base)
+ BuildMI(MBB, MBBI, DL, TII->get(NewOpc), Base)
.addReg(MO.getReg(), getKillRegState(MO.isKill()))
- .addReg(Base).addReg(0).addImm(Offset).addImm(Pred).addReg(PredReg);
+ .addReg(Base).addReg(0).addImm(Imm).addImm(Pred).addReg(PredReg);
} else {
- int Offset = AddSub == ARM_AM::sub ? -Bytes : Bytes;
// t2STR_PRE, t2STR_POST
- BuildMI(MBB, MBBI, dl, TII->get(NewOpc), Base)
+ BuildMI(MBB, MBBI, DL, TII->get(NewOpc), Base)
.addReg(MO.getReg(), getKillRegState(MO.isKill()))
.addReg(Base).addImm(Offset).addImm(Pred).addReg(PredReg);
}
@@ -1371,6 +1351,66 @@ bool ARMLoadStoreOpt::MergeBaseUpdateLoadStore(MachineBasicBlock &MBB,
return true;
}
+bool ARMLoadStoreOpt::MergeBaseUpdateLSDouble(MachineInstr &MI) const {
+ unsigned Opcode = MI.getOpcode();
+ assert((Opcode == ARM::t2LDRDi8 || Opcode == ARM::t2STRDi8) &&
+ "Must have t2STRDi8 or t2LDRDi8");
+ if (MI.getOperand(3).getImm() != 0)
+ return false;
+
+ // Behaviour for writeback is undefined if base register is the same as one
+ // of the others.
+ const MachineOperand &BaseOp = MI.getOperand(2);
+ unsigned Base = BaseOp.getReg();
+ const MachineOperand &Reg0Op = MI.getOperand(0);
+ const MachineOperand &Reg1Op = MI.getOperand(1);
+ if (Reg0Op.getReg() == Base || Reg1Op.getReg() == Base)
+ return false;
+
+ unsigned PredReg;
+ ARMCC::CondCodes Pred = getInstrPredicate(&MI, PredReg);
+ MachineBasicBlock::iterator MBBI(MI);
+ MachineBasicBlock &MBB = *MI.getParent();
+ int Offset;
+ MachineBasicBlock::iterator MergeInstr = findIncDecBefore(MBBI, Base, Pred,
+ PredReg, Offset);
+ unsigned NewOpc;
+ if (Offset == 8 || Offset == -8) {
+ NewOpc = Opcode == ARM::t2LDRDi8 ? ARM::t2LDRD_PRE : ARM::t2STRD_PRE;
+ } else {
+ MergeInstr = findIncDecAfter(MBBI, Base, Pred, PredReg, Offset);
+ if (Offset == 8 || Offset == -8) {
+ NewOpc = Opcode == ARM::t2LDRDi8 ? ARM::t2LDRD_POST : ARM::t2STRD_POST;
+ } else
+ return false;
+ }
+ MBB.erase(MergeInstr);
+
+ DebugLoc DL = MI.getDebugLoc();
+ MachineInstrBuilder MIB = BuildMI(MBB, MBBI, DL, TII->get(NewOpc));
+ if (NewOpc == ARM::t2LDRD_PRE || NewOpc == ARM::t2LDRD_POST) {
+ MIB.addOperand(Reg0Op).addOperand(Reg1Op)
+ .addReg(BaseOp.getReg(), RegState::Define);
+ } else {
+ assert(NewOpc == ARM::t2STRD_PRE || NewOpc == ARM::t2STRD_POST);
+ MIB.addReg(BaseOp.getReg(), RegState::Define)
+ .addOperand(Reg0Op).addOperand(Reg1Op);
+ }
+ MIB.addReg(BaseOp.getReg(), RegState::Kill)
+ .addImm(Offset).addImm(Pred).addReg(PredReg);
+ assert(TII->get(Opcode).getNumOperands() == 6 &&
+ TII->get(NewOpc).getNumOperands() == 7 &&
+ "Unexpected number of operands in Opcode specification.");
+
+ // Transfer implicit operands.
+ for (const MachineOperand &MO : MI.implicit_operands())
+ MIB.addOperand(MO);
+ MIB->setMemRefs(MI.memoperands_begin(), MI.memoperands_end());
+
+ MBB.erase(MBBI);
+ return true;
+}
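// For illustration only (example registers): "t2LDRDi8 r0, r1, [r2, #0]"
// followed by a matching t2ADDri of 8 to r2 that does not set CPSR is
// rewritten here into a single post-indexed t2LDRD_POST with writeback to r2.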
+
/// Returns true if instruction is a memory operation that this pass is capable
/// of operating on.
static bool isMemoryOp(const MachineInstr *MI) {
@@ -1426,26 +1466,10 @@ static bool isMemoryOp(const MachineInstr *MI) {
return false;
}
-/// Advance register scavenger to just before the earliest memory op that is
-/// being merged.
-void ARMLoadStoreOpt::AdvanceRS(MachineBasicBlock &MBB, MemOpQueue &MemOps) {
- MachineBasicBlock::iterator Loc = MemOps[0].MBBI;
- unsigned Position = MemOps[0].Position;
- for (unsigned i = 1, e = MemOps.size(); i != e; ++i) {
- if (MemOps[i].Position < Position) {
- Position = MemOps[i].Position;
- Loc = MemOps[i].MBBI;
- }
- }
-
- if (Loc != MBB.begin())
- RS->forward(std::prev(Loc));
-}
-
static void InsertLDR_STR(MachineBasicBlock &MBB,
MachineBasicBlock::iterator &MBBI,
int Offset, bool isDef,
- DebugLoc dl, unsigned NewOpc,
+ DebugLoc DL, unsigned NewOpc,
unsigned Reg, bool RegDeadKill, bool RegUndef,
unsigned BaseReg, bool BaseKill, bool BaseUndef,
bool OffKill, bool OffUndef,
@@ -1491,7 +1515,6 @@ bool ARMLoadStoreOpt::FixInvalidRegPairOp(MachineBasicBlock &MBB,
if (!Errata602117 && !NonConsecutiveRegs)
return false;
- MachineBasicBlock::iterator NewBBI = MBBI;
bool isT2 = Opcode == ARM::t2LDRDi8 || Opcode == ARM::t2STRDi8;
bool isLd = Opcode == ARM::LDRD || Opcode == ARM::t2LDRDi8;
bool EvenDeadKill = isLd ?
@@ -1531,7 +1554,6 @@ bool ARMLoadStoreOpt::FixInvalidRegPairOp(MachineBasicBlock &MBB,
getKillRegState(OddDeadKill) | getUndefRegState(OddUndef));
++NumSTRD2STM;
}
- NewBBI = std::prev(MBBI);
} else {
// Split into two instructions.
unsigned NewOpc = (isLd)
@@ -1553,7 +1575,6 @@ bool ARMLoadStoreOpt::FixInvalidRegPairOp(MachineBasicBlock &MBB,
OddReg, OddDeadKill, false,
BaseReg, false, BaseUndef, false, OffUndef,
Pred, PredReg, TII, isT2);
- NewBBI = std::prev(MBBI);
InsertLDR_STR(MBB, MBBI, OffImm, isLd, dl, NewOpc,
EvenReg, EvenDeadKill, false,
BaseReg, BaseKill, BaseUndef, OffKill, OffUndef,
@@ -1573,7 +1594,6 @@ bool ARMLoadStoreOpt::FixInvalidRegPairOp(MachineBasicBlock &MBB,
EvenReg, EvenDeadKill, EvenUndef,
BaseReg, false, BaseUndef, false, OffUndef,
Pred, PredReg, TII, isT2);
- NewBBI = std::prev(MBBI);
InsertLDR_STR(MBB, MBBI, OffImm+4, isLd, dl, NewOpc2,
OddReg, OddDeadKill, OddUndef,
BaseReg, BaseKill, BaseUndef, OffKill, OffUndef,
@@ -1585,191 +1605,160 @@ bool ARMLoadStoreOpt::FixInvalidRegPairOp(MachineBasicBlock &MBB,
++NumSTRD2STR;
}
- MBB.erase(MI);
- MBBI = NewBBI;
+ MBBI = MBB.erase(MBBI);
return true;
}
/// An optimization pass to turn multiple LDR / STR ops of the same base and
/// incrementing offset into LDM / STM ops.
bool ARMLoadStoreOpt::LoadStoreMultipleOpti(MachineBasicBlock &MBB) {
- unsigned NumMerges = 0;
- unsigned NumMemOps = 0;
MemOpQueue MemOps;
unsigned CurrBase = 0;
unsigned CurrOpc = ~0u;
- unsigned CurrSize = 0;
ARMCC::CondCodes CurrPred = ARMCC::AL;
- unsigned CurrPredReg = 0;
unsigned Position = 0;
- SmallVector<MachineBasicBlock::iterator,4> Merges;
-
- RS->enterBasicBlock(&MBB);
- MachineBasicBlock::iterator MBBI = MBB.begin(), E = MBB.end();
- while (MBBI != E) {
+ assert(Candidates.size() == 0);
+ assert(MergeBaseCandidates.size() == 0);
+ LiveRegsValid = false;
+
+ for (MachineBasicBlock::iterator I = MBB.end(), MBBI; I != MBB.begin();
+ I = MBBI) {
+ // The instruction in front of the iterator is the one we look at.
+ MBBI = std::prev(I);
if (FixInvalidRegPairOp(MBB, MBBI))
continue;
+ ++Position;
- bool Advance = false;
- bool TryMerge = false;
-
- bool isMemOp = isMemoryOp(MBBI);
- if (isMemOp) {
+ if (isMemoryOp(MBBI)) {
unsigned Opcode = MBBI->getOpcode();
- unsigned Size = getLSMultipleTransferSize(MBBI);
const MachineOperand &MO = MBBI->getOperand(0);
unsigned Reg = MO.getReg();
- bool isKill = MO.isDef() ? false : MO.isKill();
- unsigned Base = MBBI->getOperand(1).getReg();
+ unsigned Base = getLoadStoreBaseOp(*MBBI).getReg();
unsigned PredReg = 0;
ARMCC::CondCodes Pred = getInstrPredicate(MBBI, PredReg);
int Offset = getMemoryOpOffset(MBBI);
- // Watch out for:
- // r4 := ldr [r5]
- // r5 := ldr [r5, #4]
- // r6 := ldr [r5, #8]
- //
- // The second ldr has effectively broken the chain even though it
- // looks like the later ldr(s) use the same base register. Try to
- // merge the ldr's so far, including this one. But don't try to
- // combine the following ldr(s).
- bool Clobber = isi32Load(Opcode) && Base == MBBI->getOperand(0).getReg();
-
- // Watch out for:
- // r4 := ldr [r0, #8]
- // r4 := ldr [r0, #4]
- //
- // The optimization may reorder the second ldr in front of the first
- // ldr, which violates write after write(WAW) dependence. The same as
- // str. Try to merge inst(s) already in MemOps.
- bool Overlap = false;
- for (MemOpQueueIter I = MemOps.begin(), E = MemOps.end(); I != E; ++I) {
- if (TRI->regsOverlap(Reg, I->MBBI->getOperand(0).getReg())) {
- Overlap = true;
- break;
- }
- }
-
- if (CurrBase == 0 && !Clobber) {
+ if (CurrBase == 0) {
// Start of a new chain.
CurrBase = Base;
CurrOpc = Opcode;
- CurrSize = Size;
CurrPred = Pred;
- CurrPredReg = PredReg;
- MemOps.push_back(MemOpQueueEntry(Offset, Reg, isKill, Position, MBBI));
- ++NumMemOps;
- Advance = true;
- } else if (!Overlap) {
- if (Clobber) {
- TryMerge = true;
- Advance = true;
+ MemOps.push_back(MemOpQueueEntry(MBBI, Offset, Position));
+ continue;
+ }
+ // Note: No need to match PredReg in the next if.
+ if (CurrOpc == Opcode && CurrBase == Base && CurrPred == Pred) {
+ // Watch out for:
+ // r4 := ldr [r0, #8]
+ // r4 := ldr [r0, #4]
+ // or
+ // r0 := ldr [r0]
+ // If a load overrides the base register or a register loaded by
+ // another load in our chain, we cannot take this instruction.
+ bool Overlap = false;
+ if (isLoadSingle(Opcode)) {
+ Overlap = (Base == Reg);
+ if (!Overlap) {
+ for (const MemOpQueueEntry &E : MemOps) {
+ if (TRI->regsOverlap(Reg, E.MI->getOperand(0).getReg())) {
+ Overlap = true;
+ break;
+ }
+ }
+ }
}
- if (CurrOpc == Opcode && CurrBase == Base && CurrPred == Pred) {
- // No need to match PredReg.
- // Continue adding to the queue.
+ if (!Overlap) {
+ // Check offset and sort memory operation into the current chain.
if (Offset > MemOps.back().Offset) {
- MemOps.push_back(MemOpQueueEntry(Offset, Reg, isKill,
- Position, MBBI));
- ++NumMemOps;
- Advance = true;
+ MemOps.push_back(MemOpQueueEntry(MBBI, Offset, Position));
+ continue;
} else {
- for (MemOpQueueIter I = MemOps.begin(), E = MemOps.end();
- I != E; ++I) {
- if (Offset < I->Offset) {
- MemOps.insert(I, MemOpQueueEntry(Offset, Reg, isKill,
- Position, MBBI));
- ++NumMemOps;
- Advance = true;
+ MemOpQueue::iterator MI, ME;
+ for (MI = MemOps.begin(), ME = MemOps.end(); MI != ME; ++MI) {
+ if (Offset < MI->Offset) {
+ // Found a place to insert.
break;
- } else if (Offset == I->Offset) {
- // Collision! This can't be merged!
+ }
+ if (Offset == MI->Offset) {
+ // Collision, abort.
+ MI = ME;
break;
}
}
+ if (MI != MemOps.end()) {
+ MemOps.insert(MI, MemOpQueueEntry(MBBI, Offset, Position));
+ continue;
+ }
}
}
}
- }
- if (MBBI->isDebugValue()) {
- ++MBBI;
- if (MBBI == E)
- // Reach the end of the block, try merging the memory instructions.
- TryMerge = true;
- } else if (Advance) {
- ++Position;
- ++MBBI;
- if (MBBI == E)
- // Reach the end of the block, try merging the memory instructions.
- TryMerge = true;
- } else {
- TryMerge = true;
+ // Don't advance the iterator; the op will start a new chain next.
+ MBBI = I;
+ --Position;
+ // Fallthrough to look into existing chain.
+ } else if (MBBI->isDebugValue()) {
+ continue;
+ } else if (MBBI->getOpcode() == ARM::t2LDRDi8 ||
+ MBBI->getOpcode() == ARM::t2STRDi8) {
+ // ARMPreAllocLoadStoreOpt has already formed some LDRD/STRD instructions;
+ // remember them because we may still be able to merge add/sub into them.
+ MergeBaseCandidates.push_back(MBBI);
}
- if (TryMerge) {
- if (NumMemOps > 1) {
- // Try to find a free register to use as a new base in case it's needed.
- // First advance to the instruction just before the start of the chain.
- AdvanceRS(MBB, MemOps);
-
- // Find a scratch register.
- unsigned Scratch =
- RS->FindUnusedReg(isThumb1 ? &ARM::tGPRRegClass : &ARM::GPRRegClass);
-
- // Process the load / store instructions.
- RS->forward(std::prev(MBBI));
-
- // Merge ops.
- Merges.clear();
- MergeLDR_STR(MBB, 0, CurrBase, CurrOpc, CurrSize,
- CurrPred, CurrPredReg, Scratch, MemOps, Merges);
-
- // Try folding preceding/trailing base inc/dec into the generated
- // LDM/STM ops.
- for (unsigned i = 0, e = Merges.size(); i < e; ++i)
- if (MergeBaseUpdateLSMultiple(MBB, Merges[i], Advance, MBBI))
- ++NumMerges;
- NumMerges += Merges.size();
-
- // Try folding preceding/trailing base inc/dec into those load/store
- // that were not merged to form LDM/STM ops.
- for (unsigned i = 0; i != NumMemOps; ++i)
- if (!MemOps[i].Merged)
- if (MergeBaseUpdateLoadStore(MBB, MemOps[i].MBBI, TII,Advance,MBBI))
- ++NumMerges;
-
- // RS may be pointing to an instruction that's deleted.
- RS->skipTo(std::prev(MBBI));
- } else if (NumMemOps == 1) {
- // Try folding preceding/trailing base inc/dec into the single
- // load/store.
- if (MergeBaseUpdateLoadStore(MBB, MemOps[0].MBBI, TII, Advance, MBBI)) {
- ++NumMerges;
- RS->forward(std::prev(MBBI));
- }
- }
+ // If we are here then the chain is broken; extract candidates for a merge.
+ if (MemOps.size() > 0) {
+ FormCandidates(MemOps);
+ // Reset for the next chain.
CurrBase = 0;
CurrOpc = ~0u;
- CurrSize = 0;
CurrPred = ARMCC::AL;
- CurrPredReg = 0;
- if (NumMemOps) {
- MemOps.clear();
- NumMemOps = 0;
- }
+ MemOps.clear();
+ }
+ }
+ if (MemOps.size() > 0)
+ FormCandidates(MemOps);
- // If iterator hasn't been advanced and this is not a memory op, skip it.
- // It can't start a new chain anyway.
- if (!Advance && !isMemOp && MBBI != E) {
- ++Position;
- ++MBBI;
+ // Sort candidates so they get processed from the end to the beginning of the
+ // basic block later; this is necessary for the liveness calculation.
+ auto LessThan = [](const MergeCandidate *M0, const MergeCandidate *M1) {
+ return M0->InsertPos < M1->InsertPos;
+ };
+ std::sort(Candidates.begin(), Candidates.end(), LessThan);
+
+ // Go through list of candidates and merge.
+ bool Changed = false;
+ for (const MergeCandidate *Candidate : Candidates) {
+ if (Candidate->CanMergeToLSMulti || Candidate->CanMergeToLSDouble) {
+ MachineInstr *Merged = MergeOpsUpdate(*Candidate);
+ // Merge preceding/trailing base inc/dec into the merged op.
+ if (Merged) {
+ Changed = true;
+ unsigned Opcode = Merged->getOpcode();
+ if (Opcode == ARM::t2STRDi8 || Opcode == ARM::t2LDRDi8)
+ MergeBaseUpdateLSDouble(*Merged);
+ else
+ MergeBaseUpdateLSMultiple(Merged);
+ } else {
+ for (MachineInstr *MI : Candidate->Instrs) {
+ if (MergeBaseUpdateLoadStore(MI))
+ Changed = true;
+ }
}
+ } else {
+ assert(Candidate->Instrs.size() == 1);
+ if (MergeBaseUpdateLoadStore(Candidate->Instrs.front()))
+ Changed = true;
}
}
- return NumMerges > 0;
+ Candidates.clear();
+ // Try to fold add/sub into the LDRD/STRD formed by ARMPreAllocLoadStoreOpt.
+ for (MachineInstr *MI : MergeBaseCandidates)
+ MergeBaseUpdateLSDouble(*MI);
+ MergeBaseCandidates.clear();
+
+ return Changed;
}
/// If this is an exit BB, try merging the return ops ("bx lr" and "mov pc, lr")
@@ -1814,12 +1803,14 @@ bool ARMLoadStoreOpt::MergeReturnIntoLDM(MachineBasicBlock &MBB) {
}
bool ARMLoadStoreOpt::runOnMachineFunction(MachineFunction &Fn) {
+ MF = &Fn;
STI = &static_cast<const ARMSubtarget &>(Fn.getSubtarget());
TL = STI->getTargetLowering();
AFI = Fn.getInfo<ARMFunctionInfo>();
TII = STI->getInstrInfo();
TRI = STI->getRegisterInfo();
- RS = new RegScavenger();
+ MRI = &Fn.getRegInfo();
+ RegClassInfoValid = false;
isThumb2 = AFI->isThumb2Function();
isThumb1 = AFI->isThumbFunction() && !isThumb2;
@@ -1832,7 +1823,7 @@ bool ARMLoadStoreOpt::runOnMachineFunction(MachineFunction &Fn) {
Modified |= MergeReturnIntoLDM(MBB);
}
- delete RS;
+ Allocator.DestroyAll();
return Modified;
}
@@ -2219,7 +2210,7 @@ ARMPreAllocLoadStoreOpt::RescheduleLoadStoreInstrs(MachineBasicBlock *MBB) {
continue;
int Opc = MI->getOpcode();
- bool isLd = isi32Load(Opc) || Opc == ARM::VLDRS || Opc == ARM::VLDRD;
+ bool isLd = isLoadSingle(Opc);
unsigned Base = MI->getOperand(1).getReg();
int Offset = getMemoryOpOffset(MI);
diff --git a/lib/Target/ARM/ARMSelectionDAGInfo.cpp b/lib/Target/ARM/ARMSelectionDAGInfo.cpp
index a59cf9851108..6cafbbb9f8eb 100644
--- a/lib/Target/ARM/ARMSelectionDAGInfo.cpp
+++ b/lib/Target/ARM/ARMSelectionDAGInfo.cpp
@@ -18,12 +18,6 @@ using namespace llvm;
#define DEBUG_TYPE "arm-selectiondag-info"
-ARMSelectionDAGInfo::ARMSelectionDAGInfo(const DataLayout &DL)
- : TargetSelectionDAGInfo(&DL) {}
-
-ARMSelectionDAGInfo::~ARMSelectionDAGInfo() {
-}
-
// Emit, if possible, a specialized version of the given Libcall. Typically this
// means selecting the appropriately aligned version, but we also convert memset
// of 0 into memclr.
@@ -83,7 +77,7 @@ EmitSpecializedLibcall(SelectionDAG &DAG, SDLoc dl,
TargetLowering::ArgListTy Args;
TargetLowering::ArgListEntry Entry;
- Entry.Ty = TLI->getDataLayout()->getIntPtrType(*DAG.getContext());
+ Entry.Ty = DAG.getDataLayout().getIntPtrType(*DAG.getContext());
Entry.Node = Dst;
Args.push_back(Entry);
if (AEABILibcall == AEABI_MEMCLR) {
@@ -121,12 +115,14 @@ EmitSpecializedLibcall(SelectionDAG &DAG, SDLoc dl,
{ "__aeabi_memclr", "__aeabi_memclr4", "__aeabi_memclr8" }
};
TargetLowering::CallLoweringInfo CLI(DAG);
- CLI.setDebugLoc(dl).setChain(Chain)
- .setCallee(TLI->getLibcallCallingConv(LC),
- Type::getVoidTy(*DAG.getContext()),
- DAG.getExternalSymbol(FunctionNames[AEABILibcall][AlignVariant],
- TLI->getPointerTy()), std::move(Args), 0)
- .setDiscardResult();
+ CLI.setDebugLoc(dl)
+ .setChain(Chain)
+ .setCallee(
+ TLI->getLibcallCallingConv(LC), Type::getVoidTy(*DAG.getContext()),
+ DAG.getExternalSymbol(FunctionNames[AEABILibcall][AlignVariant],
+ TLI->getPointerTy(DAG.getDataLayout())),
+ std::move(Args), 0)
+ .setDiscardResult();
std::pair<SDValue,SDValue> CallResult = TLI->LowerCallTo(CLI);
return CallResult.second;
diff --git a/lib/Target/ARM/ARMSelectionDAGInfo.h b/lib/Target/ARM/ARMSelectionDAGInfo.h
index 1db190f41e1a..289879ee1d7e 100644
--- a/lib/Target/ARM/ARMSelectionDAGInfo.h
+++ b/lib/Target/ARM/ARMSelectionDAGInfo.h
@@ -37,8 +37,6 @@ namespace ARM_AM {
class ARMSelectionDAGInfo : public TargetSelectionDAGInfo {
public:
- explicit ARMSelectionDAGInfo(const DataLayout &DL);
- ~ARMSelectionDAGInfo();
SDValue EmitTargetCodeForMemcpy(SelectionDAG &DAG, SDLoc dl,
SDValue Chain,
diff --git a/lib/Target/ARM/ARMSubtarget.cpp b/lib/Target/ARM/ARMSubtarget.cpp
index 55808dfb9efe..002c3e9b6291 100644
--- a/lib/Target/ARM/ARMSubtarget.cpp
+++ b/lib/Target/ARM/ARMSubtarget.cpp
@@ -112,7 +112,6 @@ ARMSubtarget::ARMSubtarget(const Triple &TT, const std::string &CPU,
: ARMGenSubtargetInfo(TT, CPU, FS), ARMProcFamily(Others),
ARMProcClass(None), stackAlignment(4), CPUString(CPU), IsLittle(IsLittle),
TargetTriple(TT), Options(TM.Options), TM(TM),
- TSInfo(*TM.getDataLayout()),
FrameLowering(initializeFrameLowering(CPU, FS)),
// At this point initializeSubtargetDependencies has been called so
// we can query directly.
@@ -172,6 +171,7 @@ void ARMSubtarget::initializeEnvironment() {
AllowsUnalignedMem = false;
Thumb2DSP = false;
UseNaClTrap = false;
+ GenLongCalls = false;
UnsafeFPMath = false;
}
@@ -286,7 +286,7 @@ ARMSubtarget::GVIsIndirectSymbol(const GlobalValue *GV,
if (RelocM == Reloc::Static)
return false;
- bool isDecl = GV->isDeclarationForLinker();
+ bool isDef = GV->isStrongDefinitionForLinker();
if (!isTargetMachO()) {
// Extra load is needed for all externally visible.
@@ -294,34 +294,22 @@ ARMSubtarget::GVIsIndirectSymbol(const GlobalValue *GV,
return false;
return true;
} else {
- if (RelocM == Reloc::PIC_) {
- // If this is a strong reference to a definition, it is definitely not
- // through a stub.
- if (!isDecl && !GV->isWeakForLinker())
- return false;
-
- // Unless we have a symbol with hidden visibility, we have to go through a
- // normal $non_lazy_ptr stub because this symbol might be resolved late.
- if (!GV->hasHiddenVisibility()) // Non-hidden $non_lazy_ptr reference.
- return true;
+ // If this is a strong reference to a definition, it is definitely not
+ // through a stub.
+ if (isDef)
+ return false;
+
+ // Unless we have a symbol with hidden visibility, we have to go through a
+ // normal $non_lazy_ptr stub because this symbol might be resolved late.
+ if (!GV->hasHiddenVisibility()) // Non-hidden $non_lazy_ptr reference.
+ return true;
+ if (RelocM == Reloc::PIC_) {
// If symbol visibility is hidden, we have a stub for common symbol
// references and external declarations.
- if (isDecl || GV->hasCommonLinkage())
+ if (GV->isDeclarationForLinker() || GV->hasCommonLinkage())
// Hidden $non_lazy_ptr reference.
return true;
-
- return false;
- } else {
- // If this is a strong reference to a definition, it is definitely not
- // through a stub.
- if (!isDecl && !GV->isWeakForLinker())
- return false;
-
- // Unless we have a symbol with hidden visibility, we have to go through a
- // normal $non_lazy_ptr stub because this symbol might be resolved late.
- if (!GV->hasHiddenVisibility()) // Non-hidden $non_lazy_ptr reference.
- return true;
}
}
diff --git a/lib/Target/ARM/ARMSubtarget.h b/lib/Target/ARM/ARMSubtarget.h
index 9909a6a6d198..dd101df9b63d 100644
--- a/lib/Target/ARM/ARMSubtarget.h
+++ b/lib/Target/ARM/ARMSubtarget.h
@@ -206,6 +206,9 @@ protected:
/// NaCl TRAP instruction is generated instead of the regular TRAP.
bool UseNaClTrap;
+ /// Generate calls via indirect call instructions.
+ bool GenLongCalls;
+
/// Target machine allowed unsafe FP math (such as use of NEON fp)
bool UnsafeFPMath;
@@ -342,6 +345,7 @@ public:
bool hasMPExtension() const { return HasMPExtension; }
bool hasThumb2DSP() const { return Thumb2DSP; }
bool useNaClTrap() const { return UseNaClTrap; }
+ bool genLongCalls() const { return GenLongCalls; }
bool hasFP16() const { return HasFP16; }
bool hasD16() const { return HasD16; }
diff --git a/lib/Target/ARM/ARMTargetMachine.cpp b/lib/Target/ARM/ARMTargetMachine.cpp
index 6e81bd2d349d..93495d66ae70 100644
--- a/lib/Target/ARM/ARMTargetMachine.cpp
+++ b/lib/Target/ARM/ARMTargetMachine.cpp
@@ -80,8 +80,7 @@ computeTargetABI(const Triple &TT, StringRef CPU,
// FIXME: This is duplicated code from the front end and should be unified.
if (TT.isOSBinFormatMachO()) {
if (TT.getEnvironment() == llvm::Triple::EABI ||
- (TT.getOS() == llvm::Triple::UnknownOS &&
- TT.getObjectFormat() == llvm::Triple::MachO) ||
+ (TT.getOS() == llvm::Triple::UnknownOS && TT.isOSBinFormatMachO()) ||
CPU.startswith("cortex-m")) {
TargetABI = ARMBaseTargetMachine::ARM_ABI_AAPCS;
} else {
@@ -104,8 +103,8 @@ computeTargetABI(const Triple &TT, StringRef CPU,
TargetABI = ARMBaseTargetMachine::ARM_ABI_APCS;
break;
default:
- if (TT.getOS() == llvm::Triple::NetBSD)
- TargetABI = ARMBaseTargetMachine::ARM_ABI_APCS;
+ if (TT.isOSNetBSD())
+ TargetABI = ARMBaseTargetMachine::ARM_ABI_APCS;
else
TargetABI = ARMBaseTargetMachine::ARM_ABI_AAPCS;
break;
diff --git a/lib/Target/ARM/ARMTargetTransformInfo.cpp b/lib/Target/ARM/ARMTargetTransformInfo.cpp
index f4901fc24e44..2f194cf7ae06 100644
--- a/lib/Target/ARM/ARMTargetTransformInfo.cpp
+++ b/lib/Target/ARM/ARMTargetTransformInfo.cpp
@@ -61,14 +61,14 @@ unsigned ARMTTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src) {
if (Src->isVectorTy() && ST->hasNEON() && (ISD == ISD::FP_ROUND ||
ISD == ISD::FP_EXTEND)) {
- std::pair<unsigned, MVT> LT = TLI->getTypeLegalizationCost(Src);
+ std::pair<unsigned, MVT> LT = TLI->getTypeLegalizationCost(DL, Src);
int Idx = CostTableLookup(NEONFltDblTbl, ISD, LT.second);
if (Idx != -1)
return LT.first * NEONFltDblTbl[Idx].Cost;
}
- EVT SrcTy = TLI->getValueType(Src);
- EVT DstTy = TLI->getValueType(Dst);
+ EVT SrcTy = TLI->getValueType(DL, Src);
+ EVT DstTy = TLI->getValueType(DL, Dst);
if (!SrcTy.isSimple() || !DstTy.isSimple())
return BaseT::getCastInstrCost(Opcode, Dst, Src);
@@ -282,8 +282,8 @@ unsigned ARMTTIImpl::getCmpSelInstrCost(unsigned Opcode, Type *ValTy,
{ ISD::SELECT, MVT::v16i1, MVT::v16i64, 100 }
};
- EVT SelCondTy = TLI->getValueType(CondTy);
- EVT SelValTy = TLI->getValueType(ValTy);
+ EVT SelCondTy = TLI->getValueType(DL, CondTy);
+ EVT SelValTy = TLI->getValueType(DL, ValTy);
if (SelCondTy.isSimple() && SelValTy.isSimple()) {
int Idx = ConvertCostTableLookup(NEONVectorSelectTbl, ISD,
SelCondTy.getSimpleVT(),
@@ -292,7 +292,7 @@ unsigned ARMTTIImpl::getCmpSelInstrCost(unsigned Opcode, Type *ValTy,
return NEONVectorSelectTbl[Idx].Cost;
}
- std::pair<unsigned, MVT> LT = TLI->getTypeLegalizationCost(ValTy);
+ std::pair<unsigned, MVT> LT = TLI->getTypeLegalizationCost(DL, ValTy);
return LT.first;
}
@@ -353,7 +353,7 @@ unsigned ARMTTIImpl::getShuffleCost(TTI::ShuffleKind Kind, Type *Tp, int Index,
{ISD::VECTOR_SHUFFLE, MVT::v8i16, 2},
{ISD::VECTOR_SHUFFLE, MVT::v16i8, 2}};
- std::pair<unsigned, MVT> LT = TLI->getTypeLegalizationCost(Tp);
+ std::pair<unsigned, MVT> LT = TLI->getTypeLegalizationCost(DL, Tp);
int Idx = CostTableLookup(NEONShuffleTbl, ISD::VECTOR_SHUFFLE, LT.second);
if (Idx == -1)
@@ -379,7 +379,7 @@ unsigned ARMTTIImpl::getShuffleCost(TTI::ShuffleKind Kind, Type *Tp, int Index,
{ISD::VECTOR_SHUFFLE, MVT::v16i8, 32}};
- std::pair<unsigned, MVT> LT = TLI->getTypeLegalizationCost(Tp);
+ std::pair<unsigned, MVT> LT = TLI->getTypeLegalizationCost(DL, Tp);
int Idx =
CostTableLookup(NEONAltShuffleTbl, ISD::VECTOR_SHUFFLE, LT.second);
if (Idx == -1)
@@ -395,7 +395,7 @@ unsigned ARMTTIImpl::getArithmeticInstrCost(
TTI::OperandValueProperties Opd2PropInfo) {
int ISDOpcode = TLI->InstructionOpcodeToISD(Opcode);
- std::pair<unsigned, MVT> LT = TLI->getTypeLegalizationCost(Ty);
+ std::pair<unsigned, MVT> LT = TLI->getTypeLegalizationCost(DL, Ty);
const unsigned FunctionCallDivCost = 20;
const unsigned ReciprocalDivCost = 10;
@@ -468,7 +468,7 @@ unsigned ARMTTIImpl::getArithmeticInstrCost(
unsigned ARMTTIImpl::getMemoryOpCost(unsigned Opcode, Type *Src,
unsigned Alignment,
unsigned AddressSpace) {
- std::pair<unsigned, MVT> LT = TLI->getTypeLegalizationCost(Src);
+ std::pair<unsigned, MVT> LT = TLI->getTypeLegalizationCost(DL, Src);
if (Src->isVectorTy() && Alignment != 16 &&
Src->getVectorElementType()->isDoubleTy()) {
@@ -488,12 +488,12 @@ unsigned ARMTTIImpl::getInterleavedMemoryOpCost(unsigned Opcode, Type *VecTy,
assert(isa<VectorType>(VecTy) && "Expect a vector type");
// vldN/vstN don't support vector types with i64/f64 elements.
- bool EltIs64Bits = DL->getTypeAllocSizeInBits(VecTy->getScalarType()) == 64;
+ bool EltIs64Bits = DL.getTypeAllocSizeInBits(VecTy->getScalarType()) == 64;
if (Factor <= TLI->getMaxSupportedInterleaveFactor() && !EltIs64Bits) {
unsigned NumElts = VecTy->getVectorNumElements();
Type *SubVecTy = VectorType::get(VecTy->getScalarType(), NumElts / Factor);
- unsigned SubVecSize = TLI->getDataLayout()->getTypeAllocSize(SubVecTy);
+ unsigned SubVecSize = DL.getTypeAllocSize(SubVecTy);
// vldN/vstN only support legal vector types of size 64 or 128 in bits.
if (NumElts % Factor == 0 && (SubVecSize == 64 || SubVecSize == 128))
diff --git a/lib/Target/ARM/ARMTargetTransformInfo.h b/lib/Target/ARM/ARMTargetTransformInfo.h
index f2e5db655ccf..84f256f73722 100644
--- a/lib/Target/ARM/ARMTargetTransformInfo.h
+++ b/lib/Target/ARM/ARMTargetTransformInfo.h
@@ -42,7 +42,8 @@ class ARMTTIImpl : public BasicTTIImplBase<ARMTTIImpl> {
public:
explicit ARMTTIImpl(const ARMBaseTargetMachine *TM, Function &F)
- : BaseT(TM), ST(TM->getSubtargetImpl(F)), TLI(ST->getTargetLowering()) {}
+ : BaseT(TM, F.getParent()->getDataLayout()), ST(TM->getSubtargetImpl(F)),
+ TLI(ST->getTargetLowering()) {}
// Provide value semantics. MSVC requires that we spell all of these out.
ARMTTIImpl(const ARMTTIImpl &Arg)
@@ -50,18 +51,6 @@ public:
ARMTTIImpl(ARMTTIImpl &&Arg)
: BaseT(std::move(static_cast<BaseT &>(Arg))), ST(std::move(Arg.ST)),
TLI(std::move(Arg.TLI)) {}
- ARMTTIImpl &operator=(const ARMTTIImpl &RHS) {
- BaseT::operator=(static_cast<const BaseT &>(RHS));
- ST = RHS.ST;
- TLI = RHS.TLI;
- return *this;
- }
- ARMTTIImpl &operator=(ARMTTIImpl &&RHS) {
- BaseT::operator=(std::move(static_cast<BaseT &>(RHS)));
- ST = std::move(RHS.ST);
- TLI = std::move(RHS.TLI);
- return *this;
- }
/// \name Scalar TTI Implementations
/// @{
diff --git a/lib/Target/ARM/AsmParser/ARMAsmParser.cpp b/lib/Target/ARM/AsmParser/ARMAsmParser.cpp
index c2db74619871..f8f0eb2d4baa 100644
--- a/lib/Target/ARM/AsmParser/ARMAsmParser.cpp
+++ b/lib/Target/ARM/AsmParser/ARMAsmParser.cpp
@@ -189,9 +189,9 @@ class ARMAsmParser : public MCTargetAsmParser {
return getParser().Error(L, Msg, Ranges);
}
- bool validatetLDMRegList(MCInst Inst, const OperandVector &Operands,
+ bool validatetLDMRegList(const MCInst &Inst, const OperandVector &Operands,
unsigned ListNo, bool IsARPop = false);
- bool validatetSTMRegList(MCInst Inst, const OperandVector &Operands,
+ bool validatetSTMRegList(const MCInst &Inst, const OperandVector &Operands,
unsigned ListNo);
int tryParseRegister();
@@ -242,6 +242,8 @@ class ARMAsmParser : public MCTargetAsmParser {
bool &CanAcceptCarrySet,
bool &CanAcceptPredicationCode);
+ void tryConvertingToTwoOperandForm(StringRef Mnemonic, bool CarrySetting,
+ OperandVector &Operands);
bool isThumb() const {
// FIXME: Can tablegen auto-generate this?
return STI.getFeatureBits()[ARM::ModeThumb];
@@ -5465,6 +5467,92 @@ void ARMAsmParser::getMnemonicAcceptInfo(StringRef Mnemonic, StringRef FullInst,
CanAcceptPredicationCode = true;
}
+// \brief Some Thumb instructions have two-operand forms that are not
+// available as three-operand forms; convert to the two-operand form if possible.
+//
+// FIXME: We would really like to be able to tablegen'erate this.
+void ARMAsmParser::tryConvertingToTwoOperandForm(StringRef Mnemonic,
+ bool CarrySetting,
+ OperandVector &Operands) {
+ if (Operands.size() != 6)
+ return;
+
+ const auto &Op3 = static_cast<ARMOperand &>(*Operands[3]);
+ auto &Op4 = static_cast<ARMOperand &>(*Operands[4]);
+ if (!Op3.isReg() || !Op4.isReg())
+ return;
+
+ auto Op3Reg = Op3.getReg();
+ auto Op4Reg = Op4.getReg();
+
+ // For most Thumb2 cases we just generate the 3 operand form and reduce
+ // it in processInstruction(), but the 3 operand form of ADD (t2ADDrr)
+ // won't accept SP or PC, so we do the transformation here, taking care
+ // with the immediate range in the 'add sp, sp, #imm' case.
+ auto &Op5 = static_cast<ARMOperand &>(*Operands[5]);
+ if (isThumbTwo()) {
+ if (Mnemonic != "add")
+ return;
+ bool TryTransform = Op3Reg == ARM::PC || Op4Reg == ARM::PC ||
+ (Op5.isReg() && Op5.getReg() == ARM::PC);
+ if (!TryTransform) {
+ TryTransform = (Op3Reg == ARM::SP || Op4Reg == ARM::SP ||
+ (Op5.isReg() && Op5.getReg() == ARM::SP)) &&
+ !(Op3Reg == ARM::SP && Op4Reg == ARM::SP &&
+ Op5.isImm() && !Op5.isImm0_508s4());
+ }
+ if (!TryTransform)
+ return;
+ } else if (!isThumbOne())
+ return;
+
+ if (!(Mnemonic == "add" || Mnemonic == "sub" || Mnemonic == "and" ||
+ Mnemonic == "eor" || Mnemonic == "lsl" || Mnemonic == "lsr" ||
+ Mnemonic == "asr" || Mnemonic == "adc" || Mnemonic == "sbc" ||
+ Mnemonic == "ror" || Mnemonic == "orr" || Mnemonic == "bic"))
+ return;
+
+ // If the first 2 operands of a 3 operand instruction are the same,
+ // then transform to the 2 operand version of the same instruction,
+ // e.g. 'adds r0, r0, #1' transforms to 'adds r0, #1'.
+ bool Transform = Op3Reg == Op4Reg;
+
+ // For commutative operations, we might be able to transform if we swap
+ // Op4 and Op5. The 'ADD Rdm, SP, Rdm' form is already handled specially
+ // as tADDrsp.
+ const ARMOperand *LastOp = &Op5;
+ bool Swap = false;
+ if (!Transform && Op5.isReg() && Op3Reg == Op5.getReg() &&
+ ((Mnemonic == "add" && Op4Reg != ARM::SP) ||
+ Mnemonic == "and" || Mnemonic == "eor" ||
+ Mnemonic == "adc" || Mnemonic == "orr")) {
+ Swap = true;
+ LastOp = &Op4;
+ Transform = true;
+ }
+
+ // If both registers are the same then remove one of them from
+ // the operand list, with certain exceptions.
+ if (Transform) {
+ // Don't transform 'adds Rd, Rd, Rm' or 'sub{s} Rd, Rd, Rm' because the
+ // 2 operand forms don't exist.
+ if (((Mnemonic == "add" && CarrySetting) || Mnemonic == "sub") &&
+ LastOp->isReg())
+ Transform = false;
+
+ // Don't transform 'add/sub{s} Rd, Rd, #imm' if the immediate fits into
+ // 3 bits, because the ARMARM says not to.
+ if ((Mnemonic == "add" || Mnemonic == "sub") && LastOp->isImm0_7())
+ Transform = false;
+ }
+
+ if (Transform) {
+ if (Swap)
+ std::swap(Op4, Op5);
+ Operands.erase(Operands.begin() + 3);
+ }
+}
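Editorial aside: the decision above is easier to see stripped of the parser plumbing. The sketch below is not part of the patch; canCollapse, Rd, Rn, Rm and Commutative are invented stand-ins for the ARMOperand queries, and the SP/PC and immediate-range special cases handled above are deliberately omitted.

    // Minimal sketch of the collapse decision for three register operands.
    struct TwoOperandDecision {
      bool Transform; // Use the two-operand encoding.
      bool Swap;      // Swap the two sources first (commutative case).
    };

    static TwoOperandDecision canCollapse(unsigned Rd, unsigned Rn, unsigned Rm,
                                          bool Commutative) {
      // 'add r0, r0, r1' -> 'add r0, r1': destination matches the first source.
      if (Rd == Rn)
        return {true, false};
      // 'and r0, r1, r0' -> 'and r0, r1': commutative, so swap the sources and
      // then drop the duplicate.
      if (Commutative && Rd == Rm)
        return {true, true};
      return {false, false};
    }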
+
bool ARMAsmParser::shouldOmitCCOutOperand(StringRef Mnemonic,
OperandVector &Operands) {
// FIXME: This is all horribly hacky. We really need a better way to deal
@@ -5838,6 +5926,8 @@ bool ARMAsmParser::ParseInstruction(ParseInstructionInfo &Info, StringRef Name,
"VFP/Neon double precision register expected");
}
+ tryConvertingToTwoOperandForm(Mnemonic, CarrySetting, Operands);
+
// Some instructions, mostly Thumb, have forms for the same mnemonic that
// do and don't have a cc_out optional-def operand. With some spot-checks
// of the operand list, we can figure out which variant we're trying to
@@ -5901,48 +5991,6 @@ bool ARMAsmParser::ParseInstruction(ParseInstructionInfo &Info, StringRef Name,
}
}
- // If first 2 operands of a 3 operand instruction are the same
- // then transform to 2 operand version of the same instruction
- // e.g. 'adds r0, r0, #1' transforms to 'adds r0, #1'
- // FIXME: We would really like to be able to tablegen'erate this.
- if (isThumbOne() && Operands.size() == 6 &&
- (Mnemonic == "add" || Mnemonic == "sub" || Mnemonic == "and" ||
- Mnemonic == "eor" || Mnemonic == "lsl" || Mnemonic == "lsr" ||
- Mnemonic == "asr" || Mnemonic == "adc" || Mnemonic == "sbc" ||
- Mnemonic == "ror" || Mnemonic == "orr" || Mnemonic == "bic")) {
- ARMOperand &Op3 = static_cast<ARMOperand &>(*Operands[3]);
- ARMOperand &Op4 = static_cast<ARMOperand &>(*Operands[4]);
- ARMOperand &Op5 = static_cast<ARMOperand &>(*Operands[5]);
-
- // If both registers are the same then remove one of them from
- // the operand list.
- if (Op3.isReg() && Op4.isReg() && Op3.getReg() == Op4.getReg()) {
- // If 3rd operand (variable Op5) is a register and the instruction is adds/sub
- // then do not transform as the backend already handles this instruction
- // correctly.
- if (!Op5.isReg() || !((Mnemonic == "add" && CarrySetting) || Mnemonic == "sub")) {
- Operands.erase(Operands.begin() + 3);
- if (Mnemonic == "add" && !CarrySetting) {
- // Special case for 'add' (not 'adds') instruction must
- // remove the CCOut operand as well.
- Operands.erase(Operands.begin() + 1);
- }
- }
- }
- }
-
- // If instruction is 'add' and first two register operands
- // use SP register, then remove one of the SP registers from
- // the instruction.
- // FIXME: We would really like to be able to tablegen'erate this.
- if (isThumbOne() && Operands.size() == 5 && Mnemonic == "add" && !CarrySetting) {
- ARMOperand &Op2 = static_cast<ARMOperand &>(*Operands[2]);
- ARMOperand &Op3 = static_cast<ARMOperand &>(*Operands[3]);
- if (Op2.isReg() && Op3.isReg() && Op2.getReg() == ARM::SP && Op3.getReg() == ARM::SP) {
- Operands.erase(Operands.begin() + 2);
- }
- }
-
// GNU Assembler extension (compatibility)
if ((Mnemonic == "ldrd" || Mnemonic == "strd")) {
ARMOperand &Op2 = static_cast<ARMOperand &>(*Operands[2]);
@@ -5985,8 +6033,9 @@ bool ARMAsmParser::ParseInstruction(ParseInstructionInfo &Info, StringRef Name,
// return 'true' if register list contains non-low GPR registers,
// 'false' otherwise. If Reg is in the register list or is HiReg, set
// 'containsReg' to true.
-static bool checkLowRegisterList(MCInst Inst, unsigned OpNo, unsigned Reg,
- unsigned HiReg, bool &containsReg) {
+static bool checkLowRegisterList(const MCInst &Inst, unsigned OpNo,
+ unsigned Reg, unsigned HiReg,
+ bool &containsReg) {
containsReg = false;
for (unsigned i = OpNo; i < Inst.getNumOperands(); ++i) {
unsigned OpReg = Inst.getOperand(i).getReg();
@@ -6001,8 +6050,8 @@ static bool checkLowRegisterList(MCInst Inst, unsigned OpNo, unsigned Reg,
// Check if the specified register is in the register list of the inst,
// starting at the indicated operand number.
-static bool listContainsReg(MCInst &Inst, unsigned OpNo, unsigned Reg) {
- for (unsigned i = OpNo; i < Inst.getNumOperands(); ++i) {
+static bool listContainsReg(const MCInst &Inst, unsigned OpNo, unsigned Reg) {
+ for (unsigned i = OpNo, e = Inst.getNumOperands(); i < e; ++i) {
unsigned OpReg = Inst.getOperand(i).getReg();
if (OpReg == Reg)
return true;
@@ -6020,7 +6069,7 @@ static bool instIsBreakpoint(const MCInst &Inst) {
}
-bool ARMAsmParser::validatetLDMRegList(MCInst Inst,
+bool ARMAsmParser::validatetLDMRegList(const MCInst &Inst,
const OperandVector &Operands,
unsigned ListNo, bool IsARPop) {
const ARMOperand &Op = static_cast<const ARMOperand &>(*Operands[ListNo]);
@@ -6043,7 +6092,7 @@ bool ARMAsmParser::validatetLDMRegList(MCInst Inst,
return false;
}
-bool ARMAsmParser::validatetSTMRegList(MCInst Inst,
+bool ARMAsmParser::validatetSTMRegList(const MCInst &Inst,
const OperandVector &Operands,
unsigned ListNo) {
const ARMOperand &Op = static_cast<const ARMOperand &>(*Operands[ListNo]);
@@ -8167,8 +8216,16 @@ bool ARMAsmParser::processInstruction(MCInst &Inst,
// If the destination and first source operand are the same, and
// there's no setting of the flags, use encoding T2 instead of T3.
// Note that this is only for ADD, not SUB. This mirrors the system
- // 'as' behaviour. Make sure the wide encoding wasn't explicit.
- if (Inst.getOperand(0).getReg() != Inst.getOperand(1).getReg() ||
+ // 'as' behaviour. Also take advantage of ADD being commutative.
+ // Make sure the wide encoding wasn't explicit.
+ bool Swap = false;
+ auto DestReg = Inst.getOperand(0).getReg();
+ bool Transform = DestReg == Inst.getOperand(1).getReg();
+ if (!Transform && DestReg == Inst.getOperand(2).getReg()) {
+ Transform = true;
+ Swap = true;
+ }
+ if (!Transform ||
Inst.getOperand(5).getReg() != 0 ||
(static_cast<ARMOperand &>(*Operands[3]).isToken() &&
static_cast<ARMOperand &>(*Operands[3]).getToken() == ".w"))
@@ -8177,7 +8234,7 @@ bool ARMAsmParser::processInstruction(MCInst &Inst,
TmpInst.setOpcode(ARM::tADDhirr);
TmpInst.addOperand(Inst.getOperand(0));
TmpInst.addOperand(Inst.getOperand(0));
- TmpInst.addOperand(Inst.getOperand(2));
+ TmpInst.addOperand(Inst.getOperand(Swap ? 1 : 2));
TmpInst.addOperand(Inst.getOperand(3));
TmpInst.addOperand(Inst.getOperand(4));
Inst = TmpInst;
@@ -9176,8 +9233,7 @@ bool ARMAsmParser::parseDirectiveCPU(SMLoc L) {
return false;
}
- STI.InitMCProcessorInfo(CPU, "");
- STI.InitCPUSchedModel(CPU);
+ STI.setDefaultFeatures(CPU);
setAvailableFeatures(ComputeAvailableFeatures(STI.getFeatureBits()));
return false;
diff --git a/lib/Target/ARM/MCTargetDesc/ARMELFStreamer.cpp b/lib/Target/ARM/MCTargetDesc/ARMELFStreamer.cpp
index 4d12bfb5d60f..d17fdb95dbdf 100644
--- a/lib/Target/ARM/MCTargetDesc/ARMELFStreamer.cpp
+++ b/lib/Target/ARM/MCTargetDesc/ARMELFStreamer.cpp
@@ -1362,7 +1362,7 @@ MCTargetStreamer *createARMNullTargetStreamer(MCStreamer &S) {
MCTargetStreamer *createARMObjectTargetStreamer(MCStreamer &S,
const MCSubtargetInfo &STI) {
const Triple &TT = STI.getTargetTriple();
- if (TT.getObjectFormat() == Triple::ELF)
+ if (TT.isOSBinFormatELF())
return new ARMTargetELFStreamer(S);
return new ARMTargetStreamer(S);
}
diff --git a/lib/Target/ARM/MCTargetDesc/ARMMCTargetDesc.cpp b/lib/Target/ARM/MCTargetDesc/ARMMCTargetDesc.cpp
index fafe25ae5be5..21c9fc1e58b2 100644
--- a/lib/Target/ARM/MCTargetDesc/ARMMCTargetDesc.cpp
+++ b/lib/Target/ARM/MCTargetDesc/ARMMCTargetDesc.cpp
@@ -31,7 +31,7 @@ using namespace llvm;
#define GET_REGINFO_MC_DESC
#include "ARMGenRegisterInfo.inc"
-static bool getMCRDeprecationInfo(MCInst &MI, MCSubtargetInfo &STI,
+static bool getMCRDeprecationInfo(MCInst &MI, const MCSubtargetInfo &STI,
std::string &Info) {
if (STI.getFeatureBits()[llvm::ARM::HasV7Ops] &&
(MI.getOperand(0).isImm() && MI.getOperand(0).getImm() == 15) &&
@@ -63,7 +63,7 @@ static bool getMCRDeprecationInfo(MCInst &MI, MCSubtargetInfo &STI,
return false;
}
-static bool getITDeprecationInfo(MCInst &MI, MCSubtargetInfo &STI,
+static bool getITDeprecationInfo(MCInst &MI, const MCSubtargetInfo &STI,
std::string &Info) {
if (STI.getFeatureBits()[llvm::ARM::HasV8Ops] && MI.getOperand(1).isImm() &&
MI.getOperand(1).getImm() != 8) {
@@ -75,7 +75,7 @@ static bool getITDeprecationInfo(MCInst &MI, MCSubtargetInfo &STI,
return false;
}
-static bool getARMStoreDeprecationInfo(MCInst &MI, MCSubtargetInfo &STI,
+static bool getARMStoreDeprecationInfo(MCInst &MI, const MCSubtargetInfo &STI,
std::string &Info) {
assert(!STI.getFeatureBits()[llvm::ARM::ModeThumb] &&
"cannot predicate thumb instructions");
@@ -92,7 +92,7 @@ static bool getARMStoreDeprecationInfo(MCInst &MI, MCSubtargetInfo &STI,
return false;
}
-static bool getARMLoadDeprecationInfo(MCInst &MI, MCSubtargetInfo &STI,
+static bool getARMLoadDeprecationInfo(MCInst &MI, const MCSubtargetInfo &STI,
std::string &Info) {
assert(!STI.getFeatureBits()[llvm::ARM::ModeThumb] &&
"cannot predicate thumb instructions");
@@ -257,9 +257,7 @@ MCSubtargetInfo *ARM_MC::createARMMCSubtargetInfo(const Triple &TT,
ArchFS = FS;
}
- MCSubtargetInfo *X = new MCSubtargetInfo();
- InitARMMCSubtargetInfo(X, TT, CPU, ArchFS);
- return X;
+ return createARMMCSubtargetInfoImpl(TT, CPU, ArchFS);
}
static MCInstrInfo *createARMMCInstrInfo() {
@@ -268,7 +266,7 @@ static MCInstrInfo *createARMMCInstrInfo() {
return X;
}
-static MCRegisterInfo *createARMMCRegisterInfo(StringRef Triple) {
+static MCRegisterInfo *createARMMCRegisterInfo(const Triple &Triple) {
MCRegisterInfo *X = new MCRegisterInfo();
InitARMMCRegisterInfo(X, ARM::LR, 0, 0, ARM::PC);
return X;
@@ -279,10 +277,10 @@ static MCAsmInfo *createARMMCAsmInfo(const MCRegisterInfo &MRI,
MCAsmInfo *MAI;
if (TheTriple.isOSDarwin() || TheTriple.isOSBinFormatMachO())
MAI = new ARMMCAsmInfoDarwin(TheTriple);
- else if (TheTriple.isWindowsItaniumEnvironment())
- MAI = new ARMCOFFMCAsmInfoGNU();
else if (TheTriple.isWindowsMSVCEnvironment())
MAI = new ARMCOFFMCAsmInfoMicrosoft();
+ else if (TheTriple.isOSWindows())
+ MAI = new ARMCOFFMCAsmInfoGNU();
else
MAI = new ARMELFMCAsmInfo(TheTriple);
@@ -292,14 +290,13 @@ static MCAsmInfo *createARMMCAsmInfo(const MCRegisterInfo &MRI,
return MAI;
}
-static MCCodeGenInfo *createARMMCCodeGenInfo(StringRef TT, Reloc::Model RM,
+static MCCodeGenInfo *createARMMCCodeGenInfo(const Triple &TT, Reloc::Model RM,
CodeModel::Model CM,
CodeGenOpt::Level OL) {
MCCodeGenInfo *X = new MCCodeGenInfo();
if (RM == Reloc::Default) {
- Triple TheTriple(TT);
// Default relocation model on Darwin is PIC, not DynamicNoPIC.
- RM = TheTriple.isOSDarwin() ? Reloc::PIC_ : Reloc::DynamicNoPIC;
+ RM = TT.isOSDarwin() ? Reloc::PIC_ : Reloc::DynamicNoPIC;
}
X->initMCCodeGenInfo(RM, CM, OL);
return X;
diff --git a/lib/Target/ARM/Thumb1FrameLowering.cpp b/lib/Target/ARM/Thumb1FrameLowering.cpp
index 77cd890e4cad..3b4358b5d9bf 100644
--- a/lib/Target/ARM/Thumb1FrameLowering.cpp
+++ b/lib/Target/ARM/Thumb1FrameLowering.cpp
@@ -365,7 +365,7 @@ void Thumb1FrameLowering::emitEpilogue(MachineFunction &MF,
// frame pointer stack slot, the target is ELF and the function has FP, or
// the target uses var sized objects.
if (NumBytes) {
- assert(MF.getRegInfo().isPhysRegUsed(ARM::R4) &&
+ assert(!MFI->getPristineRegs(MF).test(ARM::R4) &&
"No scratch register to restore SP from FP!");
emitThumbRegPlusImmediate(MBB, MBBI, dl, ARM::R4, FramePtr, -NumBytes,
TII, *RegInfo);
diff --git a/lib/Target/BPF/BPFFrameLowering.cpp b/lib/Target/BPF/BPFFrameLowering.cpp
index 54c5ececc7de..c2806c85f24f 100644
--- a/lib/Target/BPF/BPFFrameLowering.cpp
+++ b/lib/Target/BPF/BPFFrameLowering.cpp
@@ -29,12 +29,12 @@ void BPFFrameLowering::emitPrologue(MachineFunction &MF,
void BPFFrameLowering::emitEpilogue(MachineFunction &MF,
MachineBasicBlock &MBB) const {}
-void BPFFrameLowering::processFunctionBeforeCalleeSavedScan(
- MachineFunction &MF, RegScavenger *RS) const {
- MachineRegisterInfo &MRI = MF.getRegInfo();
-
- MRI.setPhysRegUnused(BPF::R6);
- MRI.setPhysRegUnused(BPF::R7);
- MRI.setPhysRegUnused(BPF::R8);
- MRI.setPhysRegUnused(BPF::R9);
+void BPFFrameLowering::determineCalleeSaves(MachineFunction &MF,
+ BitVector &SavedRegs,
+ RegScavenger *RS) const {
+ TargetFrameLowering::determineCalleeSaves(MF, SavedRegs, RS);
+ SavedRegs.reset(BPF::R6);
+ SavedRegs.reset(BPF::R7);
+ SavedRegs.reset(BPF::R8);
+ SavedRegs.reset(BPF::R9);
}
diff --git a/lib/Target/BPF/BPFFrameLowering.h b/lib/Target/BPF/BPFFrameLowering.h
index 3b9fc443e053..251cda965ff5 100644
--- a/lib/Target/BPF/BPFFrameLowering.h
+++ b/lib/Target/BPF/BPFFrameLowering.h
@@ -28,8 +28,8 @@ public:
void emitEpilogue(MachineFunction &MF, MachineBasicBlock &MBB) const override;
bool hasFP(const MachineFunction &MF) const override;
- void processFunctionBeforeCalleeSavedScan(MachineFunction &MF,
- RegScavenger *RS) const override;
+ void determineCalleeSaves(MachineFunction &MF, BitVector &SavedRegs,
+ RegScavenger *RS) const override;
void
eliminateCallFramePseudoInstr(MachineFunction &MF, MachineBasicBlock &MBB,
diff --git a/lib/Target/BPF/BPFISelLowering.cpp b/lib/Target/BPF/BPFISelLowering.cpp
index 38c56bbef81e..58498a1aec7d 100644
--- a/lib/Target/BPF/BPFISelLowering.cpp
+++ b/lib/Target/BPF/BPFISelLowering.cpp
@@ -302,8 +302,9 @@ SDValue BPFTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
DAG.getContext()->diagnose(Err);
}
+ auto PtrVT = getPointerTy(MF.getDataLayout());
Chain = DAG.getCALLSEQ_START(
- Chain, DAG.getConstant(NumBytes, CLI.DL, getPointerTy(), true), CLI.DL);
+ Chain, DAG.getConstant(NumBytes, CLI.DL, PtrVT, true), CLI.DL);
SmallVector<std::pair<unsigned, SDValue>, 5> RegsToPass;
@@ -350,10 +351,10 @@ SDValue BPFTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
// turn it into a TargetGlobalAddress node so that legalize doesn't hack it.
// Likewise ExternalSymbol -> TargetExternalSymbol.
if (GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee))
- Callee = DAG.getTargetGlobalAddress(G->getGlobal(), CLI.DL, getPointerTy(),
+ Callee = DAG.getTargetGlobalAddress(G->getGlobal(), CLI.DL, PtrVT,
G->getOffset(), 0);
else if (ExternalSymbolSDNode *E = dyn_cast<ExternalSymbolSDNode>(Callee))
- Callee = DAG.getTargetExternalSymbol(E->getSymbol(), getPointerTy(), 0);
+ Callee = DAG.getTargetExternalSymbol(E->getSymbol(), PtrVT, 0);
// Returns a chain & a flag for retval copy to use.
SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);
@@ -374,8 +375,8 @@ SDValue BPFTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
// Create the CALLSEQ_END node.
Chain = DAG.getCALLSEQ_END(
- Chain, DAG.getConstant(NumBytes, CLI.DL, getPointerTy(), true),
- DAG.getConstant(0, CLI.DL, getPointerTy(), true), InFlag, CLI.DL);
+ Chain, DAG.getConstant(NumBytes, CLI.DL, PtrVT, true),
+ DAG.getConstant(0, CLI.DL, PtrVT, true), InFlag, CLI.DL);
InFlag = Chain.getValue(1);
// Handle result values, copying them out of physregs into vregs that we
diff --git a/lib/Target/BPF/BPFSubtarget.cpp b/lib/Target/BPF/BPFSubtarget.cpp
index 65acd585116d..c3a8b1caa63d 100644
--- a/lib/Target/BPF/BPFSubtarget.cpp
+++ b/lib/Target/BPF/BPFSubtarget.cpp
@@ -28,4 +28,4 @@ void BPFSubtarget::anchor() {}
BPFSubtarget::BPFSubtarget(const Triple &TT, const std::string &CPU,
const std::string &FS, const TargetMachine &TM)
: BPFGenSubtargetInfo(TT, CPU, FS), InstrInfo(), FrameLowering(*this),
- TLInfo(TM, *this), TSInfo(TM.getDataLayout()) {}
+ TLInfo(TM, *this) {}
diff --git a/lib/Target/BPF/MCTargetDesc/BPFMCTargetDesc.cpp b/lib/Target/BPF/MCTargetDesc/BPFMCTargetDesc.cpp
index 3e928fc93a37..840570ebc392 100644
--- a/lib/Target/BPF/MCTargetDesc/BPFMCTargetDesc.cpp
+++ b/lib/Target/BPF/MCTargetDesc/BPFMCTargetDesc.cpp
@@ -40,7 +40,7 @@ static MCInstrInfo *createBPFMCInstrInfo() {
return X;
}
-static MCRegisterInfo *createBPFMCRegisterInfo(StringRef TT) {
+static MCRegisterInfo *createBPFMCRegisterInfo(const Triple &TT) {
MCRegisterInfo *X = new MCRegisterInfo();
InitBPFMCRegisterInfo(X, BPF::R11 /* RAReg doesn't exist */);
return X;
@@ -48,12 +48,10 @@ static MCRegisterInfo *createBPFMCRegisterInfo(StringRef TT) {
static MCSubtargetInfo *createBPFMCSubtargetInfo(const Triple &TT,
StringRef CPU, StringRef FS) {
- MCSubtargetInfo *X = new MCSubtargetInfo();
- InitBPFMCSubtargetInfo(X, TT, CPU, FS);
- return X;
+ return createBPFMCSubtargetInfoImpl(TT, CPU, FS);
}
-static MCCodeGenInfo *createBPFMCCodeGenInfo(StringRef TT, Reloc::Model RM,
+static MCCodeGenInfo *createBPFMCCodeGenInfo(const Triple &TT, Reloc::Model RM,
CodeModel::Model CM,
CodeGenOpt::Level OL) {
MCCodeGenInfo *X = new MCCodeGenInfo();
diff --git a/lib/Target/CppBackend/CPPBackend.cpp b/lib/Target/CppBackend/CPPBackend.cpp
index bc5d7f65b2f6..272688edb8a1 100644
--- a/lib/Target/CppBackend/CPPBackend.cpp
+++ b/lib/Target/CppBackend/CPPBackend.cpp
@@ -2148,8 +2148,8 @@ char CppWriter::ID = 0;
bool CPPTargetMachine::addPassesToEmitFile(
PassManagerBase &PM, raw_pwrite_stream &o, CodeGenFileType FileType,
- bool DisableVerify, AnalysisID StartAfter, AnalysisID StopAfter,
- MachineFunctionInitializer *MFInitializer) {
+ bool DisableVerify, AnalysisID StartBefore, AnalysisID StartAfter,
+ AnalysisID StopAfter, MachineFunctionInitializer *MFInitializer) {
if (FileType != TargetMachine::CGFT_AssemblyFile)
return true;
auto FOut = llvm::make_unique<formatted_raw_ostream>(o);
diff --git a/lib/Target/CppBackend/CPPTargetMachine.h b/lib/Target/CppBackend/CPPTargetMachine.h
index ebf0635b12e4..00e402feffbc 100644
--- a/lib/Target/CppBackend/CPPTargetMachine.h
+++ b/lib/Target/CppBackend/CPPTargetMachine.h
@@ -31,7 +31,8 @@ struct CPPTargetMachine : public TargetMachine {
public:
bool addPassesToEmitFile(PassManagerBase &PM, raw_pwrite_stream &Out,
CodeGenFileType FileType, bool DisableVerify,
- AnalysisID StartAfter, AnalysisID StopAfter,
+ AnalysisID StartBefore, AnalysisID StartAfter,
+ AnalysisID StopAfter,
MachineFunctionInitializer *MFInitializer) override;
};
diff --git a/lib/Target/Hexagon/BitTracker.cpp b/lib/Target/Hexagon/BitTracker.cpp
new file mode 100644
index 000000000000..cb7e633fb82f
--- /dev/null
+++ b/lib/Target/Hexagon/BitTracker.cpp
@@ -0,0 +1,1127 @@
+//===--- BitTracker.cpp ---------------------------------------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+// SSA-based bit propagation.
+//
+// The purpose of this code is, for a given virtual register, to provide
+// information about the value of each bit in the register. The values
+// of bits are represented by the class BitValue, and take one of four
+// cases: 0, 1, "ref" and "bottom". The 0 and 1 are rather clear, the
+// "ref" value means that the bit is a copy of another bit (which itself
+// cannot be a copy of yet another bit---such chains are not allowed).
+// A "ref" value is associated with a BitRef structure, which indicates
+// which virtual register, and which bit in that register is the origin
+// of the value. For example, given an instruction
+// vreg2 = ASL vreg1, 1
+// assuming that nothing is known about bits of vreg1, bit 1 of vreg2
+// will be a "ref" to (vreg1, 0). If there is a subsequent instruction
+// vreg3 = ASL vreg2, 2
+// then bit 3 of vreg3 will be a "ref" to (vreg1, 0) as well.
+// The "bottom" case means that the bit's value cannot be determined,
+// and that this virtual register actually defines it. The "bottom" case
+// is discussed in detail in BitTracker.h. In fact, "bottom" is a "ref
+// to self", so for the vreg1 above, the bit 0 of it will be a "ref" to
+// (vreg1, 0), bit 1 will be a "ref" to (vreg1, 1), etc.
+//
+// The tracker implements the Wegman-Zadeck algorithm, originally developed
+// for SSA-based constant propagation. Each register is represented as
+// a sequence of bits, with the convention that bit 0 is the least
+// significant bit. Each bit is propagated individually. The class RegisterCell
+// implements the register's representation, and is also the subject of
+// the lattice operations in the tracker.
+//
+// The intended usage of the bit tracker is to create a target-specific
+// machine instruction evaluator, pass the evaluator to the BitTracker
+// object, and run the tracker. The tracker will then collect the bit
+// value information for a given machine function. After that, it can be
+// queried for the cells for each virtual register.
+// Sample code:
+// const TargetSpecificEvaluator TSE(TRI, MRI);
+// BitTracker BT(TSE, MF);
+// BT.run();
+// ...
+// unsigned Reg = interestingRegister();
+// RegisterCell RC = BT.get(Reg);
+// if (RC[3].is(1))
+// Reg0bit3 = 1;
+//
+// The code below is intended to be fully target-independent.
+
+#include "llvm/CodeGen/MachineBasicBlock.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineInstr.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Target/TargetRegisterInfo.h"
+
+#include "BitTracker.h"
+
+using namespace llvm;
+
+typedef BitTracker BT;
+
+namespace {
+ // Local trickery to pretty print a register (without the whole "%vreg"
+ // business).
+ struct printv {
+ printv(unsigned r) : R(r) {}
+ unsigned R;
+ };
+ raw_ostream &operator<< (raw_ostream &OS, const printv &PV) {
+ if (PV.R)
+ OS << 'v' << TargetRegisterInfo::virtReg2Index(PV.R);
+ else
+ OS << 's';
+ return OS;
+ }
+}
+
+raw_ostream &llvm::operator<<(raw_ostream &OS, const BT::BitValue &BV) {
+ switch (BV.Type) {
+ case BT::BitValue::Top:
+ OS << 'T';
+ break;
+ case BT::BitValue::Zero:
+ OS << '0';
+ break;
+ case BT::BitValue::One:
+ OS << '1';
+ break;
+ case BT::BitValue::Ref:
+ OS << printv(BV.RefI.Reg) << '[' << BV.RefI.Pos << ']';
+ break;
+ }
+ return OS;
+}
+
+raw_ostream &llvm::operator<<(raw_ostream &OS, const BT::RegisterCell &RC) {
+ unsigned n = RC.Bits.size();
+ OS << "{ w:" << n;
+ // Instead of printing each bit value individually, try to group them
+ // into logical segments, such as sequences of 0 or 1 bits or references
+ // to consecutive bits (e.g. "bits 3-5 are same as bits 7-9 of reg xyz").
+ // "Start" will be the index of the beginning of the most recent segment.
+ unsigned Start = 0;
+ bool SeqRef = false; // A sequence of refs to consecutive bits.
+ bool ConstRef = false; // A sequence of refs to the same bit.
+
+ for (unsigned i = 1, n = RC.Bits.size(); i < n; ++i) {
+ const BT::BitValue &V = RC[i];
+ const BT::BitValue &SV = RC[Start];
+ bool IsRef = (V.Type == BT::BitValue::Ref);
+ // If the current value is the same as Start, skip to the next one.
+ if (!IsRef && V == SV)
+ continue;
+ if (IsRef && SV.Type == BT::BitValue::Ref && V.RefI.Reg == SV.RefI.Reg) {
+ if (Start+1 == i) {
+ SeqRef = (V.RefI.Pos == SV.RefI.Pos+1);
+ ConstRef = (V.RefI.Pos == SV.RefI.Pos);
+ }
+ if (SeqRef && V.RefI.Pos == SV.RefI.Pos+(i-Start))
+ continue;
+ if (ConstRef && V.RefI.Pos == SV.RefI.Pos)
+ continue;
+ }
+
+ // The current value is different. Print the previous one and reset
+ // the Start.
+ OS << " [" << Start;
+ unsigned Count = i - Start;
+ if (Count == 1) {
+ OS << "]:" << SV;
+ } else {
+ OS << '-' << i-1 << "]:";
+ if (SV.Type == BT::BitValue::Ref && SeqRef)
+ OS << printv(SV.RefI.Reg) << '[' << SV.RefI.Pos << '-'
+ << SV.RefI.Pos+(Count-1) << ']';
+ else
+ OS << SV;
+ }
+ Start = i;
+ SeqRef = ConstRef = false;
+ }
+
+ OS << " [" << Start;
+ unsigned Count = n - Start;
+ if (n-Start == 1) {
+ OS << "]:" << RC[Start];
+ } else {
+ OS << '-' << n-1 << "]:";
+ const BT::BitValue &SV = RC[Start];
+ if (SV.Type == BT::BitValue::Ref && SeqRef)
+ OS << printv(SV.RefI.Reg) << '[' << SV.RefI.Pos << '-'
+ << SV.RefI.Pos+(Count-1) << ']';
+ else
+ OS << SV;
+ }
+ OS << " }";
+
+ return OS;
+}
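Editorial aside, to make the segment grouping concrete: applying this printer to the 'vreg2 = ASL vreg1, 1' case from the header comment, with 8-bit registers, nothing known about vreg1, and assuming vreg1 prints as v1, would give:

    // { w:8 [0]:0 [1-7]:v1[0-6] }
    //
    // Bit 0 is a known zero; bits 1-7 are a run of references to bits 0-6 of
    // vreg1, which the loop above folds into one "[1-7]:v1[0-6]" segment
    // instead of printing seven separate bit entries.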
+
+BitTracker::BitTracker(const MachineEvaluator &E, MachineFunction &F)
+ : Trace(false), ME(E), MF(F), MRI(F.getRegInfo()), Map(*new CellMapType) {}
+
+BitTracker::~BitTracker() {
+ delete &Map;
+}
+
+
+// If we were allowed to update a cell for a part of a register, the meet
+// operation would need to be parametrized by the register number and the
+// exact part of the register, so that the computed BitRefs correspond to
+// the actual bits of the "self" register.
+// While this cannot happen in the current implementation, I'm not sure
+// if this should be ruled out in the future.
+bool BT::RegisterCell::meet(const RegisterCell &RC, unsigned SelfR) {
+ // An example when "meet" can be invoked with SelfR == 0 is a phi node
+ // with a physical register as an operand.
+ assert(SelfR == 0 || TargetRegisterInfo::isVirtualRegister(SelfR));
+ bool Changed = false;
+ for (uint16_t i = 0, n = Bits.size(); i < n; ++i) {
+ const BitValue &RCV = RC[i];
+ Changed |= Bits[i].meet(RCV, BitRef(SelfR, i));
+ }
+ return Changed;
+}
+
+
+// Insert the entire cell RC into the current cell at position given by M.
+BT::RegisterCell &BT::RegisterCell::insert(const BT::RegisterCell &RC,
+ const BitMask &M) {
+ uint16_t B = M.first(), E = M.last(), W = width();
+ // Sanity: M must be a valid mask for *this.
+ assert(B < W && E < W);
+ // Sanity: the masked part of *this must have the same number of bits
+ // as the source.
+ assert(B > E || E-B+1 == RC.width()); // B <= E => E-B+1 = |RC|.
+ assert(B <= E || E+(W-B)+1 == RC.width()); // E < B => E+(W-B)+1 = |RC|.
+ if (B <= E) {
+ for (uint16_t i = 0; i <= E-B; ++i)
+ Bits[i+B] = RC[i];
+ } else {
+ for (uint16_t i = 0; i < W-B; ++i)
+ Bits[i+B] = RC[i];
+ for (uint16_t i = 0; i <= E; ++i)
+ Bits[i] = RC[i+(W-B)];
+ }
+ return *this;
+}
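Editorial aside: a hypothetical wrap-around case, with an 8-bit destination and a mask whose first() is 6 and last() is 1, so the source cell must be 1+(8-6)+1 = 4 bits wide:

    // Bits[6] = RC[0], Bits[7] = RC[1]   // first loop copies W-B = 2 bits
    // Bits[0] = RC[2], Bits[1] = RC[3]   // second loop copies E+1 = 2 bits
    //
    // The source lands at positions 6, 7, 0, 1, wrapping past the MSB.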
+
+
+BT::RegisterCell BT::RegisterCell::extract(const BitMask &M) const {
+ uint16_t B = M.first(), E = M.last(), W = width();
+ assert(B < W && E < W);
+ if (B <= E) {
+ RegisterCell RC(E-B+1);
+ for (uint16_t i = B; i <= E; ++i)
+ RC.Bits[i-B] = Bits[i];
+ return RC;
+ }
+
+ RegisterCell RC(E+(W-B)+1);
+ for (uint16_t i = 0; i < W-B; ++i)
+ RC.Bits[i] = Bits[i+B];
+ for (uint16_t i = 0; i <= E; ++i)
+ RC.Bits[i+(W-B)] = Bits[i];
+ return RC;
+}
+
+
+BT::RegisterCell &BT::RegisterCell::rol(uint16_t Sh) {
+ // Rotate left (i.e. towards increasing bit indices).
+ // Swap the two parts: [0..W-Sh-1] [W-Sh..W-1]
+ uint16_t W = width();
+ Sh = Sh % W;
+ if (Sh == 0)
+ return *this;
+
+ RegisterCell Tmp(W-Sh);
+ // Tmp = [0..W-Sh-1].
+ for (uint16_t i = 0; i < W-Sh; ++i)
+ Tmp[i] = Bits[i];
+ // Shift [W-Sh..W-1] to [0..Sh-1].
+ for (uint16_t i = 0; i < Sh; ++i)
+ Bits[i] = Bits[W-Sh+i];
+ // Copy Tmp to [Sh..W-1].
+ for (uint16_t i = 0; i < W-Sh; ++i)
+ Bits[i+Sh] = Tmp.Bits[i];
+ return *this;
+}
+
+
+BT::RegisterCell &BT::RegisterCell::fill(uint16_t B, uint16_t E,
+ const BitValue &V) {
+ assert(B <= E);
+ while (B < E)
+ Bits[B++] = V;
+ return *this;
+}
+
+
+BT::RegisterCell &BT::RegisterCell::cat(const RegisterCell &RC) {
+ // Append the cell given as the argument to the "this" cell.
+ // Bit 0 of RC becomes bit W of the result, where W is this->width().
+ uint16_t W = width(), WRC = RC.width();
+ Bits.resize(W+WRC);
+ for (uint16_t i = 0; i < WRC; ++i)
+ Bits[i+W] = RC.Bits[i];
+ return *this;
+}
+
+
+uint16_t BT::RegisterCell::ct(bool B) const {
+ uint16_t W = width();
+ uint16_t C = 0;
+ BitValue V = B;
+ while (C < W && Bits[C] == V)
+ C++;
+ return C;
+}
+
+
+uint16_t BT::RegisterCell::cl(bool B) const {
+ uint16_t W = width();
+ uint16_t C = 0;
+ BitValue V = B;
+ while (C < W && Bits[W-(C+1)] == V)
+ C++;
+ return C;
+}
+
+
+bool BT::RegisterCell::operator== (const RegisterCell &RC) const {
+ uint16_t W = Bits.size();
+ if (RC.Bits.size() != W)
+ return false;
+ for (uint16_t i = 0; i < W; ++i)
+ if (Bits[i] != RC[i])
+ return false;
+ return true;
+}
+
+
+uint16_t BT::MachineEvaluator::getRegBitWidth(const RegisterRef &RR) const {
+ // The general problem is with finding a register class that corresponds
+ // to a given reference reg:sub. There can be several such classes, and
+ // since we only care about the register size, it does not matter which
+ // such class we would find.
+ // The easiest way to accomplish what we want is to
+ // 1. find a physical register PhysR from the same class as RR.Reg,
+ // 2. find a physical register PhysS that corresponds to PhysR:RR.Sub,
+ // 3. find a register class that contains PhysS.
+ unsigned PhysR;
+ if (TargetRegisterInfo::isVirtualRegister(RR.Reg)) {
+ const TargetRegisterClass *VC = MRI.getRegClass(RR.Reg);
+ assert(VC->begin() != VC->end() && "Empty register class");
+ PhysR = *VC->begin();
+ } else {
+ assert(TargetRegisterInfo::isPhysicalRegister(RR.Reg));
+ PhysR = RR.Reg;
+ }
+
+ unsigned PhysS = (RR.Sub == 0) ? PhysR : TRI.getSubReg(PhysR, RR.Sub);
+ const TargetRegisterClass *RC = TRI.getMinimalPhysRegClass(PhysS);
+ uint16_t BW = RC->getSize()*8;
+ return BW;
+}
+
+
+BT::RegisterCell BT::MachineEvaluator::getCell(const RegisterRef &RR,
+ const CellMapType &M) const {
+ uint16_t BW = getRegBitWidth(RR);
+
+ // Physical registers are assumed to be present in the map with an unknown
+ // value. Don't actually insert anything in the map, just return the cell.
+ if (TargetRegisterInfo::isPhysicalRegister(RR.Reg))
+ return RegisterCell::self(0, BW);
+
+ assert(TargetRegisterInfo::isVirtualRegister(RR.Reg));
+ // For virtual registers that belong to a class that is not tracked,
+ // generate an "unknown" value as well.
+ const TargetRegisterClass *C = MRI.getRegClass(RR.Reg);
+ if (!track(C))
+ return RegisterCell::self(0, BW);
+
+ CellMapType::const_iterator F = M.find(RR.Reg);
+ if (F != M.end()) {
+ if (!RR.Sub)
+ return F->second;
+ BitMask M = mask(RR.Reg, RR.Sub);
+ return F->second.extract(M);
+ }
+ // If not found, create a "top" entry, but do not insert it in the map.
+ return RegisterCell::top(BW);
+}
+
+
+void BT::MachineEvaluator::putCell(const RegisterRef &RR, RegisterCell RC,
+ CellMapType &M) const {
+ // While updating the cell map can be done in a meaningful way for
+ // a part of a register, it makes little sense to implement it, since the
+ // SSA representation would never contain such "partial definitions".
+ if (!TargetRegisterInfo::isVirtualRegister(RR.Reg))
+ return;
+ assert(RR.Sub == 0 && "Unexpected sub-register in definition");
+ // Eliminate all ref-to-reg-0 bit values: replace them with "self".
+ for (unsigned i = 0, n = RC.width(); i < n; ++i) {
+ const BitValue &V = RC[i];
+ if (V.Type == BitValue::Ref && V.RefI.Reg == 0)
+ RC[i].RefI = BitRef(RR.Reg, i);
+ }
+ M[RR.Reg] = RC;
+}
+
+
+// Check if the cell represents a compile-time integer value.
+bool BT::MachineEvaluator::isInt(const RegisterCell &A) const {
+ uint16_t W = A.width();
+ for (uint16_t i = 0; i < W; ++i)
+ if (!A[i].is(0) && !A[i].is(1))
+ return false;
+ return true;
+}
+
+
+// Convert a cell to the integer value. The result must fit in uint64_t.
+uint64_t BT::MachineEvaluator::toInt(const RegisterCell &A) const {
+ assert(isInt(A));
+ uint64_t Val = 0;
+ uint16_t W = A.width();
+ for (uint16_t i = 0; i < W; ++i) {
+ Val <<= 1;
+ Val |= A[i].is(1);
+ }
+ return Val;
+}
+
+
+// Evaluator helper functions. These implement some common operation on
+// register cells that can be used to implement target-specific instructions
+// in a target-specific evaluator.
+
+BT::RegisterCell BT::MachineEvaluator::eIMM(int64_t V, uint16_t W) const {
+ RegisterCell Res(W);
+ // For bits beyond the 63rd, this will generate the sign bit of V.
+ for (uint16_t i = 0; i < W; ++i) {
+ Res[i] = BitValue(V & 1);
+ V >>= 1;
+ }
+ return Res;
+}
+
+
+BT::RegisterCell BT::MachineEvaluator::eIMM(const ConstantInt *CI) const {
+ APInt A = CI->getValue();
+ uint16_t BW = A.getBitWidth();
+ assert((unsigned)BW == A.getBitWidth() && "BitWidth overflow");
+ RegisterCell Res(BW);
+ for (uint16_t i = 0; i < BW; ++i)
+ Res[i] = A[i];
+ return Res;
+}
+
+
+BT::RegisterCell BT::MachineEvaluator::eADD(const RegisterCell &A1,
+ const RegisterCell &A2) const {
+ uint16_t W = A1.width();
+ assert(W == A2.width());
+ RegisterCell Res(W);
+ bool Carry = false;
+ uint16_t I;
+ for (I = 0; I < W; ++I) {
+ const BitValue &V1 = A1[I];
+ const BitValue &V2 = A2[I];
+ if (!V1.num() || !V2.num())
+ break;
+ unsigned S = bool(V1) + bool(V2) + Carry;
+ Res[I] = BitValue(S & 1);
+ Carry = (S > 1);
+ }
+ for (; I < W; ++I) {
+ const BitValue &V1 = A1[I];
+ const BitValue &V2 = A2[I];
+ // If the next bit is the same as Carry, the result will be 0 plus the
+ // other bit, and the Carry bit will remain unchanged.
+ if (V1.is(Carry))
+ Res[I] = BitValue::ref(V2);
+ else if (V2.is(Carry))
+ Res[I] = BitValue::ref(V1);
+ else
+ break;
+ }
+ for (; I < W; ++I)
+ Res[I] = BitValue::self();
+ return Res;
+}
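Editorial aside: a small hypothetical case showing how the second loop lets low bits survive when only one side is known. Take A1 = eIMM(4, 8), i.e. 0b00000100, and A2 = the bits of some register x with no known values:

    // First loop: stops at I = 0 because A2[0] is not a constant.
    // Second loop (Carry is false):
    //   A1[0] is 0 == Carry, so Res[0] = ref(x[0]);
    //   A1[1] is 0 == Carry, so Res[1] = ref(x[1]);
    //   A1[2] is 1 and x[2] is unknown, so the loop stops.
    // Remaining bits become "self" (unknown).
    //
    // So 'x + 4' keeps the two low bits of x and loses everything above them.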
+
+
+BT::RegisterCell BT::MachineEvaluator::eSUB(const RegisterCell &A1,
+ const RegisterCell &A2) const {
+ uint16_t W = A1.width();
+ assert(W == A2.width());
+ RegisterCell Res(W);
+ bool Borrow = false;
+ uint16_t I;
+ for (I = 0; I < W; ++I) {
+ const BitValue &V1 = A1[I];
+ const BitValue &V2 = A2[I];
+ if (!V1.num() || !V2.num())
+ break;
+ unsigned S = bool(V1) - bool(V2) - Borrow;
+ Res[I] = BitValue(S & 1);
+ Borrow = (S > 1);
+ }
+ for (; I < W; ++I) {
+ const BitValue &V1 = A1[I];
+ const BitValue &V2 = A2[I];
+ if (V1.is(Borrow)) {
+ Res[I] = BitValue::ref(V2);
+ break;
+ }
+ if (V2.is(Borrow))
+ Res[I] = BitValue::ref(V1);
+ else
+ break;
+ }
+ for (; I < W; ++I)
+ Res[I] = BitValue::self();
+ return Res;
+}
+
+
+BT::RegisterCell BT::MachineEvaluator::eMLS(const RegisterCell &A1,
+ const RegisterCell &A2) const {
+ uint16_t W = A1.width() + A2.width();
+ uint16_t Z = A1.ct(0) + A2.ct(0);
+ RegisterCell Res(W);
+ Res.fill(0, Z, BitValue::Zero);
+ Res.fill(Z, W, BitValue::self());
+ return Res;
+}
+
+
+BT::RegisterCell BT::MachineEvaluator::eMLU(const RegisterCell &A1,
+ const RegisterCell &A2) const {
+ uint16_t W = A1.width() + A2.width();
+ uint16_t Z = A1.ct(0) + A2.ct(0);
+ RegisterCell Res(W);
+ Res.fill(0, Z, BitValue::Zero);
+ Res.fill(Z, W, BitValue::self());
+ return Res;
+}
+
+
+BT::RegisterCell BT::MachineEvaluator::eASL(const RegisterCell &A1,
+ uint16_t Sh) const {
+ assert(Sh <= A1.width());
+ RegisterCell Res = RegisterCell::ref(A1);
+ Res.rol(Sh);
+ Res.fill(0, Sh, BitValue::Zero);
+ return Res;
+}
+
+
+BT::RegisterCell BT::MachineEvaluator::eLSR(const RegisterCell &A1,
+ uint16_t Sh) const {
+ uint16_t W = A1.width();
+ assert(Sh <= W);
+ RegisterCell Res = RegisterCell::ref(A1);
+ Res.rol(W-Sh);
+ Res.fill(W-Sh, W, BitValue::Zero);
+ return Res;
+}
+
+
+BT::RegisterCell BT::MachineEvaluator::eASR(const RegisterCell &A1,
+ uint16_t Sh) const {
+ uint16_t W = A1.width();
+ assert(Sh <= W);
+ RegisterCell Res = RegisterCell::ref(A1);
+ BitValue Sign = Res[W-1];
+ Res.rol(W-Sh);
+ Res.fill(W-Sh, W, Sign);
+ return Res;
+}
+
+
+BT::RegisterCell BT::MachineEvaluator::eAND(const RegisterCell &A1,
+ const RegisterCell &A2) const {
+ uint16_t W = A1.width();
+ assert(W == A2.width());
+ RegisterCell Res(W);
+ for (uint16_t i = 0; i < W; ++i) {
+ const BitValue &V1 = A1[i];
+ const BitValue &V2 = A2[i];
+ if (V1.is(1))
+ Res[i] = BitValue::ref(V2);
+ else if (V2.is(1))
+ Res[i] = BitValue::ref(V1);
+ else if (V1.is(0) || V2.is(0))
+ Res[i] = BitValue::Zero;
+ else if (V1 == V2)
+ Res[i] = V1;
+ else
+ Res[i] = BitValue::self();
+ }
+ return Res;
+}
+
+
+BT::RegisterCell BT::MachineEvaluator::eORL(const RegisterCell &A1,
+ const RegisterCell &A2) const {
+ uint16_t W = A1.width();
+ assert(W == A2.width());
+ RegisterCell Res(W);
+ for (uint16_t i = 0; i < W; ++i) {
+ const BitValue &V1 = A1[i];
+ const BitValue &V2 = A2[i];
+ if (V1.is(1) || V2.is(1))
+ Res[i] = BitValue::One;
+ else if (V1.is(0))
+ Res[i] = BitValue::ref(V2);
+ else if (V2.is(0))
+ Res[i] = BitValue::ref(V1);
+ else if (V1 == V2)
+ Res[i] = V1;
+ else
+ Res[i] = BitValue::self();
+ }
+ return Res;
+}
+
+
+BT::RegisterCell BT::MachineEvaluator::eXOR(const RegisterCell &A1,
+ const RegisterCell &A2) const {
+ uint16_t W = A1.width();
+ assert(W == A2.width());
+ RegisterCell Res(W);
+ for (uint16_t i = 0; i < W; ++i) {
+ const BitValue &V1 = A1[i];
+ const BitValue &V2 = A2[i];
+ if (V1.is(0))
+ Res[i] = BitValue::ref(V2);
+ else if (V2.is(0))
+ Res[i] = BitValue::ref(V1);
+ else if (V1 == V2)
+ Res[i] = BitValue::Zero;
+ else
+ Res[i] = BitValue::self();
+ }
+ return Res;
+}
+
+
+BT::RegisterCell BT::MachineEvaluator::eNOT(const RegisterCell &A1) const {
+ uint16_t W = A1.width();
+ RegisterCell Res(W);
+ for (uint16_t i = 0; i < W; ++i) {
+ const BitValue &V = A1[i];
+ if (V.is(0))
+ Res[i] = BitValue::One;
+ else if (V.is(1))
+ Res[i] = BitValue::Zero;
+ else
+ Res[i] = BitValue::self();
+ }
+ return Res;
+}
+
+
+BT::RegisterCell BT::MachineEvaluator::eSET(const RegisterCell &A1,
+ uint16_t BitN) const {
+ assert(BitN < A1.width());
+ RegisterCell Res = RegisterCell::ref(A1);
+ Res[BitN] = BitValue::One;
+ return Res;
+}
+
+
+BT::RegisterCell BT::MachineEvaluator::eCLR(const RegisterCell &A1,
+ uint16_t BitN) const {
+ assert(BitN < A1.width());
+ RegisterCell Res = RegisterCell::ref(A1);
+ Res[BitN] = BitValue::Zero;
+ return Res;
+}
+
+
+BT::RegisterCell BT::MachineEvaluator::eCLB(const RegisterCell &A1, bool B,
+ uint16_t W) const {
+ uint16_t C = A1.cl(B), AW = A1.width();
+ // If the bit that ends the run of leading B bits is not a constant, then
+ // we don't know the real count.
+ if ((C < AW && A1[AW-1-C].num()) || C == AW)
+ return eIMM(C, W);
+ return RegisterCell::self(0, W);
+}
+
+
+BT::RegisterCell BT::MachineEvaluator::eCTB(const RegisterCell &A1, bool B,
+ uint16_t W) const {
+ uint16_t C = A1.ct(B), AW = A1.width();
+ // If the bit that ends the run of trailing B bits is not a constant, then
+ // we don't know the real count.
+ if ((C < AW && A1[C].num()) || C == AW)
+ return eIMM(C, W);
+ return RegisterCell::self(0, W);
+}
+
+
+BT::RegisterCell BT::MachineEvaluator::eSXT(const RegisterCell &A1,
+ uint16_t FromN) const {
+ uint16_t W = A1.width();
+ assert(FromN <= W);
+ RegisterCell Res = RegisterCell::ref(A1);
+ BitValue Sign = Res[FromN-1];
+ // Sign-extend "inreg".
+ Res.fill(FromN, W, Sign);
+ return Res;
+}
+
+
+BT::RegisterCell BT::MachineEvaluator::eZXT(const RegisterCell &A1,
+ uint16_t FromN) const {
+ uint16_t W = A1.width();
+ assert(FromN <= W);
+ RegisterCell Res = RegisterCell::ref(A1);
+ Res.fill(FromN, W, BitValue::Zero);
+ return Res;
+}
+
+
+BT::RegisterCell BT::MachineEvaluator::eXTR(const RegisterCell &A1,
+ uint16_t B, uint16_t E) const {
+ uint16_t W = A1.width();
+ assert(B < W && E <= W);
+ if (B == E)
+ return RegisterCell(0);
+ uint16_t Last = (E > 0) ? E-1 : W-1;
+ RegisterCell Res = RegisterCell::ref(A1).extract(BT::BitMask(B, Last));
+ // Return shorter cell.
+ return Res;
+}
+
+
+BT::RegisterCell BT::MachineEvaluator::eINS(const RegisterCell &A1,
+ const RegisterCell &A2, uint16_t AtN) const {
+ uint16_t W1 = A1.width(), W2 = A2.width();
+ (void)W1;
+ assert(AtN < W1 && AtN+W2 <= W1);
+ // Copy bits from A1, insert A2 at position AtN.
+ RegisterCell Res = RegisterCell::ref(A1);
+ if (W2 > 0)
+ Res.insert(RegisterCell::ref(A2), BT::BitMask(AtN, AtN+W2-1));
+ return Res;
+}
+
+
+BT::BitMask BT::MachineEvaluator::mask(unsigned Reg, unsigned Sub) const {
+ assert(Sub == 0 && "Generic BitTracker::mask called for Sub != 0");
+ uint16_t W = getRegBitWidth(Reg);
+ assert(W > 0 && "Cannot generate mask for empty register");
+ return BitMask(0, W-1);
+}
+
+
+bool BT::MachineEvaluator::evaluate(const MachineInstr *MI,
+ const CellMapType &Inputs, CellMapType &Outputs) const {
+ unsigned Opc = MI->getOpcode();
+ switch (Opc) {
+ case TargetOpcode::REG_SEQUENCE: {
+ RegisterRef RD = MI->getOperand(0);
+ assert(RD.Sub == 0);
+ RegisterRef RS = MI->getOperand(1);
+ unsigned SS = MI->getOperand(2).getImm();
+ RegisterRef RT = MI->getOperand(3);
+ unsigned ST = MI->getOperand(4).getImm();
+ assert(SS != ST);
+
+ uint16_t W = getRegBitWidth(RD);
+ RegisterCell Res(W);
+ Res.insert(RegisterCell::ref(getCell(RS, Inputs)), mask(RD.Reg, SS));
+ Res.insert(RegisterCell::ref(getCell(RT, Inputs)), mask(RD.Reg, ST));
+ putCell(RD, Res, Outputs);
+ break;
+ }
+
+ case TargetOpcode::COPY: {
+ // COPY can transfer a smaller register into a wider one.
+ // If that is the case, fill the remaining high bits with 0.
+ RegisterRef RD = MI->getOperand(0);
+ RegisterRef RS = MI->getOperand(1);
+ assert(RD.Sub == 0);
+ uint16_t WD = getRegBitWidth(RD);
+ uint16_t WS = getRegBitWidth(RS);
+ assert(WD >= WS);
+ RegisterCell Src = getCell(RS, Inputs);
+ RegisterCell Res(WD);
+ Res.insert(Src, BitMask(0, WS-1));
+ Res.fill(WS, WD, BitValue::Zero);
+ putCell(RD, Res, Outputs);
+ break;
+ }
+
+ default:
+ return false;
+ }
+
+ return true;
+}
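Editorial aside: COPY and REG_SEQUENCE are the only opcodes the generic evaluator understands; everything else is meant to come from a target-specific subclass, as the header comment describes. The sketch below is hypothetical: MyTargetEvaluator and MyTarget::MOVi are invented names, and the constructor and virtual signatures are assumed from how the evaluator is used elsewhere in this file.

    // Editorial sketch only; a real evaluator covers far more opcodes and
    // models their bit-level semantics precisely.
    struct MyTargetEvaluator : public BitTracker::MachineEvaluator {
      typedef BitTracker BT;   // same shorthand this file uses

      MyTargetEvaluator(const TargetRegisterInfo &TRI, MachineRegisterInfo &MRI)
          : MachineEvaluator(TRI, MRI) {}   // constructor shape assumed

      // Non-branch instructions: describe the defs in terms of the inputs.
      bool evaluate(const MachineInstr *MI, const BT::CellMapType &Inputs,
                    BT::CellMapType &Outputs) const override {
        switch (MI->getOpcode()) {
        case MyTarget::MOVi: {              // hypothetical "move immediate"
          BT::RegisterRef RD = MI->getOperand(0);
          int64_t V = MI->getOperand(1).getImm();
          putCell(RD, eIMM(V, getRegBitWidth(RD)), Outputs);
          return true;
        }
        default:
          // Fall back to the generic COPY/REG_SEQUENCE handling above.
          return MachineEvaluator::evaluate(MI, Inputs, Outputs);
        }
      }

      // Branches: report known targets, or return false to add all successors.
      bool evaluate(const MachineInstr *BI, const BT::CellMapType &Inputs,
                    BT::BranchTargetList &Targets,
                    bool &FallsThrough) const override {
        FallsThrough = true;
        return false;
      }
    };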
+
+
+// Main W-Z implementation.
+
+void BT::visitPHI(const MachineInstr *PI) {
+ int ThisN = PI->getParent()->getNumber();
+ if (Trace)
+ dbgs() << "Visit FI(BB#" << ThisN << "): " << *PI;
+
+ const MachineOperand &MD = PI->getOperand(0);
+ assert(MD.getSubReg() == 0 && "Unexpected sub-register in definition");
+ RegisterRef DefRR(MD);
+ uint16_t DefBW = ME.getRegBitWidth(DefRR);
+
+ RegisterCell DefC = ME.getCell(DefRR, Map);
+ if (DefC == RegisterCell::self(DefRR.Reg, DefBW)) // XXX slow
+ return;
+
+ bool Changed = false;
+
+ for (unsigned i = 1, n = PI->getNumOperands(); i < n; i += 2) {
+ const MachineBasicBlock *PB = PI->getOperand(i+1).getMBB();
+ int PredN = PB->getNumber();
+ if (Trace)
+ dbgs() << " edge BB#" << PredN << "->BB#" << ThisN;
+ if (!EdgeExec.count(CFGEdge(PredN, ThisN))) {
+ if (Trace)
+ dbgs() << " not executable\n";
+ continue;
+ }
+
+ RegisterRef RU = PI->getOperand(i);
+ RegisterCell ResC = ME.getCell(RU, Map);
+ if (Trace)
+ dbgs() << " input reg: " << PrintReg(RU.Reg, &ME.TRI, RU.Sub)
+ << " cell: " << ResC << "\n";
+ Changed |= DefC.meet(ResC, DefRR.Reg);
+ }
+
+ if (Changed) {
+ if (Trace)
+ dbgs() << "Output: " << PrintReg(DefRR.Reg, &ME.TRI, DefRR.Sub)
+ << " cell: " << DefC << "\n";
+ ME.putCell(DefRR, DefC, Map);
+ visitUsesOf(DefRR.Reg);
+ }
+}
+
+
+void BT::visitNonBranch(const MachineInstr *MI) {
+ if (Trace) {
+ int ThisN = MI->getParent()->getNumber();
+ dbgs() << "Visit MI(BB#" << ThisN << "): " << *MI;
+ }
+ if (MI->isDebugValue())
+ return;
+ assert(!MI->isBranch() && "Unexpected branch instruction");
+
+ CellMapType ResMap;
+ bool Eval = ME.evaluate(MI, Map, ResMap);
+
+ if (Trace && Eval) {
+ for (unsigned i = 0, n = MI->getNumOperands(); i < n; ++i) {
+ const MachineOperand &MO = MI->getOperand(i);
+ if (!MO.isReg() || !MO.isUse())
+ continue;
+ RegisterRef RU(MO);
+ dbgs() << " input reg: " << PrintReg(RU.Reg, &ME.TRI, RU.Sub)
+ << " cell: " << ME.getCell(RU, Map) << "\n";
+ }
+ dbgs() << "Outputs:\n";
+ for (CellMapType::iterator I = ResMap.begin(), E = ResMap.end();
+ I != E; ++I) {
+ RegisterRef RD(I->first);
+ dbgs() << " " << PrintReg(I->first, &ME.TRI) << " cell: "
+ << ME.getCell(RD, ResMap) << "\n";
+ }
+ }
+
+ // Iterate over all definitions of the instruction, and update the
+ // cells accordingly.
+ for (unsigned i = 0, n = MI->getNumOperands(); i < n; ++i) {
+ const MachineOperand &MO = MI->getOperand(i);
+ // Visit register defs only.
+ if (!MO.isReg() || !MO.isDef())
+ continue;
+ RegisterRef RD(MO);
+ assert(RD.Sub == 0 && "Unexpected sub-register in definition");
+ if (!TargetRegisterInfo::isVirtualRegister(RD.Reg))
+ continue;
+
+ bool Changed = false;
+ if (!Eval || !ResMap.has(RD.Reg)) {
+ // Set to "ref" (aka "bottom").
+ uint16_t DefBW = ME.getRegBitWidth(RD);
+ RegisterCell RefC = RegisterCell::self(RD.Reg, DefBW);
+ if (RefC != ME.getCell(RD, Map)) {
+ ME.putCell(RD, RefC, Map);
+ Changed = true;
+ }
+ } else {
+ RegisterCell DefC = ME.getCell(RD, Map);
+ RegisterCell ResC = ME.getCell(RD, ResMap);
+ // This is a non-phi instruction, so the values of the inputs come
+ // from the same registers each time this instruction is evaluated.
+ // During the propagation, the values of the inputs can become lowered
+ // in the sense of the lattice operation, which may cause different
+ // results to be calculated in subsequent evaluations. This should
+ // not cause the bottoming of the result in the map, since the new
+ // result is already reflecting the lowered inputs.
+ for (uint16_t i = 0, w = DefC.width(); i < w; ++i) {
+ BitValue &V = DefC[i];
+ // Bits that are already "bottom" should not be updated.
+ if (V.Type == BitValue::Ref && V.RefI.Reg == RD.Reg)
+ continue;
+ // Same for those that are identical in DefC and ResC.
+ if (V == ResC[i])
+ continue;
+ V = ResC[i];
+ Changed = true;
+ }
+ if (Changed)
+ ME.putCell(RD, DefC, Map);
+ }
+ if (Changed)
+ visitUsesOf(RD.Reg);
+ }
+}
+
+
+void BT::visitBranchesFrom(const MachineInstr *BI) {
+ const MachineBasicBlock &B = *BI->getParent();
+ MachineBasicBlock::const_iterator It = BI, End = B.end();
+ BranchTargetList Targets, BTs;
+ bool FallsThrough = true, DefaultToAll = false;
+ int ThisN = B.getNumber();
+
+ do {
+ BTs.clear();
+ const MachineInstr *MI = &*It;
+ if (Trace)
+ dbgs() << "Visit BR(BB#" << ThisN << "): " << *MI;
+ assert(MI->isBranch() && "Expecting branch instruction");
+ InstrExec.insert(MI);
+ bool Eval = ME.evaluate(MI, Map, BTs, FallsThrough);
+ if (!Eval) {
+ // If the evaluation failed, we will add all targets. Keep going in
+ // the loop to mark all executable branches as such.
+ DefaultToAll = true;
+ FallsThrough = true;
+ if (Trace)
+ dbgs() << " failed to evaluate: will add all CFG successors\n";
+ } else if (!DefaultToAll) {
+ // If evaluated successfully add the targets to the cumulative list.
+ if (Trace) {
+ dbgs() << " adding targets:";
+ for (unsigned i = 0, n = BTs.size(); i < n; ++i)
+ dbgs() << " BB#" << BTs[i]->getNumber();
+ if (FallsThrough)
+ dbgs() << "\n falls through\n";
+ else
+ dbgs() << "\n does not fall through\n";
+ }
+ Targets.insert(BTs.begin(), BTs.end());
+ }
+ ++It;
+ } while (FallsThrough && It != End);
+
+ typedef MachineBasicBlock::const_succ_iterator succ_iterator;
+ if (!DefaultToAll) {
+ // Need to add all CFG successors that lead to EH landing pads.
+ // There won't be explicit branches to these blocks, but they must
+ // be processed.
+ for (succ_iterator I = B.succ_begin(), E = B.succ_end(); I != E; ++I) {
+ const MachineBasicBlock *SB = *I;
+ if (SB->isLandingPad())
+ Targets.insert(SB);
+ }
+ if (FallsThrough) {
+ MachineFunction::const_iterator BIt = &B;
+ MachineFunction::const_iterator Next = std::next(BIt);
+ if (Next != MF.end())
+ Targets.insert(&*Next);
+ }
+ } else {
+ for (succ_iterator I = B.succ_begin(), E = B.succ_end(); I != E; ++I)
+ Targets.insert(*I);
+ }
+
+ for (unsigned i = 0, n = Targets.size(); i < n; ++i) {
+ int TargetN = Targets[i]->getNumber();
+ FlowQ.push(CFGEdge(ThisN, TargetN));
+ }
+}
+
+
+void BT::visitUsesOf(unsigned Reg) {
+ if (Trace)
+ dbgs() << "visiting uses of " << PrintReg(Reg, &ME.TRI) << "\n";
+
+ typedef MachineRegisterInfo::use_nodbg_iterator use_iterator;
+ use_iterator End = MRI.use_nodbg_end();
+ for (use_iterator I = MRI.use_nodbg_begin(Reg); I != End; ++I) {
+ MachineInstr *UseI = I->getParent();
+ if (!InstrExec.count(UseI))
+ continue;
+ if (UseI->isPHI())
+ visitPHI(UseI);
+ else if (!UseI->isBranch())
+ visitNonBranch(UseI);
+ else
+ visitBranchesFrom(UseI);
+ }
+}
+
+
+BT::RegisterCell BT::get(RegisterRef RR) const {
+ return ME.getCell(RR, Map);
+}
+
+
+void BT::put(RegisterRef RR, const RegisterCell &RC) {
+ ME.putCell(RR, RC, Map);
+}
+
+
+// Replace all references to bits from OldRR with the corresponding bits
+// in NewRR.
+void BT::subst(RegisterRef OldRR, RegisterRef NewRR) {
+ assert(Map.has(OldRR.Reg) && "OldRR not present in map");
+ BitMask OM = ME.mask(OldRR.Reg, OldRR.Sub);
+ BitMask NM = ME.mask(NewRR.Reg, NewRR.Sub);
+ uint16_t OMB = OM.first(), OME = OM.last();
+ uint16_t NMB = NM.first(), NME = NM.last();
+ (void)NME;
+ assert((OME-OMB == NME-NMB) &&
+ "Substituting registers of different lengths");
+ for (CellMapType::iterator I = Map.begin(), E = Map.end(); I != E; ++I) {
+ RegisterCell &RC = I->second;
+ for (uint16_t i = 0, w = RC.width(); i < w; ++i) {
+ BitValue &V = RC[i];
+ if (V.Type != BitValue::Ref || V.RefI.Reg != OldRR.Reg)
+ continue;
+ if (V.RefI.Pos < OMB || V.RefI.Pos > OME)
+ continue;
+ V.RefI.Reg = NewRR.Reg;
+ V.RefI.Pos += NMB-OMB;
+ }
+ }
+}
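Editorial aside: the Pos adjustment matters when the two references describe different parts of wide registers. Hypothetically, if OldRR covers bits 0-31 of vreg5 (OMB=0, OME=31) and NewRR covers bits 32-63 of vreg7 (NMB=32, NME=63):

    // A value that was    v5[5]
    // is rewritten to     v7[37]
    //
    // i.e. the referenced register changes and the bit position is shifted by
    // NMB - OMB = 32.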
+
+
+// Check if the block has been "executed" during propagation. (If not, the
+// block is dead, but it may still appear to be reachable.)
+bool BT::reached(const MachineBasicBlock *B) const {
+ int BN = B->getNumber();
+ assert(BN >= 0);
+ for (EdgeSetType::iterator I = EdgeExec.begin(), E = EdgeExec.end();
+ I != E; ++I) {
+ if (I->second == BN)
+ return true;
+ }
+ return false;
+}
+
+
+void BT::reset() {
+ EdgeExec.clear();
+ InstrExec.clear();
+ Map.clear();
+}
+
+
+void BT::run() {
+ reset();
+ assert(FlowQ.empty());
+
+ typedef GraphTraits<const MachineFunction*> MachineFlowGraphTraits;
+ const MachineBasicBlock *Entry = MachineFlowGraphTraits::getEntryNode(&MF);
+
+ unsigned MaxBN = 0;
+ for (MachineFunction::const_iterator I = MF.begin(), E = MF.end();
+ I != E; ++I) {
+ assert(I->getNumber() >= 0 && "Disconnected block");
+ unsigned BN = I->getNumber();
+ if (BN > MaxBN)
+ MaxBN = BN;
+ }
+
+ // Keep track of visited blocks.
+ BitVector BlockScanned(MaxBN+1);
+
+ int EntryN = Entry->getNumber();
+ // Generate a fake edge to get something to start with.
+ FlowQ.push(CFGEdge(-1, EntryN));
+
+ while (!FlowQ.empty()) {
+ CFGEdge Edge = FlowQ.front();
+ FlowQ.pop();
+
+ if (EdgeExec.count(Edge))
+ continue;
+ EdgeExec.insert(Edge);
+
+ const MachineBasicBlock &B = *MF.getBlockNumbered(Edge.second);
+ MachineBasicBlock::const_iterator It = B.begin(), End = B.end();
+ // Visit PHI nodes first.
+ while (It != End && It->isPHI()) {
+ const MachineInstr *PI = &*It++;
+ InstrExec.insert(PI);
+ visitPHI(PI);
+ }
+
+ // If this block has already been visited through a flow graph edge,
+ // then the instructions have already been processed. Any updates to
+ // the cells would now only happen through visitUsesOf...
+ if (BlockScanned[Edge.second])
+ continue;
+ BlockScanned[Edge.second] = true;
+
+ // Visit non-branch instructions.
+ while (It != End && !It->isBranch()) {
+ const MachineInstr *MI = &*It++;
+ InstrExec.insert(MI);
+ visitNonBranch(MI);
+ }
+ // If block end has been reached, add the fall-through edge to the queue.
+ if (It == End) {
+ MachineFunction::const_iterator BIt = &B;
+ MachineFunction::const_iterator Next = std::next(BIt);
+ if (Next != MF.end()) {
+ int ThisN = B.getNumber();
+ int NextN = Next->getNumber();
+ FlowQ.push(CFGEdge(ThisN, NextN));
+ }
+ } else {
+ // Handle the remaining sequence of branches. This function will update
+ // the work queue.
+ visitBranchesFrom(It);
+ }
+ } // while (!FlowQ.empty())
+
+ if (Trace) {
+ dbgs() << "Cells after propagation:\n";
+ for (CellMapType::iterator I = Map.begin(), E = Map.end(); I != E; ++I)
+ dbgs() << PrintReg(I->first, &ME.TRI) << " -> " << I->second << "\n";
+ }
+}
+
diff --git a/lib/Target/Hexagon/BitTracker.h b/lib/Target/Hexagon/BitTracker.h
new file mode 100644
index 000000000000..ed002a794d66
--- /dev/null
+++ b/lib/Target/Hexagon/BitTracker.h
@@ -0,0 +1,449 @@
+//===--- BitTracker.h -----------------------------------------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef BITTRACKER_H
+#define BITTRACKER_H
+
+#include "llvm/ADT/SetVector.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/CodeGen/MachineFunction.h"
+
+#include <map>
+#include <queue>
+#include <set>
+
+namespace llvm {
+ class ConstantInt;
+ class MachineRegisterInfo;
+ class MachineBasicBlock;
+ class MachineInstr;
+ class MachineOperand;
+ class raw_ostream;
+
+struct BitTracker {
+ struct BitRef;
+ struct RegisterRef;
+ struct BitValue;
+ struct BitMask;
+ struct RegisterCell;
+ struct MachineEvaluator;
+
+ typedef SetVector<const MachineBasicBlock *> BranchTargetList;
+
+ struct CellMapType : public std::map<unsigned,RegisterCell> {
+ bool has(unsigned Reg) const;
+ };
+
+ BitTracker(const MachineEvaluator &E, MachineFunction &F);
+ ~BitTracker();
+
+ void run();
+ void trace(bool On = false) { Trace = On; }
+ bool has(unsigned Reg) const;
+ const RegisterCell &lookup(unsigned Reg) const;
+ RegisterCell get(RegisterRef RR) const;
+ void put(RegisterRef RR, const RegisterCell &RC);
+ void subst(RegisterRef OldRR, RegisterRef NewRR);
+ bool reached(const MachineBasicBlock *B) const;
+
+private:
+ void visitPHI(const MachineInstr *PI);
+ void visitNonBranch(const MachineInstr *MI);
+ void visitBranchesFrom(const MachineInstr *BI);
+ void visitUsesOf(unsigned Reg);
+ void reset();
+
+ typedef std::pair<int,int> CFGEdge;
+ typedef std::set<CFGEdge> EdgeSetType;
+ typedef std::set<const MachineInstr *> InstrSetType;
+ typedef std::queue<CFGEdge> EdgeQueueType;
+
+ EdgeSetType EdgeExec; // Executable flow graph edges.
+ InstrSetType InstrExec; // Executable instructions.
+ EdgeQueueType FlowQ; // Work queue of CFG edges.
+ bool Trace; // Enable tracing for debugging.
+
+ const MachineEvaluator &ME;
+ MachineFunction &MF;
+ MachineRegisterInfo &MRI;
+ CellMapType &Map;
+};
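+
+// A typical use of the tracker (an illustrative sketch; names such as
+// HexagonEvaluator, TRI, MRI, TII, MF and VirtReg stand for whatever the
+// client pass has at hand): construct a target evaluator, run the
+// propagation, then query individual register cells:
+//
+//   HexagonEvaluator HE(TRI, MRI, TII, MF);
+//   BitTracker BT(HE, MF);
+//   BT.run();
+//   if (BT.has(VirtReg)) {
+//     const BitTracker::RegisterCell &RC = BT.lookup(VirtReg);
+//     ...
+//   }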
+
+
+// Abstraction of a reference to a bit at position Pos in register Reg.
+struct BitTracker::BitRef {
+ BitRef(unsigned R = 0, uint16_t P = 0) : Reg(R), Pos(P) {}
+ BitRef(const BitRef &BR) : Reg(BR.Reg), Pos(BR.Pos) {}
+ bool operator== (const BitRef &BR) const {
+ // If Reg is 0, disregard Pos.
+ return Reg == BR.Reg && (Reg == 0 || Pos == BR.Pos);
+ }
+ unsigned Reg;
+ uint16_t Pos;
+};
+
+
+// Abstraction of a register reference in MachineOperand. It contains the
+// register number and the subregister index.
+struct BitTracker::RegisterRef {
+ RegisterRef(unsigned R = 0, unsigned S = 0)
+ : Reg(R), Sub(S) {}
+ RegisterRef(const MachineOperand &MO)
+ : Reg(MO.getReg()), Sub(MO.getSubReg()) {}
+ unsigned Reg, Sub;
+};
+
+
+// Value that a single bit can take. This is outside of the context of
+// any register; it is more of an abstraction of the two-element set of
+// possible bit values. One extension here is the "Ref" type, which
+// indicates that this bit takes the same value as the bit described by
+// RefI.
+struct BitTracker::BitValue {
+ enum ValueType {
+ Top, // Bit not yet defined.
+ Zero, // Bit = 0.
+ One, // Bit = 1.
+ Ref // Bit value same as the one described in RefI.
+ // Conceptually, there is no explicit "bottom" value: the lattice's
+ // bottom will be expressed as a "ref to itself", which, in the context
+ // of registers, could be read as "this value of this bit is defined by
+ // this bit".
+ // The ordering is:
+ // x <= Top,
+ // Self <= x, where "Self" is "ref to itself".
+ // This makes the value lattice different for each virtual register
+ // (even for each bit in the same virtual register), since the "bottom"
+ // for one register will be a simple "ref" for another register.
+ // Since we do not store the "Self" bit and register number, the meet
+ // operation will need to take it as a parameter.
+ //
+ // In practice there is a special case for values that are not
+ // associated with any specific virtual register. An example would be
+ // a value corresponding to a bit of a physical register, or an
+ // intermediate value obtained in some computation (such as
+ // instruction evaluation). Such cases are identical to the usual Ref
+ // type, but the register number is 0. In such cases the Pos field of
+ // the reference is ignored.
+ //
+ // Note that for a value V that is a "ref", as long as RefI.Reg is
+ // not 0, it may actually refer to the same register as the one in
+ // which V will be contained. If RefI.Pos refers to the position of V,
+ // then V is assumed to be "bottom" (a "ref to itself"); otherwise V
+ // is taken to be identical to the referenced bit of the same register.
+ // If RefI.Reg is 0, however, such a reference to the same register is
+ // not possible. Any value V that is a "ref" and whose RefI.Reg is 0
+ // is treated as "bottom".
+ };
+ ValueType Type;
+ BitRef RefI;
+
+ BitValue(ValueType T = Top) : Type(T) {}
+ BitValue(bool B) : Type(B ? One : Zero) {}
+ BitValue(const BitValue &V) : Type(V.Type), RefI(V.RefI) {}
+ BitValue(unsigned Reg, uint16_t Pos) : Type(Ref), RefI(Reg, Pos) {}
+
+ bool operator== (const BitValue &V) const {
+ if (Type != V.Type)
+ return false;
+ if (Type == Ref && !(RefI == V.RefI))
+ return false;
+ return true;
+ }
+ bool operator!= (const BitValue &V) const {
+ return !operator==(V);
+ }
+ bool is(unsigned T) const {
+ assert(T == 0 || T == 1);
+ return T == 0 ? Type == Zero
+ : (T == 1 ? Type == One : false);
+ }
+
+ // The "meet" operation is the "." operation in a semilattice (L, ., T, B):
+ // (1) x.x = x
+ // (2) x.y = y.x
+ // (3) x.(y.z) = (x.y).z
+ // (4) x.T = x (i.e. T = "top")
+ // (5) x.B = B (i.e. B = "bottom")
+ //
+ // This "meet" function will update the value of the "*this" object with
+ // the newly calculated one, and return "true" if the value of *this has
+ // changed, and "false" otherwise.
+ // To prove that it satisfies the conditions (1)-(5), it is sufficient
+ // to show that a relation
+ // x <= y <=> x.y = x
+ // defines a partial order (i.e. that "meet" is the same as "infimum").
+ bool meet(const BitValue &V, const BitRef &Self) {
+ // First, check the cases where there is nothing to be done.
+ if (Type == Ref && RefI == Self) // Bottom.meet(V) = Bottom (i.e. This)
+ return false;
+ if (V.Type == Top) // This.meet(Top) = This
+ return false;
+ if (*this == V) // This.meet(This) = This
+ return false;
+
+ // At this point, we know that the value of "this" will change.
+ // If it is Top, it will become the same as V, otherwise it will
+ // become "bottom" (i.e. Self).
+ if (Type == Top) {
+ Type = V.Type;
+ RefI = V.RefI; // This may be irrelevant, but copy anyway.
+ return true;
+ }
+ // Become "bottom".
+ Type = Ref;
+ RefI = Self;
+ return true;
+ }
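+ // A few consequences of the rules above, with Self = BitRef(R, i)
+ // denoting the bit being updated:
+ //   Top.meet(Zero, Self)  -> becomes Zero (Top is the identity),
+ //   Zero.meet(Top, Self)  -> stays Zero (meet with Top is a no-op),
+ //   Zero.meet(One, Self)  -> becomes Ref(Self), i.e. "bottom",
+ //   One.meet(One, Self)   -> stays One (idempotence).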
+
+ // Create a reference to the bit value V.
+ static BitValue ref(const BitValue &V);
+ // Create a "self".
+ static BitValue self(const BitRef &Self = BitRef());
+
+ bool num() const {
+ return Type == Zero || Type == One;
+ }
+ operator bool() const {
+ assert(Type == Zero || Type == One);
+ return Type == One;
+ }
+
+ friend raw_ostream &operator<<(raw_ostream &OS, const BitValue &BV);
+};
+
+
+// This operation must be idempotent, i.e. ref(ref(V)) == ref(V).
+inline BitTracker::BitValue
+BitTracker::BitValue::ref(const BitValue &V) {
+ if (V.Type != Ref)
+ return BitValue(V.Type);
+ if (V.RefI.Reg != 0)
+ return BitValue(V.RefI.Reg, V.RefI.Pos);
+ return self();
+}
+
+
+inline BitTracker::BitValue
+BitTracker::BitValue::self(const BitRef &Self) {
+ return BitValue(Self.Reg, Self.Pos);
+}
+
+
+// A sequence of bits starting from index B up to and including index E.
+// If E < B, the mask represents two sections: [0..E] and [B..W) where
+// W is the width of the register.
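+// For example, on a 32-bit register, BitMask(0, 15) covers bits 0..15,
+// while BitMask(24, 7) covers the two sections [0..7] and [24..31].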
+struct BitTracker::BitMask {
+ BitMask() : B(0), E(0) {}
+ BitMask(uint16_t b, uint16_t e) : B(b), E(e) {}
+ uint16_t first() const { return B; }
+ uint16_t last() const { return E; }
+private:
+ uint16_t B, E;
+};
+
+
+// Representation of a register: a list of BitValues.
+struct BitTracker::RegisterCell {
+ RegisterCell(uint16_t Width = DefaultBitN) : Bits(Width) {}
+
+ uint16_t width() const {
+ return Bits.size();
+ }
+ const BitValue &operator[](uint16_t BitN) const {
+ assert(BitN < Bits.size());
+ return Bits[BitN];
+ }
+ BitValue &operator[](uint16_t BitN) {
+ assert(BitN < Bits.size());
+ return Bits[BitN];
+ }
+
+ bool meet(const RegisterCell &RC, unsigned SelfR);
+ RegisterCell &insert(const RegisterCell &RC, const BitMask &M);
+ RegisterCell extract(const BitMask &M) const; // Returns a new cell.
+ RegisterCell &rol(uint16_t Sh); // Rotate left.
+ RegisterCell &fill(uint16_t B, uint16_t E, const BitValue &V);
+ RegisterCell &cat(const RegisterCell &RC); // Concatenate.
+ uint16_t cl(bool B) const;
+ uint16_t ct(bool B) const;
+
+ bool operator== (const RegisterCell &RC) const;
+ bool operator!= (const RegisterCell &RC) const {
+ return !operator==(RC);
+ }
+
+ const RegisterCell &operator=(const RegisterCell &RC) {
+ Bits = RC.Bits;
+ return *this;
+ }
+
+ // Generate a "ref" cell for the corresponding register. In the resulting
+ // cell each bit will be described as being the same as the corresponding
+ // bit in register Reg (i.e. the cell is "defined" by register Reg).
+ static RegisterCell self(unsigned Reg, uint16_t Width);
+ // Generate a "top" cell of given size.
+ static RegisterCell top(uint16_t Width);
+ // Generate a cell that is a "ref" to another cell.
+ static RegisterCell ref(const RegisterCell &C);
+
+private:
+ // The DefaultBitN is here only to avoid frequent reallocation of the
+ // memory in the vector.
+ static const unsigned DefaultBitN = 32;
+ typedef SmallVector<BitValue, DefaultBitN> BitValueList;
+ BitValueList Bits;
+
+ friend raw_ostream &operator<<(raw_ostream &OS, const RegisterCell &RC);
+};
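+
+// An illustrative way to build a cell (a sketch; Reg stands for any
+// tracked virtual register): describe a 32-bit register whose low byte
+// is known to be zero and whose remaining bits are unknown ("self"):
+//   RegisterCell RC = RegisterCell::self(Reg, 32);
+//   RC.fill(0, 8, BitValue::Zero);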
+
+
+inline bool BitTracker::has(unsigned Reg) const {
+ return Map.find(Reg) != Map.end();
+}
+
+
+inline const BitTracker::RegisterCell&
+BitTracker::lookup(unsigned Reg) const {
+ CellMapType::const_iterator F = Map.find(Reg);
+ assert(F != Map.end());
+ return F->second;
+}
+
+
+inline BitTracker::RegisterCell
+BitTracker::RegisterCell::self(unsigned Reg, uint16_t Width) {
+ RegisterCell RC(Width);
+ for (uint16_t i = 0; i < Width; ++i)
+ RC.Bits[i] = BitValue::self(BitRef(Reg, i));
+ return RC;
+}
+
+
+inline BitTracker::RegisterCell
+BitTracker::RegisterCell::top(uint16_t Width) {
+ RegisterCell RC(Width);
+ for (uint16_t i = 0; i < Width; ++i)
+ RC.Bits[i] = BitValue(BitValue::Top);
+ return RC;
+}
+
+
+inline BitTracker::RegisterCell
+BitTracker::RegisterCell::ref(const RegisterCell &C) {
+ uint16_t W = C.width();
+ RegisterCell RC(W);
+ for (unsigned i = 0; i < W; ++i)
+ RC[i] = BitValue::ref(C[i]);
+ return RC;
+}
+
+
+inline bool BitTracker::CellMapType::has(unsigned Reg) const {
+ return find(Reg) != end();
+}
+
+// A class to evaluate a target's instructions and update the cell maps.
+// This is used internally by the bit tracker. A target that wants to
+// utilize this should implement the evaluation functions (noted below)
+// in a subclass of this class.
+struct BitTracker::MachineEvaluator {
+ MachineEvaluator(const TargetRegisterInfo &T, MachineRegisterInfo &M)
+ : TRI(T), MRI(M) {}
+ virtual ~MachineEvaluator() {}
+
+ uint16_t getRegBitWidth(const RegisterRef &RR) const;
+
+ RegisterCell getCell(const RegisterRef &RR, const CellMapType &M) const;
+ void putCell(const RegisterRef &RR, RegisterCell RC, CellMapType &M) const;
+ // A result of any operation should use refs to the source cells, not
+ // the cells directly. This function is a convenience wrapper to quickly
+ // generate a ref for a cell corresponding to a register reference.
+ RegisterCell getRef(const RegisterRef &RR, const CellMapType &M) const {
+ RegisterCell RC = getCell(RR, M);
+ return RegisterCell::ref(RC);
+ }
+
+ // Helper functions.
+ // Check if a cell is an immediate value (i.e. all bits are either 0 or 1).
+ bool isInt(const RegisterCell &A) const;
+ // Convert cell to an immediate value.
+ uint64_t toInt(const RegisterCell &A) const;
+
+ // Generate cell from an immediate value.
+ RegisterCell eIMM(int64_t V, uint16_t W) const;
+ RegisterCell eIMM(const ConstantInt *CI) const;
+
+ // Arithmetic.
+ RegisterCell eADD(const RegisterCell &A1, const RegisterCell &A2) const;
+ RegisterCell eSUB(const RegisterCell &A1, const RegisterCell &A2) const;
+ RegisterCell eMLS(const RegisterCell &A1, const RegisterCell &A2) const;
+ RegisterCell eMLU(const RegisterCell &A1, const RegisterCell &A2) const;
+
+ // Shifts.
+ RegisterCell eASL(const RegisterCell &A1, uint16_t Sh) const;
+ RegisterCell eLSR(const RegisterCell &A1, uint16_t Sh) const;
+ RegisterCell eASR(const RegisterCell &A1, uint16_t Sh) const;
+
+ // Logical.
+ RegisterCell eAND(const RegisterCell &A1, const RegisterCell &A2) const;
+ RegisterCell eORL(const RegisterCell &A1, const RegisterCell &A2) const;
+ RegisterCell eXOR(const RegisterCell &A1, const RegisterCell &A2) const;
+ RegisterCell eNOT(const RegisterCell &A1) const;
+
+ // Set bit, clear bit.
+ RegisterCell eSET(const RegisterCell &A1, uint16_t BitN) const;
+ RegisterCell eCLR(const RegisterCell &A1, uint16_t BitN) const;
+
+ // Count leading/trailing bits (zeros/ones).
+ RegisterCell eCLB(const RegisterCell &A1, bool B, uint16_t W) const;
+ RegisterCell eCTB(const RegisterCell &A1, bool B, uint16_t W) const;
+
+ // Sign/zero extension.
+ RegisterCell eSXT(const RegisterCell &A1, uint16_t FromN) const;
+ RegisterCell eZXT(const RegisterCell &A1, uint16_t FromN) const;
+
+ // Extract/insert
+ // XTR R,b,e: extract bits from A1 starting at bit b, ending at e-1.
+ // INS R,S,b: take R and replace bits starting from b with S.
+ RegisterCell eXTR(const RegisterCell &A1, uint16_t B, uint16_t E) const;
+ RegisterCell eINS(const RegisterCell &A1, const RegisterCell &A2,
+ uint16_t AtN) const;
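+ // For example, eXTR(A, 0, 16) yields the low halfword of A, and
+ // eINS(A, B, 16) replaces bits [16 .. 16+width(B)-1] of A with B.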
+
+ // User-provided functions for individual targets:
+
+ // Return a sub-register mask that indicates which bits in Reg belong
+ // to the subregister Sub. These bits are assumed to be contiguous in
+ // the super-register, and have the same ordering in the sub-register
+ // as in the super-register. It is valid to call this function with
+ // Sub == 0; in that case the function should return a mask that spans
+ // the entire register Reg (which is what the default implementation
+ // does).
+ virtual BitMask mask(unsigned Reg, unsigned Sub) const;
+ // Indicate whether a given register class should be tracked.
+ virtual bool track(const TargetRegisterClass *RC) const { return true; }
+ // Evaluate a non-branching machine instruction, given the cell map with
+ // the input values. Place the results in the Outputs map. Return "true"
+ // if evaluation succeeded, "false" otherwise.
+ virtual bool evaluate(const MachineInstr *MI, const CellMapType &Inputs,
+ CellMapType &Outputs) const;
+ // Evaluate a branch, given the cell map with the input values. Fill out
+ // a list of all possible branch targets and indicate (through a flag)
+ // whether the branch could fall through. Return "true" if this information
+ // has been successfully computed, "false" otherwise.
+ virtual bool evaluate(const MachineInstr *BI, const CellMapType &Inputs,
+ BranchTargetList &Targets, bool &FallsThru) const = 0;
+
+ const TargetRegisterInfo &TRI;
+ MachineRegisterInfo &MRI;
+};
+
+} // end namespace llvm
+
+#endif
diff --git a/lib/Target/Hexagon/CMakeLists.txt b/lib/Target/Hexagon/CMakeLists.txt
index 758ccc741007..7ab2f0ba01df 100644
--- a/lib/Target/Hexagon/CMakeLists.txt
+++ b/lib/Target/Hexagon/CMakeLists.txt
@@ -12,13 +12,19 @@ tablegen(LLVM HexagonGenSubtargetInfo.inc -gen-subtarget)
add_public_tablegen_target(HexagonCommonTableGen)
add_llvm_target(HexagonCodeGen
+ BitTracker.cpp
HexagonAsmPrinter.cpp
+ HexagonBitTracker.cpp
HexagonCFGOptimizer.cpp
+ HexagonCommonGEP.cpp
HexagonCopyToCombine.cpp
HexagonExpandCondsets.cpp
HexagonExpandPredSpillCode.cpp
HexagonFixupHwLoops.cpp
HexagonFrameLowering.cpp
+ HexagonGenExtract.cpp
+ HexagonGenInsert.cpp
+ HexagonGenPredicate.cpp
HexagonHardwareLoops.cpp
HexagonInstrInfo.cpp
HexagonISelDAGToDAG.cpp
diff --git a/lib/Target/Hexagon/HexagonBitTracker.cpp b/lib/Target/Hexagon/HexagonBitTracker.cpp
new file mode 100644
index 000000000000..021e58a1d08a
--- /dev/null
+++ b/lib/Target/Hexagon/HexagonBitTracker.cpp
@@ -0,0 +1,1174 @@
+//===--- HexagonBitTracker.cpp --------------------------------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/IR/Module.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/raw_ostream.h"
+
+#include "Hexagon.h"
+#include "HexagonInstrInfo.h"
+#include "HexagonRegisterInfo.h"
+#include "HexagonTargetMachine.h"
+#include "HexagonBitTracker.h"
+
+using namespace llvm;
+
+typedef BitTracker BT;
+
+HexagonEvaluator::HexagonEvaluator(const HexagonRegisterInfo &tri,
+ MachineRegisterInfo &mri,
+ const HexagonInstrInfo &tii,
+ MachineFunction &mf)
+ : MachineEvaluator(tri, mri), MF(mf), MFI(*mf.getFrameInfo()), TII(tii) {
+ // Populate the VRX map (VR to extension-type).
+ // Go over all the formal parameters of the function. If a given parameter
+ // P is sign- or zero-extended, locate the virtual register holding that
+ // parameter and create an entry in the VRX map indicating the type of
+ // extension (and the source type).
+ // This is a bit complicated to do accurately, since the memory layout
+ // information is necessary to precisely determine whether an aggregate
+ // parameter will be passed in a register or in memory. What is given in
+ // MRI is the association between the physical register that is live-in
+ // (i.e. holds an argument) and the virtual register that this value will
+ // be copied into. This, by itself, is not sufficient to map the virtual
+ // register back to a formal parameter of the Function (since consecutive
+ // live-ins from MRI may not correspond to consecutive formal parameters
+ // of the Function). To avoid the complications with in-memory arguments,
+ // only consider the initial sequence of formal parameters that are known
+ // to be passed via registers.
+ unsigned AttrIdx = 0;
+ unsigned InVirtReg, InPhysReg = 0;
+ const Function &F = *MF.getFunction();
+ typedef Function::const_arg_iterator arg_iterator;
+ for (arg_iterator I = F.arg_begin(), E = F.arg_end(); I != E; ++I) {
+ AttrIdx++;
+ const Argument &Arg = *I;
+ Type *ATy = Arg.getType();
+ unsigned Width = 0;
+ if (ATy->isIntegerTy())
+ Width = ATy->getIntegerBitWidth();
+ else if (ATy->isPointerTy())
+ Width = 32;
+ // If pointer size is not set through target data, it will default to
+ // Module::AnyPointerSize.
+ if (Width == 0 || Width > 64)
+ break;
+ InPhysReg = getNextPhysReg(InPhysReg, Width);
+ if (!InPhysReg)
+ break;
+ InVirtReg = getVirtRegFor(InPhysReg);
+ if (!InVirtReg)
+ continue;
+ AttributeSet Attrs = F.getAttributes();
+ if (Attrs.hasAttribute(AttrIdx, Attribute::SExt))
+ VRX.insert(std::make_pair(InVirtReg, ExtType(ExtType::SExt, Width)));
+ else if (Attrs.hasAttribute(AttrIdx, Attribute::ZExt))
+ VRX.insert(std::make_pair(InVirtReg, ExtType(ExtType::ZExt, Width)));
+ }
+}
+
+
+BT::BitMask HexagonEvaluator::mask(unsigned Reg, unsigned Sub) const {
+ if (Sub == 0)
+ return MachineEvaluator::mask(Reg, 0);
+ using namespace Hexagon;
+ const TargetRegisterClass *RC = MRI.getRegClass(Reg);
+ unsigned ID = RC->getID();
+ uint16_t RW = getRegBitWidth(RegisterRef(Reg, Sub));
+ switch (ID) {
+ case DoubleRegsRegClassID:
+ return (Sub == subreg_loreg) ? BT::BitMask(0, RW-1)
+ : BT::BitMask(RW, 2*RW-1);
+ default:
+ break;
+ }
+#ifndef NDEBUG
+ dbgs() << PrintReg(Reg, &TRI, Sub) << '\n';
+#endif
+ llvm_unreachable("Unexpected register/subregister");
+}
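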
+
+
+namespace {
+ struct RegisterRefs : public std::vector<BT::RegisterRef> {
+ typedef std::vector<BT::RegisterRef> Base;
+ RegisterRefs(const MachineInstr *MI);
+ const BT::RegisterRef &operator[](unsigned n) const {
+ // The main purpose of this operator is to assert on an out-of-range argument.
+ assert(n < size());
+ return Base::operator[](n);
+ }
+ };
+
+ RegisterRefs::RegisterRefs(const MachineInstr *MI)
+ : Base(MI->getNumOperands()) {
+ for (unsigned i = 0, n = size(); i < n; ++i) {
+ const MachineOperand &MO = MI->getOperand(i);
+ if (MO.isReg())
+ at(i) = BT::RegisterRef(MO);
+ // For indices that don't correspond to registers, the entry remains
+ // default-constructed.
+ }
+ }
+}
+
+
+bool HexagonEvaluator::evaluate(const MachineInstr *MI,
+ const CellMapType &Inputs, CellMapType &Outputs) const {
+ unsigned NumDefs = 0;
+
+ // Sanity verification: there should not be any defs with subregisters.
+ for (unsigned i = 0, n = MI->getNumOperands(); i < n; ++i) {
+ const MachineOperand &MO = MI->getOperand(i);
+ if (!MO.isReg() || !MO.isDef())
+ continue;
+ NumDefs++;
+ assert(MO.getSubReg() == 0);
+ }
+
+ if (NumDefs == 0)
+ return false;
+
+ if (MI->mayLoad())
+ return evaluateLoad(MI, Inputs, Outputs);
+
+ // Check COPY instructions that copy formal parameters into virtual
+ // registers. Such parameters can be sign- or zero-extended at the
+ // call site, and we should take advantage of this knowledge. The MRI
+ // keeps a list of pairs of live-in physical and virtual registers,
+ // which provides information about which virtual registers will hold
+ // the argument values. The function will still contain instructions
+ // defining those virtual registers, and in practice those are COPY
+ // instructions from a physical to a virtual register. In such cases,
+ // applying the argument extension to the virtual register can be seen
+ // as simply mirroring the extension that had already been applied to
+ // the physical register at the call site. If the defining instruction
+ // was not a COPY, it would not be clear how to mirror that extension
+ // on the callee's side. For that reason, only check COPY instructions
+ // for potential extensions.
+ if (MI->isCopy()) {
+ if (evaluateFormalCopy(MI, Inputs, Outputs))
+ return true;
+ }
+
+ // Beyond this point, if any operand is a global, skip that instruction.
+ // The reason is that certain instructions that can take an immediate
+ // operand can also have a global symbol in that operand. Instead of
+ // checking, for each individual instruction, what kind of operand it
+ // can take, do the check here for all of them. Global symbols as
+ // operands generally do not provide any useful information.
+ for (unsigned i = 0, n = MI->getNumOperands(); i < n; ++i) {
+ const MachineOperand &MO = MI->getOperand(i);
+ if (MO.isGlobal() || MO.isBlockAddress() || MO.isSymbol() || MO.isJTI() ||
+ MO.isCPI())
+ return false;
+ }
+
+ RegisterRefs Reg(MI);
+ unsigned Opc = MI->getOpcode();
+ using namespace Hexagon;
+ #define op(i) MI->getOperand(i)
+ #define rc(i) RegisterCell::ref(getCell(Reg[i],Inputs))
+ #define im(i) MI->getOperand(i).getImm()
+
+ // If the instruction has no register operands, skip it.
+ if (Reg.size() == 0)
+ return false;
+
+ // Record result for register in operand 0.
+ auto rr0 = [this,Reg] (const BT::RegisterCell &Val, CellMapType &Outputs)
+ -> bool {
+ putCell(Reg[0], Val, Outputs);
+ return true;
+ };
+ // Get the cell corresponding to the N-th operand.
+ auto cop = [this,Reg,MI,Inputs] (unsigned N, uint16_t W)
+ -> BT::RegisterCell {
+ const MachineOperand &Op = MI->getOperand(N);
+ if (Op.isImm())
+ return eIMM(Op.getImm(), W);
+ if (!Op.isReg())
+ return RegisterCell::self(0, W);
+ assert(getRegBitWidth(Reg[N]) == W && "Register width mismatch");
+ return rc(N);
+ };
+ // Extract RW low bits of the cell.
+ auto lo = [this] (const BT::RegisterCell &RC, uint16_t RW)
+ -> BT::RegisterCell {
+ assert(RW <= RC.width());
+ return eXTR(RC, 0, RW);
+ };
+ // Extract RW high bits of the cell.
+ auto hi = [this] (const BT::RegisterCell &RC, uint16_t RW)
+ -> BT::RegisterCell {
+ uint16_t W = RC.width();
+ assert(RW <= W);
+ return eXTR(RC, W-RW, W);
+ };
+ // Extract N-th halfword (counting from the least significant position).
+ auto half = [this] (const BT::RegisterCell &RC, unsigned N)
+ -> BT::RegisterCell {
+ assert(N*16+16 <= RC.width());
+ return eXTR(RC, N*16, N*16+16);
+ };
+ // Shuffle bits (pick even/odd from cells and merge into result).
+ auto shuffle = [this] (const BT::RegisterCell &Rs, const BT::RegisterCell &Rt,
+ uint16_t BW, bool Odd) -> BT::RegisterCell {
+ uint16_t I = Odd, Ws = Rs.width();
+ assert(Ws == Rt.width());
+ RegisterCell RC = eXTR(Rt, I*BW, I*BW+BW).cat(eXTR(Rs, I*BW, I*BW+BW));
+ I += 2;
+ while (I*BW < Ws) {
+ RC.cat(eXTR(Rt, I*BW, I*BW+BW)).cat(eXTR(Rs, I*BW, I*BW+BW));
+ I += 2;
+ }
+ return RC;
+ };
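+ // For instance (following how "cat" appends at the more significant
+ // end), shuffle(Rs, Rt, 8, false) produces, from the least significant
+ // byte up: Rt.b0, Rs.b0, Rt.b2, Rs.b2, ..., and the Odd variant does
+ // the same with the odd-numbered bytes.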
+
+ // The bitwidth of the 0th operand. In most (if not all) of the
+ // instructions below, the 0th operand is the defined register.
+ // Pre-compute the bitwidth here, because it is needed in many cases
+ // below.
+ uint16_t W0 = (Reg[0].Reg != 0) ? getRegBitWidth(Reg[0]) : 0;
+
+ switch (Opc) {
+ // Transfer immediate:
+
+ case A2_tfrsi:
+ case A2_tfrpi:
+ case CONST32:
+ case CONST32_Float_Real:
+ case CONST32_Int_Real:
+ case CONST64_Float_Real:
+ case CONST64_Int_Real:
+ return rr0(eIMM(im(1), W0), Outputs);
+ case TFR_PdFalse:
+ return rr0(RegisterCell(W0).fill(0, W0, BT::BitValue::Zero), Outputs);
+ case TFR_PdTrue:
+ return rr0(RegisterCell(W0).fill(0, W0, BT::BitValue::One), Outputs);
+ case TFR_FI: {
+ int FI = op(1).getIndex();
+ int Off = op(2).getImm();
+ unsigned A = MFI.getObjectAlignment(FI) + std::abs(Off);
+ unsigned L = Log2_32(A);
+ RegisterCell RC = RegisterCell::self(Reg[0].Reg, W0);
+ RC.fill(0, L, BT::BitValue::Zero);
+ return rr0(RC, Outputs);
+ }
+
+ // Transfer register:
+
+ case A2_tfr:
+ case A2_tfrp:
+ case C2_pxfer_map:
+ return rr0(rc(1), Outputs);
+ case C2_tfrpr: {
+ uint16_t RW = W0;
+ uint16_t PW = 8; // XXX Pred size: getRegBitWidth(Reg[1]);
+ assert(PW <= RW);
+ RegisterCell PC = eXTR(rc(1), 0, PW);
+ RegisterCell RC = RegisterCell(RW).insert(PC, BT::BitMask(0, PW-1));
+ RC.fill(PW, RW, BT::BitValue::Zero);
+ return rr0(RC, Outputs);
+ }
+ case C2_tfrrp: {
+ RegisterCell RC = RegisterCell::self(Reg[0].Reg, W0);
+ W0 = 8; // XXX Pred size
+ return rr0(eINS(RC, eXTR(rc(1), 0, W0), 0), Outputs);
+ }
+
+ // Arithmetic:
+
+ case A2_abs:
+ case A2_absp:
+ // TODO
+ break;
+
+ case A2_addsp: {
+ uint16_t W1 = getRegBitWidth(Reg[1]);
+ assert(W0 == 64 && W1 == 32);
+ RegisterCell CW = RegisterCell(W0).insert(rc(1), BT::BitMask(0, W1-1));
+ RegisterCell RC = eADD(eSXT(CW, W1), rc(2));
+ return rr0(RC, Outputs);
+ }
+ case A2_add:
+ case A2_addp:
+ return rr0(eADD(rc(1), rc(2)), Outputs);
+ case A2_addi:
+ return rr0(eADD(rc(1), eIMM(im(2), W0)), Outputs);
+ case S4_addi_asl_ri: {
+ RegisterCell RC = eADD(eIMM(im(1), W0), eASL(rc(2), im(3)));
+ return rr0(RC, Outputs);
+ }
+ case S4_addi_lsr_ri: {
+ RegisterCell RC = eADD(eIMM(im(1), W0), eLSR(rc(2), im(3)));
+ return rr0(RC, Outputs);
+ }
+ case S4_addaddi: {
+ RegisterCell RC = eADD(rc(1), eADD(rc(2), eIMM(im(3), W0)));
+ return rr0(RC, Outputs);
+ }
+ case M4_mpyri_addi: {
+ RegisterCell M = eMLS(rc(2), eIMM(im(3), W0));
+ RegisterCell RC = eADD(eIMM(im(1), W0), lo(M, W0));
+ return rr0(RC, Outputs);
+ }
+ case M4_mpyrr_addi: {
+ RegisterCell M = eMLS(rc(2), rc(3));
+ RegisterCell RC = eADD(eIMM(im(1), W0), lo(M, W0));
+ return rr0(RC, Outputs);
+ }
+ case M4_mpyri_addr_u2: {
+ RegisterCell M = eMLS(eIMM(im(2), W0), rc(3));
+ RegisterCell RC = eADD(rc(1), lo(M, W0));
+ return rr0(RC, Outputs);
+ }
+ case M4_mpyri_addr: {
+ RegisterCell M = eMLS(rc(2), eIMM(im(3), W0));
+ RegisterCell RC = eADD(rc(1), lo(M, W0));
+ return rr0(RC, Outputs);
+ }
+ case M4_mpyrr_addr: {
+ RegisterCell M = eMLS(rc(2), rc(3));
+ RegisterCell RC = eADD(rc(1), lo(M, W0));
+ return rr0(RC, Outputs);
+ }
+ case S4_subaddi: {
+ RegisterCell RC = eADD(rc(1), eSUB(eIMM(im(2), W0), rc(3)));
+ return rr0(RC, Outputs);
+ }
+ case M2_accii: {
+ RegisterCell RC = eADD(rc(1), eADD(rc(2), eIMM(im(3), W0)));
+ return rr0(RC, Outputs);
+ }
+ case M2_acci: {
+ RegisterCell RC = eADD(rc(1), eADD(rc(2), rc(3)));
+ return rr0(RC, Outputs);
+ }
+ case M2_subacc: {
+ RegisterCell RC = eADD(rc(1), eSUB(rc(2), rc(3)));
+ return rr0(RC, Outputs);
+ }
+ case S2_addasl_rrri: {
+ RegisterCell RC = eADD(rc(1), eASL(rc(2), im(3)));
+ return rr0(RC, Outputs);
+ }
+ case C4_addipc: {
+ RegisterCell RPC = RegisterCell::self(Reg[0].Reg, W0);
+ RPC.fill(0, 2, BT::BitValue::Zero);
+ return rr0(eADD(RPC, eIMM(im(2), W0)), Outputs);
+ }
+ case A2_sub:
+ case A2_subp:
+ return rr0(eSUB(rc(1), rc(2)), Outputs);
+ case A2_subri:
+ return rr0(eSUB(eIMM(im(1), W0), rc(2)), Outputs);
+ case S4_subi_asl_ri: {
+ RegisterCell RC = eSUB(eIMM(im(1), W0), eASL(rc(2), im(3)));
+ return rr0(RC, Outputs);
+ }
+ case S4_subi_lsr_ri: {
+ RegisterCell RC = eSUB(eIMM(im(1), W0), eLSR(rc(2), im(3)));
+ return rr0(RC, Outputs);
+ }
+ case M2_naccii: {
+ RegisterCell RC = eSUB(rc(1), eADD(rc(2), eIMM(im(3), W0)));
+ return rr0(RC, Outputs);
+ }
+ case M2_nacci: {
+ RegisterCell RC = eSUB(rc(1), eADD(rc(2), rc(3)));
+ return rr0(RC, Outputs);
+ }
+ // 32-bit negation is done by "Rd = A2_subri 0, Rs"
+ case A2_negp:
+ return rr0(eSUB(eIMM(0, W0), rc(1)), Outputs);
+
+ case M2_mpy_up: {
+ RegisterCell M = eMLS(rc(1), rc(2));
+ return rr0(hi(M, W0), Outputs);
+ }
+ case M2_dpmpyss_s0:
+ return rr0(eMLS(rc(1), rc(2)), Outputs);
+ case M2_dpmpyss_acc_s0:
+ return rr0(eADD(rc(1), eMLS(rc(2), rc(3))), Outputs);
+ case M2_dpmpyss_nac_s0:
+ return rr0(eSUB(rc(1), eMLS(rc(2), rc(3))), Outputs);
+ case M2_mpyi: {
+ RegisterCell M = eMLS(rc(1), rc(2));
+ return rr0(lo(M, W0), Outputs);
+ }
+ case M2_macsip: {
+ RegisterCell M = eMLS(rc(2), eIMM(im(3), W0));
+ RegisterCell RC = eADD(rc(1), lo(M, W0));
+ return rr0(RC, Outputs);
+ }
+ case M2_macsin: {
+ RegisterCell M = eMLS(rc(2), eIMM(im(3), W0));
+ RegisterCell RC = eSUB(rc(1), lo(M, W0));
+ return rr0(RC, Outputs);
+ }
+ case M2_maci: {
+ RegisterCell M = eMLS(rc(2), rc(3));
+ RegisterCell RC = eADD(rc(1), lo(M, W0));
+ return rr0(RC, Outputs);
+ }
+ case M2_mpysmi: {
+ RegisterCell M = eMLS(rc(1), eIMM(im(2), W0));
+ return rr0(lo(M, 32), Outputs);
+ }
+ case M2_mpysin: {
+ RegisterCell M = eMLS(rc(1), eIMM(-im(2), W0));
+ return rr0(lo(M, 32), Outputs);
+ }
+ case M2_mpysip: {
+ RegisterCell M = eMLS(rc(1), eIMM(im(2), W0));
+ return rr0(lo(M, 32), Outputs);
+ }
+ case M2_mpyu_up: {
+ RegisterCell M = eMLU(rc(1), rc(2));
+ return rr0(hi(M, W0), Outputs);
+ }
+ case M2_dpmpyuu_s0:
+ return rr0(eMLU(rc(1), rc(2)), Outputs);
+ case M2_dpmpyuu_acc_s0:
+ return rr0(eADD(rc(1), eMLU(rc(2), rc(3))), Outputs);
+ case M2_dpmpyuu_nac_s0:
+ return rr0(eSUB(rc(1), eMLU(rc(2), rc(3))), Outputs);
+ //case M2_mpysu_up:
+
+ // Logical/bitwise:
+
+ case A2_andir:
+ return rr0(eAND(rc(1), eIMM(im(2), W0)), Outputs);
+ case A2_and:
+ case A2_andp:
+ return rr0(eAND(rc(1), rc(2)), Outputs);
+ case A4_andn:
+ case A4_andnp:
+ return rr0(eAND(rc(1), eNOT(rc(2))), Outputs);
+ case S4_andi_asl_ri: {
+ RegisterCell RC = eAND(eIMM(im(1), W0), eASL(rc(2), im(3)));
+ return rr0(RC, Outputs);
+ }
+ case S4_andi_lsr_ri: {
+ RegisterCell RC = eAND(eIMM(im(1), W0), eLSR(rc(2), im(3)));
+ return rr0(RC, Outputs);
+ }
+ case M4_and_and:
+ return rr0(eAND(rc(1), eAND(rc(2), rc(3))), Outputs);
+ case M4_and_andn:
+ return rr0(eAND(rc(1), eAND(rc(2), eNOT(rc(3)))), Outputs);
+ case M4_and_or:
+ return rr0(eAND(rc(1), eORL(rc(2), rc(3))), Outputs);
+ case M4_and_xor:
+ return rr0(eAND(rc(1), eXOR(rc(2), rc(3))), Outputs);
+ case A2_orir:
+ return rr0(eORL(rc(1), eIMM(im(2), W0)), Outputs);
+ case A2_or:
+ case A2_orp:
+ return rr0(eORL(rc(1), rc(2)), Outputs);
+ case A4_orn:
+ case A4_ornp:
+ return rr0(eORL(rc(1), eNOT(rc(2))), Outputs);
+ case S4_ori_asl_ri: {
+ RegisterCell RC = eORL(eIMM(im(1), W0), eASL(rc(2), im(3)));
+ return rr0(RC, Outputs);
+ }
+ case S4_ori_lsr_ri: {
+ RegisterCell RC = eORL(eIMM(im(1), W0), eLSR(rc(2), im(3)));
+ return rr0(RC, Outputs);
+ }
+ case M4_or_and:
+ return rr0(eORL(rc(1), eAND(rc(2), rc(3))), Outputs);
+ case M4_or_andn:
+ return rr0(eORL(rc(1), eAND(rc(2), eNOT(rc(3)))), Outputs);
+ case S4_or_andi:
+ case S4_or_andix: {
+ RegisterCell RC = eORL(rc(1), eAND(rc(2), eIMM(im(3), W0)));
+ return rr0(RC, Outputs);
+ }
+ case S4_or_ori: {
+ RegisterCell RC = eORL(rc(1), eORL(rc(2), eIMM(im(3), W0)));
+ return rr0(RC, Outputs);
+ }
+ case M4_or_or:
+ return rr0(eORL(rc(1), eORL(rc(2), rc(3))), Outputs);
+ case M4_or_xor:
+ return rr0(eORL(rc(1), eXOR(rc(2), rc(3))), Outputs);
+ case A2_xor:
+ case A2_xorp:
+ return rr0(eXOR(rc(1), rc(2)), Outputs);
+ case M4_xor_and:
+ return rr0(eXOR(rc(1), eAND(rc(2), rc(3))), Outputs);
+ case M4_xor_andn:
+ return rr0(eXOR(rc(1), eAND(rc(2), eNOT(rc(3)))), Outputs);
+ case M4_xor_or:
+ return rr0(eXOR(rc(1), eORL(rc(2), rc(3))), Outputs);
+ case M4_xor_xacc:
+ return rr0(eXOR(rc(1), eXOR(rc(2), rc(3))), Outputs);
+ case A2_not:
+ case A2_notp:
+ return rr0(eNOT(rc(1)), Outputs);
+
+ case S2_asl_i_r:
+ case S2_asl_i_p:
+ return rr0(eASL(rc(1), im(2)), Outputs);
+ case A2_aslh:
+ return rr0(eASL(rc(1), 16), Outputs);
+ case S2_asl_i_r_acc:
+ case S2_asl_i_p_acc:
+ return rr0(eADD(rc(1), eASL(rc(2), im(3))), Outputs);
+ case S2_asl_i_r_nac:
+ case S2_asl_i_p_nac:
+ return rr0(eSUB(rc(1), eASL(rc(2), im(3))), Outputs);
+ case S2_asl_i_r_and:
+ case S2_asl_i_p_and:
+ return rr0(eAND(rc(1), eASL(rc(2), im(3))), Outputs);
+ case S2_asl_i_r_or:
+ case S2_asl_i_p_or:
+ return rr0(eORL(rc(1), eASL(rc(2), im(3))), Outputs);
+ case S2_asl_i_r_xacc:
+ case S2_asl_i_p_xacc:
+ return rr0(eXOR(rc(1), eASL(rc(2), im(3))), Outputs);
+ case S2_asl_i_vh:
+ case S2_asl_i_vw:
+ // TODO
+ break;
+
+ case S2_asr_i_r:
+ case S2_asr_i_p:
+ return rr0(eASR(rc(1), im(2)), Outputs);
+ case A2_asrh:
+ return rr0(eASR(rc(1), 16), Outputs);
+ case S2_asr_i_r_acc:
+ case S2_asr_i_p_acc:
+ return rr0(eADD(rc(1), eASR(rc(2), im(3))), Outputs);
+ case S2_asr_i_r_nac:
+ case S2_asr_i_p_nac:
+ return rr0(eSUB(rc(1), eASR(rc(2), im(3))), Outputs);
+ case S2_asr_i_r_and:
+ case S2_asr_i_p_and:
+ return rr0(eAND(rc(1), eASR(rc(2), im(3))), Outputs);
+ case S2_asr_i_r_or:
+ case S2_asr_i_p_or:
+ return rr0(eORL(rc(1), eASR(rc(2), im(3))), Outputs);
+ case S2_asr_i_r_rnd: {
+ // The input is first sign-extended to 64 bits, then the output
+ // is truncated back to 32 bits.
+ assert(W0 == 32);
+ RegisterCell XC = eSXT(rc(1).cat(eIMM(0, W0)), W0);
+ RegisterCell RC = eASR(eADD(eASR(XC, im(2)), eIMM(1, 2*W0)), 1);
+ return rr0(eXTR(RC, 0, W0), Outputs);
+ }
+ case S2_asr_i_r_rnd_goodsyntax: {
+ int64_t S = im(2);
+ if (S == 0)
+ return rr0(rc(1), Outputs);
+ // Result: S2_asr_i_r_rnd Rs, u5-1
+ RegisterCell XC = eSXT(rc(1).cat(eIMM(0, W0)), W0);
+ RegisterCell RC = eLSR(eADD(eASR(XC, S-1), eIMM(1, 2*W0)), 1);
+ return rr0(eXTR(RC, 0, W0), Outputs);
+ }
+ case S2_asr_r_vh:
+ case S2_asr_i_vw:
+ case S2_asr_i_svw_trun:
+ // TODO
+ break;
+
+ case S2_lsr_i_r:
+ case S2_lsr_i_p:
+ return rr0(eLSR(rc(1), im(2)), Outputs);
+ case S2_lsr_i_r_acc:
+ case S2_lsr_i_p_acc:
+ return rr0(eADD(rc(1), eLSR(rc(2), im(3))), Outputs);
+ case S2_lsr_i_r_nac:
+ case S2_lsr_i_p_nac:
+ return rr0(eSUB(rc(1), eLSR(rc(2), im(3))), Outputs);
+ case S2_lsr_i_r_and:
+ case S2_lsr_i_p_and:
+ return rr0(eAND(rc(1), eLSR(rc(2), im(3))), Outputs);
+ case S2_lsr_i_r_or:
+ case S2_lsr_i_p_or:
+ return rr0(eORL(rc(1), eLSR(rc(2), im(3))), Outputs);
+ case S2_lsr_i_r_xacc:
+ case S2_lsr_i_p_xacc:
+ return rr0(eXOR(rc(1), eLSR(rc(2), im(3))), Outputs);
+
+ case S2_clrbit_i: {
+ RegisterCell RC = rc(1);
+ RC[im(2)] = BT::BitValue::Zero;
+ return rr0(RC, Outputs);
+ }
+ case S2_setbit_i: {
+ RegisterCell RC = rc(1);
+ RC[im(2)] = BT::BitValue::One;
+ return rr0(RC, Outputs);
+ }
+ case S2_togglebit_i: {
+ RegisterCell RC = rc(1);
+ uint16_t BX = im(2);
+ RC[BX] = RC[BX].is(0) ? BT::BitValue::One
+ : RC[BX].is(1) ? BT::BitValue::Zero
+ : BT::BitValue::self();
+ return rr0(RC, Outputs);
+ }
+
+ case A4_bitspliti: {
+ uint16_t W1 = getRegBitWidth(Reg[1]);
+ uint16_t BX = im(2);
+ // Res.uw[1] = Rs[bx+1:], Res.uw[0] = Rs[0:bx]
+ const BT::BitValue Zero = BT::BitValue::Zero;
+ RegisterCell RZ = RegisterCell(W0).fill(BX, W1, Zero)
+ .fill(W1+(W1-BX), W0, Zero);
+ RegisterCell BF1 = eXTR(rc(1), 0, BX), BF2 = eXTR(rc(1), BX, W1);
+ RegisterCell RC = eINS(eINS(RZ, BF1, 0), BF2, W1);
+ return rr0(RC, Outputs);
+ }
+ case S4_extract:
+ case S4_extractp:
+ case S2_extractu:
+ case S2_extractup: {
+ uint16_t Wd = im(2), Of = im(3);
+ assert(Wd <= W0);
+ if (Wd == 0)
+ return rr0(eIMM(0, W0), Outputs);
+ // If the width extends beyond the register size, pad the register
+ // with 0 bits.
+ RegisterCell Pad = (Wd+Of > W0) ? rc(1).cat(eIMM(0, Wd+Of-W0)) : rc(1);
+ RegisterCell Ext = eXTR(Pad, Of, Wd+Of);
+ // Ext is short, need to extend it with 0s or sign bit.
+ RegisterCell RC = RegisterCell(W0).insert(Ext, BT::BitMask(0, Wd-1));
+ if (Opc == S2_extractu || Opc == S2_extractup)
+ return rr0(eZXT(RC, Wd), Outputs);
+ return rr0(eSXT(RC, Wd), Outputs);
+ }
+ case S2_insert:
+ case S2_insertp: {
+ uint16_t Wd = im(3), Of = im(4);
+ assert(Wd < W0 && Of < W0);
+ // If Wd+Of exceeds W0, the inserted bits are truncated.
+ if (Wd+Of > W0)
+ Wd = W0-Of;
+ if (Wd == 0)
+ return rr0(rc(1), Outputs);
+ return rr0(eINS(rc(1), eXTR(rc(2), 0, Wd), Of), Outputs);
+ }
+
+ // Bit permutations:
+
+ case A2_combineii:
+ case A4_combineii:
+ case A4_combineir:
+ case A4_combineri:
+ case A2_combinew:
+ assert(W0 % 2 == 0);
+ return rr0(cop(2, W0/2).cat(cop(1, W0/2)), Outputs);
+ case A2_combine_ll:
+ case A2_combine_lh:
+ case A2_combine_hl:
+ case A2_combine_hh: {
+ assert(W0 == 32);
+ assert(getRegBitWidth(Reg[1]) == 32 && getRegBitWidth(Reg[2]) == 32);
+ // Low half in the output is 0 for _ll and _hl, 1 otherwise:
+ unsigned LoH = !(Opc == A2_combine_ll || Opc == A2_combine_hl);
+ // High half in the output is 0 for _ll and _lh, 1 otherwise:
+ unsigned HiH = !(Opc == A2_combine_ll || Opc == A2_combine_lh);
+ RegisterCell R1 = rc(1);
+ RegisterCell R2 = rc(2);
+ RegisterCell RC = half(R2, LoH).cat(half(R1, HiH));
+ return rr0(RC, Outputs);
+ }
+ case S2_packhl: {
+ assert(W0 == 64);
+ assert(getRegBitWidth(Reg[1]) == 32 && getRegBitWidth(Reg[2]) == 32);
+ RegisterCell R1 = rc(1);
+ RegisterCell R2 = rc(2);
+ RegisterCell RC = half(R2, 0).cat(half(R1, 0)).cat(half(R2, 1))
+ .cat(half(R1, 1));
+ return rr0(RC, Outputs);
+ }
+ case S2_shuffeb: {
+ RegisterCell RC = shuffle(rc(1), rc(2), 8, false);
+ return rr0(RC, Outputs);
+ }
+ case S2_shuffeh: {
+ RegisterCell RC = shuffle(rc(1), rc(2), 16, false);
+ return rr0(RC, Outputs);
+ }
+ case S2_shuffob: {
+ RegisterCell RC = shuffle(rc(1), rc(2), 8, true);
+ return rr0(RC, Outputs);
+ }
+ case S2_shuffoh: {
+ RegisterCell RC = shuffle(rc(1), rc(2), 16, true);
+ return rr0(RC, Outputs);
+ }
+ case C2_mask: {
+ uint16_t WR = W0;
+ uint16_t WP = 8; // XXX Pred size: getRegBitWidth(Reg[1]);
+ assert(WR == 64 && WP == 8);
+ RegisterCell R1 = rc(1);
+ RegisterCell RC(WR);
+ for (uint16_t i = 0; i < WP; ++i) {
+ const BT::BitValue &V = R1[i];
+ BT::BitValue F = (V.is(0) || V.is(1)) ? V : BT::BitValue::self();
+ RC.fill(i*8, i*8+8, F);
+ }
+ return rr0(RC, Outputs);
+ }
+
+ // Mux:
+
+ case C2_muxii:
+ case C2_muxir:
+ case C2_muxri:
+ case C2_mux: {
+ BT::BitValue PC0 = rc(1)[0];
+ RegisterCell R2 = cop(2, W0);
+ RegisterCell R3 = cop(3, W0);
+ if (PC0.is(0) || PC0.is(1))
+ return rr0(RegisterCell::ref(PC0 ? R2 : R3), Outputs);
+ R2.meet(R3, Reg[0].Reg);
+ return rr0(R2, Outputs);
+ }
+ case C2_vmux:
+ // TODO
+ break;
+
+ // Sign- and zero-extension:
+
+ case A2_sxtb:
+ return rr0(eSXT(rc(1), 8), Outputs);
+ case A2_sxth:
+ return rr0(eSXT(rc(1), 16), Outputs);
+ case A2_sxtw: {
+ uint16_t W1 = getRegBitWidth(Reg[1]);
+ assert(W0 == 64 && W1 == 32);
+ RegisterCell RC = eSXT(rc(1).cat(eIMM(0, W1)), W1);
+ return rr0(RC, Outputs);
+ }
+ case A2_zxtb:
+ return rr0(eZXT(rc(1), 8), Outputs);
+ case A2_zxth:
+ return rr0(eZXT(rc(1), 16), Outputs);
+
+ // Bit count:
+
+ case S2_cl0:
+ case S2_cl0p:
+ // Always produce a 32-bit result.
+ return rr0(eCLB(rc(1), 0/*bit*/, 32), Outputs);
+ case S2_cl1:
+ case S2_cl1p:
+ return rr0(eCLB(rc(1), 1/*bit*/, 32), Outputs);
+ case S2_clb:
+ case S2_clbp: {
+ uint16_t W1 = getRegBitWidth(Reg[1]);
+ RegisterCell R1 = rc(1);
+ BT::BitValue TV = R1[W1-1];
+ if (TV.is(0) || TV.is(1))
+ return rr0(eCLB(R1, TV, 32), Outputs);
+ break;
+ }
+ case S2_ct0:
+ case S2_ct0p:
+ return rr0(eCTB(rc(1), 0/*bit*/, 32), Outputs);
+ case S2_ct1:
+ case S2_ct1p:
+ return rr0(eCTB(rc(1), 1/*bit*/, 32), Outputs);
+ case S5_popcountp:
+ // TODO
+ break;
+
+ case C2_all8: {
+ RegisterCell P1 = rc(1);
+ bool Has0 = false, All1 = true;
+ for (uint16_t i = 0; i < 8/*XXX*/; ++i) {
+ if (!P1[i].is(1))
+ All1 = false;
+ if (!P1[i].is(0))
+ continue;
+ Has0 = true;
+ break;
+ }
+ if (!Has0 && !All1)
+ break;
+ RegisterCell RC(W0);
+ RC.fill(0, W0, (All1 ? BT::BitValue::One : BT::BitValue::Zero));
+ return rr0(RC, Outputs);
+ }
+ case C2_any8: {
+ RegisterCell P1 = rc(1);
+ bool Has1 = false, All0 = true;
+ for (uint16_t i = 0; i < 8/*XXX*/; ++i) {
+ if (!P1[i].is(0))
+ All0 = false;
+ if (!P1[i].is(1))
+ continue;
+ Has1 = true;
+ break;
+ }
+ if (!Has1 && !All0)
+ break;
+ RegisterCell RC(W0);
+ RC.fill(0, W0, (Has1 ? BT::BitValue::One : BT::BitValue::Zero));
+ return rr0(RC, Outputs);
+ }
+ case C2_and:
+ return rr0(eAND(rc(1), rc(2)), Outputs);
+ case C2_andn:
+ return rr0(eAND(rc(1), eNOT(rc(2))), Outputs);
+ case C2_not:
+ return rr0(eNOT(rc(1)), Outputs);
+ case C2_or:
+ return rr0(eORL(rc(1), rc(2)), Outputs);
+ case C2_orn:
+ return rr0(eORL(rc(1), eNOT(rc(2))), Outputs);
+ case C2_xor:
+ return rr0(eXOR(rc(1), rc(2)), Outputs);
+ case C4_and_and:
+ return rr0(eAND(rc(1), eAND(rc(2), rc(3))), Outputs);
+ case C4_and_andn:
+ return rr0(eAND(rc(1), eAND(rc(2), eNOT(rc(3)))), Outputs);
+ case C4_and_or:
+ return rr0(eAND(rc(1), eORL(rc(2), rc(3))), Outputs);
+ case C4_and_orn:
+ return rr0(eAND(rc(1), eORL(rc(2), eNOT(rc(3)))), Outputs);
+ case C4_or_and:
+ return rr0(eORL(rc(1), eAND(rc(2), rc(3))), Outputs);
+ case C4_or_andn:
+ return rr0(eORL(rc(1), eAND(rc(2), eNOT(rc(3)))), Outputs);
+ case C4_or_or:
+ return rr0(eORL(rc(1), eORL(rc(2), rc(3))), Outputs);
+ case C4_or_orn:
+ return rr0(eORL(rc(1), eORL(rc(2), eNOT(rc(3)))), Outputs);
+ case C2_bitsclr:
+ case C2_bitsclri:
+ case C2_bitsset:
+ case C4_nbitsclr:
+ case C4_nbitsclri:
+ case C4_nbitsset:
+ // TODO
+ break;
+ case S2_tstbit_i:
+ case S4_ntstbit_i: {
+ BT::BitValue V = rc(1)[im(2)];
+ if (V.is(0) || V.is(1)) {
+ // If instruction is S2_tstbit_i, test for 1, otherwise test for 0.
+ bool TV = (Opc == S2_tstbit_i);
+ BT::BitValue F = V.is(TV) ? BT::BitValue::One : BT::BitValue::Zero;
+ return rr0(RegisterCell(W0).fill(0, W0, F), Outputs);
+ }
+ break;
+ }
+
+ default:
+ return MachineEvaluator::evaluate(MI, Inputs, Outputs);
+ }
+ #undef im
+ #undef rc
+ #undef op
+ return false;
+}
+
+
+bool HexagonEvaluator::evaluate(const MachineInstr *BI,
+ const CellMapType &Inputs, BranchTargetList &Targets,
+ bool &FallsThru) const {
+ // We need to evaluate one branch at a time. TII::AnalyzeBranch checks
+ // all the branches in a basic block at once, so we cannot use it.
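+ // As an example of what happens below: for "if (!p0) jump <target>",
+ // if bit 0 of p0's cell is a known 1, the branch is not taken and
+ // FallsThru is set; if it is a known 0, only the jump target is added
+ // and FallsThru is cleared; if it is unknown, return "false" and let
+ // the caller treat all successors as executable.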
+ unsigned Opc = BI->getOpcode();
+ bool SimpleBranch = false;
+ bool Negated = false;
+ switch (Opc) {
+ case Hexagon::J2_jumpf:
+ case Hexagon::J2_jumpfnew:
+ case Hexagon::J2_jumpfnewpt:
+ Negated = true;
+ case Hexagon::J2_jumpt:
+ case Hexagon::J2_jumptnew:
+ case Hexagon::J2_jumptnewpt:
+ // Simple branch: if([!]Pn) jump ...
+ // i.e. Op0 = predicate, Op1 = branch target.
+ SimpleBranch = true;
+ break;
+ case Hexagon::J2_jump:
+ Targets.insert(BI->getOperand(0).getMBB());
+ FallsThru = false;
+ return true;
+ default:
+ // If the branch is of unknown type, assume that all successors are
+ // executable.
+ return false;
+ }
+
+ if (!SimpleBranch)
+ return false;
+
+ // BI is a conditional branch if we got here.
+ RegisterRef PR = BI->getOperand(0);
+ RegisterCell PC = getCell(PR, Inputs);
+ const BT::BitValue &Test = PC[0];
+
+ // If the condition is neither true nor false, then it's unknown.
+ if (!Test.is(0) && !Test.is(1))
+ return false;
+
+ // "Test.is(!Negated)" means "branch condition is true".
+ if (!Test.is(!Negated)) {
+ // Condition known to be false.
+ FallsThru = true;
+ return true;
+ }
+
+ Targets.insert(BI->getOperand(1).getMBB());
+ FallsThru = false;
+ return true;
+}
+
+
+bool HexagonEvaluator::evaluateLoad(const MachineInstr *MI,
+ const CellMapType &Inputs, CellMapType &Outputs) const {
+ if (TII.isPredicated(MI))
+ return false;
+ assert(MI->mayLoad() && "A load that mayn't?");
+ unsigned Opc = MI->getOpcode();
+
+ uint16_t BitNum;
+ bool SignEx;
+ using namespace Hexagon;
+
+ switch (Opc) {
+ default:
+ return false;
+
+#if 0
+ // memb_fifo
+ case L2_loadalignb_pbr:
+ case L2_loadalignb_pcr:
+ case L2_loadalignb_pi:
+ // memh_fifo
+ case L2_loadalignh_pbr:
+ case L2_loadalignh_pcr:
+ case L2_loadalignh_pi:
+ // membh
+ case L2_loadbsw2_pbr:
+ case L2_loadbsw2_pci:
+ case L2_loadbsw2_pcr:
+ case L2_loadbsw2_pi:
+ case L2_loadbsw4_pbr:
+ case L2_loadbsw4_pci:
+ case L2_loadbsw4_pcr:
+ case L2_loadbsw4_pi:
+ // memubh
+ case L2_loadbzw2_pbr:
+ case L2_loadbzw2_pci:
+ case L2_loadbzw2_pcr:
+ case L2_loadbzw2_pi:
+ case L2_loadbzw4_pbr:
+ case L2_loadbzw4_pci:
+ case L2_loadbzw4_pcr:
+ case L2_loadbzw4_pi:
+#endif
+
+ case L2_loadrbgp:
+ case L2_loadrb_io:
+ case L2_loadrb_pbr:
+ case L2_loadrb_pci:
+ case L2_loadrb_pcr:
+ case L2_loadrb_pi:
+ case L4_loadrb_abs:
+ case L4_loadrb_ap:
+ case L4_loadrb_rr:
+ case L4_loadrb_ur:
+ BitNum = 8;
+ SignEx = true;
+ break;
+
+ case L2_loadrubgp:
+ case L2_loadrub_io:
+ case L2_loadrub_pbr:
+ case L2_loadrub_pci:
+ case L2_loadrub_pcr:
+ case L2_loadrub_pi:
+ case L4_loadrub_abs:
+ case L4_loadrub_ap:
+ case L4_loadrub_rr:
+ case L4_loadrub_ur:
+ BitNum = 8;
+ SignEx = false;
+ break;
+
+ case L2_loadrhgp:
+ case L2_loadrh_io:
+ case L2_loadrh_pbr:
+ case L2_loadrh_pci:
+ case L2_loadrh_pcr:
+ case L2_loadrh_pi:
+ case L4_loadrh_abs:
+ case L4_loadrh_ap:
+ case L4_loadrh_rr:
+ case L4_loadrh_ur:
+ BitNum = 16;
+ SignEx = true;
+ break;
+
+ case L2_loadruhgp:
+ case L2_loadruh_io:
+ case L2_loadruh_pbr:
+ case L2_loadruh_pci:
+ case L2_loadruh_pcr:
+ case L2_loadruh_pi:
+ case L4_loadruh_rr:
+ case L4_loadruh_abs:
+ case L4_loadruh_ap:
+ case L4_loadruh_ur:
+ BitNum = 16;
+ SignEx = false;
+ break;
+
+ case L2_loadrigp:
+ case L2_loadri_io:
+ case L2_loadri_pbr:
+ case L2_loadri_pci:
+ case L2_loadri_pcr:
+ case L2_loadri_pi:
+ case L2_loadw_locked:
+ case L4_loadri_abs:
+ case L4_loadri_ap:
+ case L4_loadri_rr:
+ case L4_loadri_ur:
+ case LDriw_pred:
+ BitNum = 32;
+ SignEx = true;
+ break;
+
+ case L2_loadrdgp:
+ case L2_loadrd_io:
+ case L2_loadrd_pbr:
+ case L2_loadrd_pci:
+ case L2_loadrd_pcr:
+ case L2_loadrd_pi:
+ case L4_loadd_locked:
+ case L4_loadrd_abs:
+ case L4_loadrd_ap:
+ case L4_loadrd_rr:
+ case L4_loadrd_ur:
+ BitNum = 64;
+ SignEx = true;
+ break;
+ }
+
+ const MachineOperand &MD = MI->getOperand(0);
+ assert(MD.isReg() && MD.isDef());
+ RegisterRef RD = MD;
+
+ uint16_t W = getRegBitWidth(RD);
+ assert(W >= BitNum && BitNum > 0);
+ RegisterCell Res(W);
+
+ for (uint16_t i = 0; i < BitNum; ++i)
+ Res[i] = BT::BitValue::self(BT::BitRef(RD.Reg, i));
+
+ if (SignEx) {
+ const BT::BitValue &Sign = Res[BitNum-1];
+ for (uint16_t i = BitNum; i < W; ++i)
+ Res[i] = BT::BitValue::ref(Sign);
+ } else {
+ for (uint16_t i = BitNum; i < W; ++i)
+ Res[i] = BT::BitValue::Zero;
+ }
+
+ putCell(RD, Res, Outputs);
+ return true;
+}
+
+
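+// Illustration of the intent (a sketch): for a function declared as
+// "define i32 @f(i16 signext %a)", the argument arrives in a physical
+// register and is copied into a virtual register by a COPY. The VRX map
+// built in the constructor records that this virtual register holds a
+// 16-bit sign-extended value, so its cell is rewritten with eSXT(..., 16)
+// below.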
+bool HexagonEvaluator::evaluateFormalCopy(const MachineInstr *MI,
+ const CellMapType &Inputs, CellMapType &Outputs) const {
+ // If MI defines a formal parameter, but is not a copy (loads are handled
+ // in evaluateLoad), then it's not clear what to do.
+ assert(MI->isCopy());
+
+ RegisterRef RD = MI->getOperand(0);
+ RegisterRef RS = MI->getOperand(1);
+ assert(RD.Sub == 0);
+ if (!TargetRegisterInfo::isPhysicalRegister(RS.Reg))
+ return false;
+ RegExtMap::const_iterator F = VRX.find(RD.Reg);
+ if (F == VRX.end())
+ return false;
+
+ uint16_t EW = F->second.Width;
+ // Store RD's cell into the map. This will associate the cell with a
+ // virtual register, and make zero-/sign-extends possible (otherwise we
+ // would be extending "self" bit values, which would have no effect,
+ // since "self" values cannot be references to anything).
+ putCell(RD, getCell(RS, Inputs), Outputs);
+
+ RegisterCell Res;
+ // Read RD's cell from the outputs instead of RS's cell from the inputs:
+ if (F->second.Type == ExtType::SExt)
+ Res = eSXT(getCell(RD, Outputs), EW);
+ else if (F->second.Type == ExtType::ZExt)
+ Res = eZXT(getCell(RD, Outputs), EW);
+
+ putCell(RD, Res, Outputs);
+ return true;
+}
+
+
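+// Given the previous parameter register PReg, return the register that
+// would hold the next parameter of the requested width, based on the
+// sequences below (R0..R5 for 32-bit values, D0..D2 for 64-bit values).
+// For example: getNextPhysReg(0, 32) -> R0, getNextPhysReg(R0, 64) -> D1,
+// and getNextPhysReg(R5, 32) -> 0 (no more parameter registers).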
+unsigned HexagonEvaluator::getNextPhysReg(unsigned PReg, unsigned Width) const {
+ using namespace Hexagon;
+ bool Is64 = DoubleRegsRegClass.contains(PReg);
+ assert(PReg == 0 || Is64 || IntRegsRegClass.contains(PReg));
+
+ static const unsigned Phys32[] = { R0, R1, R2, R3, R4, R5 };
+ static const unsigned Phys64[] = { D0, D1, D2 };
+ const unsigned Num32 = sizeof(Phys32)/sizeof(unsigned);
+ const unsigned Num64 = sizeof(Phys64)/sizeof(unsigned);
+
+ // Return the first parameter register of the required width.
+ if (PReg == 0)
+ return (Width <= 32) ? Phys32[0] : Phys64[0];
+
+ // Set Idx32, Idx64 in such a way that Idx+1 would give the index of the
+ // next register.
+ unsigned Idx32 = 0, Idx64 = 0;
+ if (!Is64) {
+ while (Idx32 < Num32) {
+ if (Phys32[Idx32] == PReg)
+ break;
+ Idx32++;
+ }
+ Idx64 = Idx32/2;
+ } else {
+ while (Idx64 < Num64) {
+ if (Phys64[Idx64] == PReg)
+ break;
+ Idx64++;
+ }
+ Idx32 = Idx64*2+1;
+ }
+
+ if (Width <= 32)
+ return (Idx32+1 < Num32) ? Phys32[Idx32+1] : 0;
+ return (Idx64+1 < Num64) ? Phys64[Idx64+1] : 0;
+}
+
+
+unsigned HexagonEvaluator::getVirtRegFor(unsigned PReg) const {
+ typedef MachineRegisterInfo::livein_iterator iterator;
+ for (iterator I = MRI.livein_begin(), E = MRI.livein_end(); I != E; ++I) {
+ if (I->first == PReg)
+ return I->second;
+ }
+ return 0;
+}
diff --git a/lib/Target/Hexagon/HexagonBitTracker.h b/lib/Target/Hexagon/HexagonBitTracker.h
new file mode 100644
index 000000000000..897af2d71870
--- /dev/null
+++ b/lib/Target/Hexagon/HexagonBitTracker.h
@@ -0,0 +1,64 @@
+//===--- HexagonBitTracker.h ----------------------------------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef HEXAGONBITTRACKER_H
+#define HEXAGONBITTRACKER_H
+
+#include "BitTracker.h"
+#include "llvm/ADT/DenseMap.h"
+
+namespace llvm {
+ class HexagonInstrInfo;
+ class HexagonRegisterInfo;
+
+struct HexagonEvaluator : public BitTracker::MachineEvaluator {
+ typedef BitTracker::CellMapType CellMapType;
+ typedef BitTracker::RegisterRef RegisterRef;
+ typedef BitTracker::RegisterCell RegisterCell;
+ typedef BitTracker::BranchTargetList BranchTargetList;
+
+ HexagonEvaluator(const HexagonRegisterInfo &tri, MachineRegisterInfo &mri,
+ const HexagonInstrInfo &tii, MachineFunction &mf);
+
+ bool evaluate(const MachineInstr *MI, const CellMapType &Inputs,
+ CellMapType &Outputs) const override;
+ bool evaluate(const MachineInstr *BI, const CellMapType &Inputs,
+ BranchTargetList &Targets, bool &FallsThru) const override;
+
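+ // For DoubleRegs this maps the low subregister to BitMask(0, 31) and
+ // the high subregister to BitMask(32, 63); Sub == 0 yields a mask that
+ // spans the whole register (see the implementation in
+ // HexagonBitTracker.cpp).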
+ BitTracker::BitMask mask(unsigned Reg, unsigned Sub) const override;
+
+ MachineFunction &MF;
+ MachineFrameInfo &MFI;
+ const HexagonInstrInfo &TII;
+
+private:
+ bool evaluateLoad(const MachineInstr *MI, const CellMapType &Inputs,
+ CellMapType &Outputs) const;
+ bool evaluateFormalCopy(const MachineInstr *MI, const CellMapType &Inputs,
+ CellMapType &Outputs) const;
+
+ unsigned getNextPhysReg(unsigned PReg, unsigned Width) const;
+ unsigned getVirtRegFor(unsigned PReg) const;
+
+ // Type of formal parameter extension.
+ struct ExtType {
+ enum { SExt, ZExt };
+ char Type;
+ uint16_t Width;
+ ExtType() : Type(0), Width(0) {}
+ ExtType(char t, uint16_t w) : Type(t), Width(w) {}
+ };
+ // Map VR -> extension type.
+ typedef DenseMap<unsigned, ExtType> RegExtMap;
+ RegExtMap VRX;
+};
+
+} // end namespace llvm
+
+#endif
diff --git a/lib/Target/Hexagon/HexagonCommonGEP.cpp b/lib/Target/Hexagon/HexagonCommonGEP.cpp
new file mode 100644
index 000000000000..9f5fac156527
--- /dev/null
+++ b/lib/Target/Hexagon/HexagonCommonGEP.cpp
@@ -0,0 +1,1325 @@
+//===--- HexagonCommonGEP.cpp ---------------------------------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#define DEBUG_TYPE "commgep"
+
+#include "llvm/Pass.h"
+#include "llvm/ADT/FoldingSet.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/Analysis/LoopInfo.h"
+#include "llvm/Analysis/PostDominators.h"
+#include "llvm/CodeGen/MachineFunctionAnalysis.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/Dominators.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/Verifier.h"
+#include "llvm/Support/Allocator.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Transforms/Scalar.h"
+#include "llvm/Transforms/Utils/Local.h"
+
+#include <map>
+#include <set>
+#include <vector>
+
+#include "HexagonTargetMachine.h"
+
+using namespace llvm;
+
+static cl::opt<bool> OptSpeculate("commgep-speculate", cl::init(true),
+ cl::Hidden, cl::ZeroOrMore);
+
+static cl::opt<bool> OptEnableInv("commgep-inv", cl::init(true), cl::Hidden,
+ cl::ZeroOrMore);
+
+static cl::opt<bool> OptEnableConst("commgep-const", cl::init(true),
+ cl::Hidden, cl::ZeroOrMore);
+
+namespace llvm {
+ void initializeHexagonCommonGEPPass(PassRegistry&);
+}
+
+namespace {
+ struct GepNode;
+ typedef std::set<GepNode*> NodeSet;
+ typedef std::map<GepNode*,Value*> NodeToValueMap;
+ typedef std::vector<GepNode*> NodeVect;
+ typedef std::map<GepNode*,NodeVect> NodeChildrenMap;
+ typedef std::set<Use*> UseSet;
+ typedef std::map<GepNode*,UseSet> NodeToUsesMap;
+
+ // Numbering map for gep nodes, used to keep track of their ordering
+ // (for deterministic behavior).
+ struct NodeNumbering : public std::map<const GepNode*,unsigned> {
+ };
+
+ struct NodeOrdering : public NodeNumbering {
+ NodeOrdering() : LastNum(0) {}
+#ifdef _MSC_VER
+ void special_insert_for_special_msvc(const GepNode *N)
+#else
+ using NodeNumbering::insert;
+ void insert(const GepNode* N)
+#endif
+ {
+ insert(std::make_pair(N, ++LastNum));
+ }
+ bool operator() (const GepNode* N1, const GepNode *N2) const {
+ const_iterator F1 = find(N1), F2 = find(N2);
+ assert(F1 != end() && F2 != end());
+ return F1->second < F2->second;
+ }
+ private:
+ unsigned LastNum;
+ };
+
+
+ class HexagonCommonGEP : public FunctionPass {
+ public:
+ static char ID;
+ HexagonCommonGEP() : FunctionPass(ID) {
+ initializeHexagonCommonGEPPass(*PassRegistry::getPassRegistry());
+ }
+ virtual bool runOnFunction(Function &F);
+ virtual const char *getPassName() const {
+ return "Hexagon Common GEP";
+ }
+
+ virtual void getAnalysisUsage(AnalysisUsage &AU) const {
+ AU.addRequired<DominatorTreeWrapperPass>();
+ AU.addPreserved<DominatorTreeWrapperPass>();
+ AU.addRequired<PostDominatorTree>();
+ AU.addPreserved<PostDominatorTree>();
+ AU.addRequired<LoopInfoWrapperPass>();
+ AU.addPreserved<LoopInfoWrapperPass>();
+ FunctionPass::getAnalysisUsage(AU);
+ }
+
+ private:
+ typedef std::map<Value*,GepNode*> ValueToNodeMap;
+ typedef std::vector<Value*> ValueVect;
+ typedef std::map<GepNode*,ValueVect> NodeToValuesMap;
+
+ void getBlockTraversalOrder(BasicBlock *Root, ValueVect &Order);
+ bool isHandledGepForm(GetElementPtrInst *GepI);
+ void processGepInst(GetElementPtrInst *GepI, ValueToNodeMap &NM);
+ void collect();
+ void common();
+
+ BasicBlock *recalculatePlacement(GepNode *Node, NodeChildrenMap &NCM,
+ NodeToValueMap &Loc);
+ BasicBlock *recalculatePlacementRec(GepNode *Node, NodeChildrenMap &NCM,
+ NodeToValueMap &Loc);
+ bool isInvariantIn(Value *Val, Loop *L);
+ bool isInvariantIn(GepNode *Node, Loop *L);
+ bool isInMainPath(BasicBlock *B, Loop *L);
+ BasicBlock *adjustForInvariance(GepNode *Node, NodeChildrenMap &NCM,
+ NodeToValueMap &Loc);
+ void separateChainForNode(GepNode *Node, Use *U, NodeToValueMap &Loc);
+ void separateConstantChains(GepNode *Node, NodeChildrenMap &NCM,
+ NodeToValueMap &Loc);
+ void computeNodePlacement(NodeToValueMap &Loc);
+
+ Value *fabricateGEP(NodeVect &NA, BasicBlock::iterator At,
+ BasicBlock *LocB);
+ void getAllUsersForNode(GepNode *Node, ValueVect &Values,
+ NodeChildrenMap &NCM);
+ void materialize(NodeToValueMap &Loc);
+
+ void removeDeadCode();
+
+ NodeVect Nodes;
+ NodeToUsesMap Uses;
+ NodeOrdering NodeOrder; // Node ordering, for deterministic behavior.
+ SpecificBumpPtrAllocator<GepNode> *Mem;
+ LLVMContext *Ctx;
+ LoopInfo *LI;
+ DominatorTree *DT;
+ PostDominatorTree *PDT;
+ Function *Fn;
+ };
+}
+
+
+char HexagonCommonGEP::ID = 0;
+INITIALIZE_PASS_BEGIN(HexagonCommonGEP, "hcommgep", "Hexagon Common GEP",
+ false, false)
+INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(PostDominatorTree)
+INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass)
+INITIALIZE_PASS_END(HexagonCommonGEP, "hcommgep", "Hexagon Common GEP",
+ false, false)
+
+namespace {
+ struct GepNode {
+ enum {
+ None = 0,
+ Root = 0x01,
+ Internal = 0x02,
+ Used = 0x04
+ };
+
+ uint32_t Flags;
+ union {
+ GepNode *Parent;
+ Value *BaseVal;
+ };
+ Value *Idx;
+ Type *PTy; // Type of the pointer operand.
+
+ GepNode() : Flags(0), Parent(0), Idx(0), PTy(0) {}
+ GepNode(const GepNode *N) : Flags(N->Flags), Idx(N->Idx), PTy(N->PTy) {
+ if (Flags & Root)
+ BaseVal = N->BaseVal;
+ else
+ Parent = N->Parent;
+ }
+ friend raw_ostream &operator<< (raw_ostream &OS, const GepNode &GN);
+ };
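+ // For example, a single GEP such as
+ //   %a = getelementptr %struct.s, %struct.s* %p, i32 0, i32 2
+ // would (roughly) become a two-node chain: a Root node with BaseVal=%p,
+ // Idx=0 and PTy=%struct.s*, followed by an Internal node with Parent set
+ // to the root, Idx=2 and PTy=%struct.s (the last node is also marked Used
+ // if the GEP has non-GEP users).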
+
+
+ Type *next_type(Type *Ty, Value *Idx) {
+ // Advance the type.
+ if (!Ty->isStructTy()) {
+ Type *NextTy = cast<SequentialType>(Ty)->getElementType();
+ return NextTy;
+ }
+ // Otherwise it is a struct type.
+ ConstantInt *CI = dyn_cast<ConstantInt>(Idx);
+ assert(CI && "Struct type with non-constant index");
+ int64_t i = CI->getValue().getSExtValue();
+ Type *NextTy = cast<StructType>(Ty)->getElementType(i);
+ return NextTy;
+ }
+
+
+ raw_ostream &operator<< (raw_ostream &OS, const GepNode &GN) {
+ OS << "{ {";
+ bool Comma = false;
+ if (GN.Flags & GepNode::Root) {
+ OS << "root";
+ Comma = true;
+ }
+ if (GN.Flags & GepNode::Internal) {
+ if (Comma)
+ OS << ',';
+ OS << "internal";
+ Comma = true;
+ }
+ if (GN.Flags & GepNode::Used) {
+ if (Comma)
+ OS << ',';
+ OS << "used";
+ Comma = true;
+ }
+ OS << "} ";
+ if (GN.Flags & GepNode::Root)
+ OS << "BaseVal:" << GN.BaseVal->getName() << '(' << GN.BaseVal << ')';
+ else
+ OS << "Parent:" << GN.Parent;
+
+ OS << " Idx:";
+ if (ConstantInt *CI = dyn_cast<ConstantInt>(GN.Idx))
+ OS << CI->getValue().getSExtValue();
+ else if (GN.Idx->hasName())
+ OS << GN.Idx->getName();
+ else
+ OS << "<anon> =" << *GN.Idx;
+
+ OS << " PTy:";
+ if (GN.PTy->isStructTy()) {
+ StructType *STy = cast<StructType>(GN.PTy);
+ if (!STy->isLiteral())
+ OS << GN.PTy->getStructName();
+ else
+ OS << "<anon-struct>:" << *STy;
+ }
+ else
+ OS << *GN.PTy;
+ OS << " }";
+ return OS;
+ }
+
+
+ template <typename NodeContainer>
+ void dump_node_container(raw_ostream &OS, const NodeContainer &S) {
+ typedef typename NodeContainer::const_iterator const_iterator;
+ for (const_iterator I = S.begin(), E = S.end(); I != E; ++I)
+ OS << *I << ' ' << **I << '\n';
+ }
+
+ raw_ostream &operator<< (raw_ostream &OS,
+ const NodeVect &S) LLVM_ATTRIBUTE_UNUSED;
+ raw_ostream &operator<< (raw_ostream &OS, const NodeVect &S) {
+ dump_node_container(OS, S);
+ return OS;
+ }
+
+
+ raw_ostream &operator<< (raw_ostream &OS,
+ const NodeToUsesMap &M) LLVM_ATTRIBUTE_UNUSED;
+ raw_ostream &operator<< (raw_ostream &OS, const NodeToUsesMap &M){
+ typedef NodeToUsesMap::const_iterator const_iterator;
+ for (const_iterator I = M.begin(), E = M.end(); I != E; ++I) {
+ const UseSet &Us = I->second;
+ OS << I->first << " -> #" << Us.size() << '{';
+ for (UseSet::const_iterator J = Us.begin(), F = Us.end(); J != F; ++J) {
+ User *R = (*J)->getUser();
+ if (R->hasName())
+ OS << ' ' << R->getName();
+ else
+ OS << " <?>(" << *R << ')';
+ }
+ OS << " }\n";
+ }
+ return OS;
+ }
+
+
+ struct in_set {
+ in_set(const NodeSet &S) : NS(S) {}
+ bool operator() (GepNode *N) const {
+ return NS.find(N) != NS.end();
+ }
+ private:
+ const NodeSet &NS;
+ };
+}
+
+
+inline void *operator new(size_t, SpecificBumpPtrAllocator<GepNode> &A) {
+ return A.Allocate();
+}
+
+
+void HexagonCommonGEP::getBlockTraversalOrder(BasicBlock *Root,
+ ValueVect &Order) {
+ // Compute block ordering for a typical DT-based traversal of the flow
+ // graph: "before visiting a block, all of its dominators must have been
+ // visited".
+
+ Order.push_back(Root);
+ DomTreeNode *DTN = DT->getNode(Root);
+ typedef GraphTraits<DomTreeNode*> GTN;
+ typedef GTN::ChildIteratorType Iter;
+ for (Iter I = GTN::child_begin(DTN), E = GTN::child_end(DTN); I != E; ++I)
+ getBlockTraversalOrder((*I)->getBlock(), Order);
+}
+
+
+bool HexagonCommonGEP::isHandledGepForm(GetElementPtrInst *GepI) {
+ // No vector GEPs.
+ if (!GepI->getType()->isPointerTy())
+ return false;
+ // No GEPs without any indices. (Is this possible?)
+ if (GepI->idx_begin() == GepI->idx_end())
+ return false;
+ return true;
+}
+
+
+void HexagonCommonGEP::processGepInst(GetElementPtrInst *GepI,
+ ValueToNodeMap &NM) {
+ DEBUG(dbgs() << "Visiting GEP: " << *GepI << '\n');
+ GepNode *N = new (*Mem) GepNode;
+ Value *PtrOp = GepI->getPointerOperand();
+ ValueToNodeMap::iterator F = NM.find(PtrOp);
+ if (F == NM.end()) {
+ N->BaseVal = PtrOp;
+ N->Flags |= GepNode::Root;
+ } else {
+ // If PtrOp was a GEP instruction, it must have already been processed.
+ // The ValueToNodeMap entry for it is the last gep node in the generated
+ // chain. Link to it here.
+ N->Parent = F->second;
+ }
+ N->PTy = PtrOp->getType();
+ N->Idx = *GepI->idx_begin();
+
+ // Collect the list of users of this GEP instruction. Will add it to the
+ // last node created for it.
+ UseSet Us;
+ for (Value::user_iterator UI = GepI->user_begin(), UE = GepI->user_end();
+ UI != UE; ++UI) {
+ // Check if this gep is used by anything other than other geps that
+ // we will process.
+ if (isa<GetElementPtrInst>(*UI)) {
+ GetElementPtrInst *UserG = cast<GetElementPtrInst>(*UI);
+ if (isHandledGepForm(UserG))
+ continue;
+ }
+ Us.insert(&UI.getUse());
+ }
+ Nodes.push_back(N);
+#ifdef _MSC_VER
+ NodeOrder.special_insert_for_special_msvc(N);
+#else
+ NodeOrder.insert(N);
+#endif
+
+ // Skip the first index operand, since we only handle 0. This dereferences
+ // the pointer operand.
+ GepNode *PN = N;
+ Type *PtrTy = cast<PointerType>(PtrOp->getType())->getElementType();
+ for (User::op_iterator OI = GepI->idx_begin()+1, OE = GepI->idx_end();
+ OI != OE; ++OI) {
+ Value *Op = *OI;
+ GepNode *Nx = new (*Mem) GepNode;
+ Nx->Parent = PN; // Link Nx to the previous node.
+ Nx->Flags |= GepNode::Internal;
+ Nx->PTy = PtrTy;
+ Nx->Idx = Op;
+ Nodes.push_back(Nx);
+#ifdef _MSC_VER
+ NodeOrder.special_insert_for_special_msvc(Nx);
+#else
+ NodeOrder.insert(Nx);
+#endif
+ PN = Nx;
+
+ PtrTy = next_type(PtrTy, Op);
+ }
+
+ // After last node has been created, update the use information.
+ if (!Us.empty()) {
+ PN->Flags |= GepNode::Used;
+ Uses[PN].insert(Us.begin(), Us.end());
+ }
+
+ // Link the last node with the originating GEP instruction. This is to
+ // help with linking chained GEP instructions.
+ NM.insert(std::make_pair(GepI, PN));
+}
+
+
+void HexagonCommonGEP::collect() {
+ // Establish depth-first traversal order of the dominator tree.
+ ValueVect BO;
+ getBlockTraversalOrder(Fn->begin(), BO);
+
+ // The creation of gep nodes requires DT-traversal. When processing a GEP
+ // instruction that uses another GEP instruction as the base pointer, the
+ // gep node for the base pointer should already exist.
+ ValueToNodeMap NM;
+ for (ValueVect::iterator I = BO.begin(), E = BO.end(); I != E; ++I) {
+ BasicBlock *B = cast<BasicBlock>(*I);
+ for (BasicBlock::iterator J = B->begin(), F = B->end(); J != F; ++J) {
+ if (!isa<GetElementPtrInst>(J))
+ continue;
+ GetElementPtrInst *GepI = cast<GetElementPtrInst>(J);
+ if (isHandledGepForm(GepI))
+ processGepInst(GepI, NM);
+ }
+ }
+
+ DEBUG(dbgs() << "Gep nodes after initial collection:\n" << Nodes);
+}
+
+
+namespace {
+ void invert_find_roots(const NodeVect &Nodes, NodeChildrenMap &NCM,
+ NodeVect &Roots) {
+ typedef NodeVect::const_iterator const_iterator;
+ for (const_iterator I = Nodes.begin(), E = Nodes.end(); I != E; ++I) {
+ GepNode *N = *I;
+ if (N->Flags & GepNode::Root) {
+ Roots.push_back(N);
+ continue;
+ }
+ GepNode *PN = N->Parent;
+ NCM[PN].push_back(N);
+ }
+ }
+
+ void nodes_for_root(GepNode *Root, NodeChildrenMap &NCM, NodeSet &Nodes) {
+ NodeVect Work;
+ Work.push_back(Root);
+ Nodes.insert(Root);
+
+ while (!Work.empty()) {
+ NodeVect::iterator First = Work.begin();
+ GepNode *N = *First;
+ Work.erase(First);
+ NodeChildrenMap::iterator CF = NCM.find(N);
+ if (CF != NCM.end()) {
+ Work.insert(Work.end(), CF->second.begin(), CF->second.end());
+ Nodes.insert(CF->second.begin(), CF->second.end());
+ }
+ }
+ }
+}
+
+
+namespace {
+ typedef std::set<NodeSet> NodeSymRel;
+ typedef std::pair<GepNode*,GepNode*> NodePair;
+ typedef std::set<NodePair> NodePairSet;
+
+ const NodeSet *node_class(GepNode *N, NodeSymRel &Rel) {
+ for (NodeSymRel::iterator I = Rel.begin(), E = Rel.end(); I != E; ++I)
+ if (I->count(N))
+ return &*I;
+ return 0;
+ }
+
+ // Create an ordered pair of GepNode pointers. The pair will be used in
+ // determining equality. The only purpose of the ordering is to eliminate
+ // duplication due to the commutativity of equality/non-equality.
+ NodePair node_pair(GepNode *N1, GepNode *N2) {
+ uintptr_t P1 = uintptr_t(N1), P2 = uintptr_t(N2);
+ if (P1 <= P2)
+ return std::make_pair(N1, N2);
+ return std::make_pair(N2, N1);
+ }
+
+ unsigned node_hash(GepNode *N) {
+ // Include everything except flags and parent.
+ FoldingSetNodeID ID;
+ ID.AddPointer(N->Idx);
+ ID.AddPointer(N->PTy);
+ return ID.ComputeHash();
+ }
+
+ bool node_eq(GepNode *N1, GepNode *N2, NodePairSet &Eq, NodePairSet &Ne) {
+ // Don't cache the result for nodes with different hashes. The hash
+ // comparison is fast enough.
+ if (node_hash(N1) != node_hash(N2))
+ return false;
+
+ NodePair NP = node_pair(N1, N2);
+ NodePairSet::iterator FEq = Eq.find(NP);
+ if (FEq != Eq.end())
+ return true;
+ NodePairSet::iterator FNe = Ne.find(NP);
+ if (FNe != Ne.end())
+ return false;
+ // Not previously compared.
+ bool Root1 = N1->Flags & GepNode::Root;
+ bool Root2 = N2->Flags & GepNode::Root;
+ NodePair P = node_pair(N1, N2);
+ // If the Root flag has different values, the nodes are different.
+ // If both nodes are root nodes, but their base pointers differ,
+ // they are different.
+ if (Root1 != Root2 || (Root1 && N1->BaseVal != N2->BaseVal)) {
+ Ne.insert(P);
+ return false;
+ }
+ // Here the root flags are identical, and for root nodes the
+ // base pointers are equal, so the root nodes are equal.
+ // For non-root nodes, compare their parent nodes.
+ if (Root1 || node_eq(N1->Parent, N2->Parent, Eq, Ne)) {
+ Eq.insert(P);
+ return true;
+ }
+ return false;
+ }
+}
+
+
+void HexagonCommonGEP::common() {
+ // The essence of this commoning is finding gep nodes that are equal.
+ // To do this we need to compare all pairs of nodes. To save time,
+ // first, partition the set of all nodes into sets of potentially equal
+ // nodes, and then compare pairs from within each partition.
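+ // As an illustration, two GEPs off the same base, e.g.
+ //   %a = getelementptr %struct.s, %struct.s* %p, i32 0, i32 1
+ //   %b = getelementptr %struct.s, %struct.s* %p, i32 0, i32 2
+ // produce two chains whose root nodes (%p with index 0) compare equal;
+ // commoning keeps a single copy of that shared prefix and reparents the
+ // remaining nodes to it.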
+ typedef std::map<unsigned,NodeSet> NodeSetMap;
+ NodeSetMap MaybeEq;
+
+ for (NodeVect::iterator I = Nodes.begin(), E = Nodes.end(); I != E; ++I) {
+ GepNode *N = *I;
+ unsigned H = node_hash(N);
+ MaybeEq[H].insert(N);
+ }
+
+ // Compute the equivalence relation for the gep nodes. Use two caches,
+ // one for equality and the other for non-equality.
+ NodeSymRel EqRel; // Equality relation (as set of equivalence classes).
+ NodePairSet Eq, Ne; // Caches.
+ for (NodeSetMap::iterator I = MaybeEq.begin(), E = MaybeEq.end();
+ I != E; ++I) {
+ NodeSet &S = I->second;
+ for (NodeSet::iterator NI = S.begin(), NE = S.end(); NI != NE; ++NI) {
+ GepNode *N = *NI;
+ // If node already has a class, then the class must have been created
+ // in a prior iteration of this loop. Since equality is transitive,
+ // nothing more will be added to that class, so skip it.
+ if (node_class(N, EqRel))
+ continue;
+
+ // Create a new class candidate now.
+ NodeSet C;
+ for (NodeSet::iterator NJ = std::next(NI); NJ != NE; ++NJ)
+ if (node_eq(N, *NJ, Eq, Ne))
+ C.insert(*NJ);
+ // If C is empty, N would be the only element in its class. Don't bother
+ // creating a class for it then.
+ if (!C.empty()) {
+ C.insert(N); // Finalize the set before adding it to the relation.
+ std::pair<NodeSymRel::iterator, bool> Ins = EqRel.insert(C);
+ (void)Ins;
+ assert(Ins.second && "Cannot add a class");
+ }
+ }
+ }
+
+ DEBUG({
+ dbgs() << "Gep node equality:\n";
+ for (NodePairSet::iterator I = Eq.begin(), E = Eq.end(); I != E; ++I)
+ dbgs() << "{ " << I->first << ", " << I->second << " }\n";
+
+ dbgs() << "Gep equivalence classes:\n";
+ for (NodeSymRel::iterator I = EqRel.begin(), E = EqRel.end(); I != E; ++I) {
+ dbgs() << '{';
+ const NodeSet &S = *I;
+ for (NodeSet::const_iterator J = S.begin(), F = S.end(); J != F; ++J) {
+ if (J != S.begin())
+ dbgs() << ',';
+ dbgs() << ' ' << *J;
+ }
+ dbgs() << " }\n";
+ }
+ });
+
+
+ // Create a projection from a NodeSet to the minimal element in it.
+ typedef std::map<const NodeSet*,GepNode*> ProjMap;
+ ProjMap PM;
+ for (NodeSymRel::iterator I = EqRel.begin(), E = EqRel.end(); I != E; ++I) {
+ const NodeSet &S = *I;
+ GepNode *Min = *std::min_element(S.begin(), S.end(), NodeOrder);
+ std::pair<ProjMap::iterator,bool> Ins = PM.insert(std::make_pair(&S, Min));
+ (void)Ins;
+ assert(Ins.second && "Cannot add minimal element");
+
+ // Update the min element's flags, and user list.
+ uint32_t Flags = 0;
+ UseSet &MinUs = Uses[Min];
+ for (NodeSet::iterator J = S.begin(), F = S.end(); J != F; ++J) {
+ GepNode *N = *J;
+ uint32_t NF = N->Flags;
+ // If N is used, append all original values of N to the list of
+ // original values of Min.
+ if (NF & GepNode::Used)
+ MinUs.insert(Uses[N].begin(), Uses[N].end());
+ Flags |= NF;
+ }
+ if (MinUs.empty())
+ Uses.erase(Min);
+
+ // The collected flags should include all the flags from the min element.
+ assert((Min->Flags & Flags) == Min->Flags);
+ Min->Flags = Flags;
+ }
+
+ // Commoning: for each non-root gep node, replace "Parent" with the
+ // selected (minimum) node from the corresponding equivalence class.
+ // If a given parent does not have an equivalence class, leave it
+ // unchanged (it means that it's the only element in its class).
+ for (NodeVect::iterator I = Nodes.begin(), E = Nodes.end(); I != E; ++I) {
+ GepNode *N = *I;
+ if (N->Flags & GepNode::Root)
+ continue;
+ const NodeSet *PC = node_class(N->Parent, EqRel);
+ if (!PC)
+ continue;
+ ProjMap::iterator F = PM.find(PC);
+ if (F == PM.end())
+ continue;
+ // Found a replacement, use it.
+ GepNode *Rep = F->second;
+ N->Parent = Rep;
+ }
+
+ DEBUG(dbgs() << "Gep nodes after commoning:\n" << Nodes);
+
+ // Finally, erase the nodes that are no longer used.
+ NodeSet Erase;
+ for (NodeVect::iterator I = Nodes.begin(), E = Nodes.end(); I != E; ++I) {
+ GepNode *N = *I;
+ const NodeSet *PC = node_class(N, EqRel);
+ if (!PC)
+ continue;
+ ProjMap::iterator F = PM.find(PC);
+ if (F == PM.end())
+ continue;
+ if (N == F->second)
+ continue;
+ // Node for removal.
+ Erase.insert(*I);
+ }
+ NodeVect::iterator NewE = std::remove_if(Nodes.begin(), Nodes.end(),
+ in_set(Erase));
+ Nodes.resize(std::distance(Nodes.begin(), NewE));
+
+ DEBUG(dbgs() << "Gep nodes after post-commoning cleanup:\n" << Nodes);
+}
+
+
+namespace {
+ template <typename T>
+ BasicBlock *nearest_common_dominator(DominatorTree *DT, T &Blocks) {
+ DEBUG({
+ dbgs() << "NCD of {";
+ for (typename T::iterator I = Blocks.begin(), E = Blocks.end();
+ I != E; ++I) {
+ if (!*I)
+ continue;
+ BasicBlock *B = cast<BasicBlock>(*I);
+ dbgs() << ' ' << B->getName();
+ }
+ dbgs() << " }\n";
+ });
+
+ // Allow null basic blocks in Blocks. In such cases, return 0.
+ typename T::iterator I = Blocks.begin(), E = Blocks.end();
+ if (I == E || !*I)
+ return 0;
+ BasicBlock *Dom = cast<BasicBlock>(*I);
+ while (++I != E) {
+ BasicBlock *B = cast_or_null<BasicBlock>(*I);
+ Dom = B ? DT->findNearestCommonDominator(Dom, B) : 0;
+ if (!Dom)
+ return 0;
+ }
+ DEBUG(dbgs() << "computed:" << Dom->getName() << '\n');
+ return Dom;
+ }
+
+ template <typename T>
+ BasicBlock *nearest_common_dominatee(DominatorTree *DT, T &Blocks) {
+ // If two blocks, A and B, dominate a block C, then A dominates B,
+ // or B dominates A.
+ typename T::iterator I = Blocks.begin(), E = Blocks.end();
+ // Find the first non-null block.
+ while (I != E && !*I)
+ ++I;
+ if (I == E)
+ return DT->getRoot();
+ BasicBlock *DomB = cast<BasicBlock>(*I);
+ while (++I != E) {
+ if (!*I)
+ continue;
+ BasicBlock *B = cast<BasicBlock>(*I);
+ if (DT->dominates(B, DomB))
+ continue;
+ if (!DT->dominates(DomB, B))
+ return 0;
+ DomB = B;
+ }
+ return DomB;
+ }
+
+ // Find the first use in B of any value from Values. If no such use,
+ // return B->end().
+ template <typename T>
+ BasicBlock::iterator first_use_of_in_block(T &Values, BasicBlock *B) {
+ BasicBlock::iterator FirstUse = B->end(), BEnd = B->end();
+ typedef typename T::iterator iterator;
+ for (iterator I = Values.begin(), E = Values.end(); I != E; ++I) {
+ Value *V = *I;
+ // If V is used in a PHI node, the use belongs to the incoming block,
+ // not the block with the PHI node. In the incoming block, the use
+ // would be considered as being at the end of it, so it cannot
+ // influence the position of the first use (which is assumed to be
+ // at the end to start with).
+ if (isa<PHINode>(V))
+ continue;
+ if (!isa<Instruction>(V))
+ continue;
+ Instruction *In = cast<Instruction>(V);
+ if (In->getParent() != B)
+ continue;
+ BasicBlock::iterator It = In;
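+ // A larger distance to the end of the block means an earlier position in
+ // it, so this keeps the earliest use seen so far.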
+ if (std::distance(FirstUse, BEnd) < std::distance(It, BEnd))
+ FirstUse = It;
+ }
+ return FirstUse;
+ }
+
+ bool is_empty(const BasicBlock *B) {
+ return B->empty() || (&*B->begin() == B->getTerminator());
+ }
+}
+
+
+BasicBlock *HexagonCommonGEP::recalculatePlacement(GepNode *Node,
+ NodeChildrenMap &NCM, NodeToValueMap &Loc) {
+ DEBUG(dbgs() << "Loc for node:" << Node << '\n');
+ // Recalculate the placement for Node, assuming that the locations of
+ // its children in Loc are valid.
+ // Return 0 if there is no valid placement for Node (for example, it
+ // uses an index value that is not available at the location required
+ // to dominate all children, etc.).
+
+ // Find the nearest common dominator for:
+ // - all users, if the node is used, and
+ // - all children.
+ ValueVect Bs;
+ if (Node->Flags & GepNode::Used) {
+ // Append all blocks with uses of the original values to the
+ // block vector Bs.
+ NodeToUsesMap::iterator UF = Uses.find(Node);
+ assert(UF != Uses.end() && "Used node with no use information");
+ UseSet &Us = UF->second;
+ for (UseSet::iterator I = Us.begin(), E = Us.end(); I != E; ++I) {
+ Use *U = *I;
+ User *R = U->getUser();
+ if (!isa<Instruction>(R))
+ continue;
+ BasicBlock *PB = isa<PHINode>(R)
+ ? cast<PHINode>(R)->getIncomingBlock(*U)
+ : cast<Instruction>(R)->getParent();
+ Bs.push_back(PB);
+ }
+ }
+ // Append the location of each child.
+ NodeChildrenMap::iterator CF = NCM.find(Node);
+ if (CF != NCM.end()) {
+ NodeVect &Cs = CF->second;
+ for (NodeVect::iterator I = Cs.begin(), E = Cs.end(); I != E; ++I) {
+ GepNode *CN = *I;
+ NodeToValueMap::iterator LF = Loc.find(CN);
+ // If the child is only used in GEP instructions (i.e. is not used in
+ // non-GEP instructions), the nearest dominator computed for it may
+ // have been null. In such a case it won't have a location available.
+ if (LF == Loc.end())
+ continue;
+ Bs.push_back(LF->second);
+ }
+ }
+
+ BasicBlock *DomB = nearest_common_dominator(DT, Bs);
+ if (!DomB)
+ return 0;
+ // Check if the index used by Node dominates the computed dominator.
+ Instruction *IdxI = dyn_cast<Instruction>(Node->Idx);
+ if (IdxI && !DT->dominates(IdxI->getParent(), DomB))
+ return 0;
+
+ // Avoid putting nodes into empty blocks.
+ while (is_empty(DomB)) {
+ DomTreeNode *N = (*DT)[DomB]->getIDom();
+ if (!N)
+ break;
+ DomB = N->getBlock();
+ }
+
+ // Otherwise, DomB is fine. Update the location map.
+ Loc[Node] = DomB;
+ return DomB;
+}
+
+
+BasicBlock *HexagonCommonGEP::recalculatePlacementRec(GepNode *Node,
+ NodeChildrenMap &NCM, NodeToValueMap &Loc) {
+ DEBUG(dbgs() << "LocRec begin for node:" << Node << '\n');
+ // Recalculate the placement of Node, after recursively recalculating the
+ // placements of all its children.
+ NodeChildrenMap::iterator CF = NCM.find(Node);
+ if (CF != NCM.end()) {
+ NodeVect &Cs = CF->second;
+ for (NodeVect::iterator I = Cs.begin(), E = Cs.end(); I != E; ++I)
+ recalculatePlacementRec(*I, NCM, Loc);
+ }
+ BasicBlock *LB = recalculatePlacement(Node, NCM, Loc);
+ DEBUG(dbgs() << "LocRec end for node:" << Node << '\n');
+ return LB;
+}
+
+
+bool HexagonCommonGEP::isInvariantIn(Value *Val, Loop *L) {
+ if (isa<Constant>(Val) || isa<Argument>(Val))
+ return true;
+ Instruction *In = dyn_cast<Instruction>(Val);
+ if (!In)
+ return false;
+ BasicBlock *HdrB = L->getHeader(), *DefB = In->getParent();
+ return DT->properlyDominates(DefB, HdrB);
+}
+
+
+bool HexagonCommonGEP::isInvariantIn(GepNode *Node, Loop *L) {
+ if (Node->Flags & GepNode::Root)
+ if (!isInvariantIn(Node->BaseVal, L))
+ return false;
+ return isInvariantIn(Node->Idx, L);
+}
+
+
+bool HexagonCommonGEP::isInMainPath(BasicBlock *B, Loop *L) {
+ BasicBlock *HB = L->getHeader();
+ BasicBlock *LB = L->getLoopLatch();
+ // B must post-dominate the loop header or dominate the loop latch.
+ if (PDT->dominates(B, HB))
+ return true;
+ if (LB && DT->dominates(B, LB))
+ return true;
+ return false;
+}
+
+
+namespace {
+ BasicBlock *preheader(DominatorTree *DT, Loop *L) {
+ if (BasicBlock *PH = L->getLoopPreheader())
+ return PH;
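+ // No dedicated preheader: optionally fall back to the immediate dominator
+ // of the header. That block is not guaranteed to be guarded by the loop
+ // condition, which is why this is treated as speculation.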
+ if (!OptSpeculate)
+ return 0;
+ DomTreeNode *DN = DT->getNode(L->getHeader());
+ if (!DN)
+ return 0;
+ return DN->getIDom()->getBlock();
+ }
+}
+
+
+BasicBlock *HexagonCommonGEP::adjustForInvariance(GepNode *Node,
+ NodeChildrenMap &NCM, NodeToValueMap &Loc) {
+ // Find the "topmost" location for Node: it must be dominated by both,
+ // its parent (or the BaseVal, if it's a root node), and by the index
+ // value.
+ ValueVect Bs;
+ if (Node->Flags & GepNode::Root) {
+ if (Instruction *PIn = dyn_cast<Instruction>(Node->BaseVal))
+ Bs.push_back(PIn->getParent());
+ } else {
+ Bs.push_back(Loc[Node->Parent]);
+ }
+ if (Instruction *IIn = dyn_cast<Instruction>(Node->Idx))
+ Bs.push_back(IIn->getParent());
+ BasicBlock *TopB = nearest_common_dominatee(DT, Bs);
+
+ // Traverse the loop nest upwards until we find a loop in which Node
+ // is no longer invariant, or until we get to the upper limit of Node's
+ // placement. The traversal will also stop when a suitable "preheader"
+ // cannot be found for a given loop. The "preheader" may actually be
+ // a regular block outside of the loop (i.e. not guarded), in which case
+ // the Node will be speculated.
+ // For nodes that are not in the main path of the containing loop (i.e.
+ // are not executed in each iteration), do not move them out of the loop.
+ BasicBlock *LocB = cast_or_null<BasicBlock>(Loc[Node]);
+ if (LocB) {
+ Loop *Lp = LI->getLoopFor(LocB);
+ while (Lp) {
+ if (!isInvariantIn(Node, Lp) || !isInMainPath(LocB, Lp))
+ break;
+ BasicBlock *NewLoc = preheader(DT, Lp);
+ if (!NewLoc || !DT->dominates(TopB, NewLoc))
+ break;
+ Lp = Lp->getParentLoop();
+ LocB = NewLoc;
+ }
+ }
+ Loc[Node] = LocB;
+
+ // Recursively compute the locations of all children nodes.
+ NodeChildrenMap::iterator CF = NCM.find(Node);
+ if (CF != NCM.end()) {
+ NodeVect &Cs = CF->second;
+ for (NodeVect::iterator I = Cs.begin(), E = Cs.end(); I != E; ++I)
+ adjustForInvariance(*I, NCM, Loc);
+ }
+ return LocB;
+}
+
+
+namespace {
+ struct LocationAsBlock {
+ LocationAsBlock(const NodeToValueMap &L) : Map(L) {}
+ const NodeToValueMap &Map;
+ };
+
+ raw_ostream &operator<< (raw_ostream &OS,
+ const LocationAsBlock &Loc) LLVM_ATTRIBUTE_UNUSED;
+ raw_ostream &operator<< (raw_ostream &OS, const LocationAsBlock &Loc) {
+ for (NodeToValueMap::const_iterator I = Loc.Map.begin(), E = Loc.Map.end();
+ I != E; ++I) {
+ OS << I->first << " -> ";
+ BasicBlock *B = cast<BasicBlock>(I->second);
+ OS << B->getName() << '(' << B << ')';
+ OS << '\n';
+ }
+ return OS;
+ }
+
+ inline bool is_constant(GepNode *N) {
+ return isa<ConstantInt>(N->Idx);
+ }
+}
+
+
+void HexagonCommonGEP::separateChainForNode(GepNode *Node, Use *U,
+ NodeToValueMap &Loc) {
+ User *R = U->getUser();
+ DEBUG(dbgs() << "Separating chain for node (" << Node << ") user: "
+ << *R << '\n');
+ BasicBlock *PB = cast<Instruction>(R)->getParent();
+
+ GepNode *N = Node;
+ GepNode *C = 0, *NewNode = 0;
+ while (is_constant(N) && !(N->Flags & GepNode::Root)) {
+ // XXX if (single-use) dont-replicate;
+ GepNode *NewN = new (*Mem) GepNode(N);
+ Nodes.push_back(NewN);
+ Loc[NewN] = PB;
+
+ if (N == Node)
+ NewNode = NewN;
+ NewN->Flags &= ~GepNode::Used;
+ if (C)
+ C->Parent = NewN;
+ C = NewN;
+ N = N->Parent;
+ }
+ if (!NewNode)
+ return;
+
+ // Move over all uses that share the same user as U from Node to NewNode.
+ NodeToUsesMap::iterator UF = Uses.find(Node);
+ assert(UF != Uses.end());
+ UseSet &Us = UF->second;
+ UseSet NewUs;
+ for (UseSet::iterator I = Us.begin(); I != Us.end(); ) {
+ User *S = (*I)->getUser();
+ UseSet::iterator Nx = std::next(I);
+ if (S == R) {
+ NewUs.insert(*I);
+ Us.erase(I);
+ }
+ I = Nx;
+ }
+ if (Us.empty()) {
+ Node->Flags &= ~GepNode::Used;
+ Uses.erase(UF);
+ }
+
+ // Should at least have U in NewUs.
+ NewNode->Flags |= GepNode::Used;
+ DEBUG(dbgs() << "new node: " << NewNode << " " << *NewNode << '\n');
+ assert(!NewUs.empty());
+ Uses[NewNode] = NewUs;
+}
+
+
+void HexagonCommonGEP::separateConstantChains(GepNode *Node,
+ NodeChildrenMap &NCM, NodeToValueMap &Loc) {
+ // First approximation: extract all chains.
+ NodeSet Ns;
+ nodes_for_root(Node, NCM, Ns);
+
+ DEBUG(dbgs() << "Separating constant chains for node: " << Node << '\n');
+ // Collect all used nodes together with the uses from loads and stores,
+ // where the GEP node could be folded into the load/store instruction.
+ NodeToUsesMap FNs; // Foldable nodes.
+ for (NodeSet::iterator I = Ns.begin(), E = Ns.end(); I != E; ++I) {
+ GepNode *N = *I;
+ if (!(N->Flags & GepNode::Used))
+ continue;
+ NodeToUsesMap::iterator UF = Uses.find(N);
+ assert(UF != Uses.end());
+ UseSet &Us = UF->second;
+ // Loads/stores that use the node N.
+ UseSet LSs;
+ for (UseSet::iterator J = Us.begin(), F = Us.end(); J != F; ++J) {
+ Use *U = *J;
+ User *R = U->getUser();
+ // We're interested in uses that provide the address. It can happen
+ // that the value may also be provided via GEP, but we won't handle
+ // those cases here for now.
+ if (LoadInst *Ld = dyn_cast<LoadInst>(R)) {
+ unsigned PtrX = LoadInst::getPointerOperandIndex();
+ if (&Ld->getOperandUse(PtrX) == U)
+ LSs.insert(U);
+ } else if (StoreInst *St = dyn_cast<StoreInst>(R)) {
+ unsigned PtrX = StoreInst::getPointerOperandIndex();
+ if (&St->getOperandUse(PtrX) == U)
+ LSs.insert(U);
+ }
+ }
+ // Even if the total use count is 1, separating the chain may still be
+ // beneficial, since the constant chain may be longer than the GEP alone
+ // would be (e.g. if the parent node has a constant index and also has
+ // other children).
+ if (!LSs.empty())
+ FNs.insert(std::make_pair(N, LSs));
+ }
+
+ DEBUG(dbgs() << "Nodes with foldable users:\n" << FNs);
+
+ for (NodeToUsesMap::iterator I = FNs.begin(), E = FNs.end(); I != E; ++I) {
+ GepNode *N = I->first;
+ UseSet &Us = I->second;
+ for (UseSet::iterator J = Us.begin(), F = Us.end(); J != F; ++J)
+ separateChainForNode(N, *J, Loc);
+ }
+}
+
+
+void HexagonCommonGEP::computeNodePlacement(NodeToValueMap &Loc) {
+ // Compute the inverse of the Node.Parent links. Also, collect the set
+ // of root nodes.
+ NodeChildrenMap NCM;
+ NodeVect Roots;
+ invert_find_roots(Nodes, NCM, Roots);
+
+ // Compute the initial placement determined by the users' locations, and
+ // the locations of the child nodes.
+ for (NodeVect::iterator I = Roots.begin(), E = Roots.end(); I != E; ++I)
+ recalculatePlacementRec(*I, NCM, Loc);
+
+ DEBUG(dbgs() << "Initial node placement:\n" << LocationAsBlock(Loc));
+
+ if (OptEnableInv) {
+ for (NodeVect::iterator I = Roots.begin(), E = Roots.end(); I != E; ++I)
+ adjustForInvariance(*I, NCM, Loc);
+
+ DEBUG(dbgs() << "Node placement after adjustment for invariance:\n"
+ << LocationAsBlock(Loc));
+ }
+ if (OptEnableConst) {
+ for (NodeVect::iterator I = Roots.begin(), E = Roots.end(); I != E; ++I)
+ separateConstantChains(*I, NCM, Loc);
+ }
+ DEBUG(dbgs() << "Node use information:\n" << Uses);
+
+ // At the moment, there is no further refinement of the initial placement.
+ // Such a refinement could include splitting the nodes if they are placed
+ // too far from some of its users.
+
+ DEBUG(dbgs() << "Final node placement:\n" << LocationAsBlock(Loc));
+}
+
+
+Value *HexagonCommonGEP::fabricateGEP(NodeVect &NA, BasicBlock::iterator At,
+ BasicBlock *LocB) {
+ DEBUG(dbgs() << "Fabricating GEP in " << LocB->getName()
+ << " for nodes:\n" << NA);
+ unsigned Num = NA.size();
+ GepNode *RN = NA[0];
+ assert((RN->Flags & GepNode::Root) && "Creating GEP for non-root");
+
+ Value *NewInst = 0;
+ Value *Input = RN->BaseVal;
+ Value **IdxList = new Value*[Num+1];
+ unsigned nax = 0;
+ do {
+ unsigned IdxC = 0;
+ // If the type of the input of the first node is not a pointer,
+ // we need to add an artificial i32 0 to the indices (because the
+ // actual input in the IR will be a pointer).
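+ // E.g. when resuming after an intermediate GEP, the node's PTy is the
+ // pointee type (say %struct.s) while the new input is a %struct.s*, so the
+ // fabricated GEP gets a leading "i32 0" to keep the addressing unchanged.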
+ if (!NA[nax]->PTy->isPointerTy()) {
+ Type *Int32Ty = Type::getInt32Ty(*Ctx);
+ IdxList[IdxC++] = ConstantInt::get(Int32Ty, 0);
+ }
+
+ // Keep adding indices from NA until we have to stop and generate
+ // an "intermediate" GEP.
+ while (++nax <= Num) {
+ GepNode *N = NA[nax-1];
+ IdxList[IdxC++] = N->Idx;
+ if (nax < Num) {
+ // We have to stop, if the expected type of the output of this node
+ // is not the same as the input type of the next node.
+ Type *NextTy = next_type(N->PTy, N->Idx);
+ if (NextTy != NA[nax]->PTy)
+ break;
+ }
+ }
+ ArrayRef<Value*> A(IdxList, IdxC);
+ Type *InpTy = Input->getType();
+ Type *ElTy = cast<PointerType>(InpTy->getScalarType())->getElementType();
+ NewInst = GetElementPtrInst::Create(ElTy, Input, A, "cgep", At);
+ DEBUG(dbgs() << "new GEP: " << *NewInst << '\n');
+ Input = NewInst;
+ } while (nax <= Num);
+
+ delete[] IdxList;
+ return NewInst;
+}
+
+
+void HexagonCommonGEP::getAllUsersForNode(GepNode *Node, ValueVect &Values,
+ NodeChildrenMap &NCM) {
+ NodeVect Work;
+ Work.push_back(Node);
+
+ while (!Work.empty()) {
+ NodeVect::iterator First = Work.begin();
+ GepNode *N = *First;
+ Work.erase(First);
+ if (N->Flags & GepNode::Used) {
+ NodeToUsesMap::iterator UF = Uses.find(N);
+ assert(UF != Uses.end() && "No use information for used node");
+ UseSet &Us = UF->second;
+ for (UseSet::iterator I = Us.begin(), E = Us.end(); I != E; ++I)
+ Values.push_back((*I)->getUser());
+ }
+ NodeChildrenMap::iterator CF = NCM.find(N);
+ if (CF != NCM.end()) {
+ NodeVect &Cs = CF->second;
+ Work.insert(Work.end(), Cs.begin(), Cs.end());
+ }
+ }
+}
+
+
+void HexagonCommonGEP::materialize(NodeToValueMap &Loc) {
+ DEBUG(dbgs() << "Nodes before materialization:\n" << Nodes << '\n');
+ NodeChildrenMap NCM;
+ NodeVect Roots;
+ // Compute the inversion again, since computing placement could alter the
+ // "parent" relation between nodes.
+ invert_find_roots(Nodes, NCM, Roots);
+
+ while (!Roots.empty()) {
+ NodeVect::iterator First = Roots.begin();
+ GepNode *Root = *First, *Last = *First;
+ Roots.erase(First);
+
+ NodeVect NA; // Nodes to assemble.
+ // Append to NA all child nodes up to (and including) the first child
+ // that:
+ // (1) has more than 1 child, or
+ // (2) is used, or
+ // (3) has a child located in a different block.
+ bool LastUsed = false;
+ unsigned LastCN = 0;
+ // The location may be null if the computation failed (it can legitimately
+ // happen for nodes created from dead GEPs).
+ Value *LocV = Loc[Last];
+ if (!LocV)
+ continue;
+ BasicBlock *LastB = cast<BasicBlock>(LocV);
+ do {
+ NA.push_back(Last);
+ LastUsed = (Last->Flags & GepNode::Used);
+ if (LastUsed)
+ break;
+ NodeChildrenMap::iterator CF = NCM.find(Last);
+ LastCN = (CF != NCM.end()) ? CF->second.size() : 0;
+ if (LastCN != 1)
+ break;
+ GepNode *Child = CF->second.front();
+ BasicBlock *ChildB = cast_or_null<BasicBlock>(Loc[Child]);
+ if (ChildB != 0 && LastB != ChildB)
+ break;
+ Last = Child;
+ } while (true);
+
+ BasicBlock::iterator InsertAt = LastB->getTerminator();
+ if (LastUsed || LastCN > 0) {
+ ValueVect Urs;
+ getAllUsersForNode(Root, Urs, NCM);
+ BasicBlock::iterator FirstUse = first_use_of_in_block(Urs, LastB);
+ if (FirstUse != LastB->end())
+ InsertAt = FirstUse;
+ }
+
+ // Generate a new instruction for NA.
+ Value *NewInst = fabricateGEP(NA, InsertAt, LastB);
+
+ // Convert all the children of Last node into roots, and append them
+ // to the Roots list.
+ if (LastCN > 0) {
+ NodeVect &Cs = NCM[Last];
+ for (NodeVect::iterator I = Cs.begin(), E = Cs.end(); I != E; ++I) {
+ GepNode *CN = *I;
+ CN->Flags &= ~GepNode::Internal;
+ CN->Flags |= GepNode::Root;
+ CN->BaseVal = NewInst;
+ Roots.push_back(CN);
+ }
+ }
+
+ // Lastly, if the Last node was used, replace all uses with the new GEP.
+ // The uses reference the original GEP values.
+ if (LastUsed) {
+ NodeToUsesMap::iterator UF = Uses.find(Last);
+ assert(UF != Uses.end() && "No use information found");
+ UseSet &Us = UF->second;
+ for (UseSet::iterator I = Us.begin(), E = Us.end(); I != E; ++I) {
+ Use *U = *I;
+ U->set(NewInst);
+ }
+ }
+ }
+}
+
+
+void HexagonCommonGEP::removeDeadCode() {
+ ValueVect BO;
+ BO.push_back(&Fn->front());
+
+ for (unsigned i = 0; i < BO.size(); ++i) {
+ BasicBlock *B = cast<BasicBlock>(BO[i]);
+ DomTreeNode *N = DT->getNode(B);
+ typedef GraphTraits<DomTreeNode*> GTN;
+ typedef GTN::ChildIteratorType Iter;
+ for (Iter I = GTN::child_begin(N), E = GTN::child_end(N); I != E; ++I)
+ BO.push_back((*I)->getBlock());
+ }
+
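+ // Visit blocks in reverse dominator-tree preorder, and instructions within
+ // each block bottom-up, so that users tend to be erased before their
+ // definitions and whole chains of dead code can go away in one pass.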
+ for (unsigned i = BO.size(); i > 0; --i) {
+ BasicBlock *B = cast<BasicBlock>(BO[i-1]);
+ BasicBlock::InstListType &IL = B->getInstList();
+ typedef BasicBlock::InstListType::reverse_iterator reverse_iterator;
+ ValueVect Ins;
+ for (reverse_iterator I = IL.rbegin(), E = IL.rend(); I != E; ++I)
+ Ins.push_back(&*I);
+ for (ValueVect::iterator I = Ins.begin(), E = Ins.end(); I != E; ++I) {
+ Instruction *In = cast<Instruction>(*I);
+ if (isInstructionTriviallyDead(In))
+ In->eraseFromParent();
+ }
+ }
+}
+
+
+bool HexagonCommonGEP::runOnFunction(Function &F) {
+ // For now bail out on C++ exception handling.
+ for (Function::iterator A = F.begin(), Z = F.end(); A != Z; ++A)
+ for (BasicBlock::iterator I = A->begin(), E = A->end(); I != E; ++I)
+ if (isa<InvokeInst>(I) || isa<LandingPadInst>(I))
+ return false;
+
+ Fn = &F;
+ DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree();
+ PDT = &getAnalysis<PostDominatorTree>();
+ LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
+ Ctx = &F.getContext();
+
+ Nodes.clear();
+ Uses.clear();
+ NodeOrder.clear();
+
+ SpecificBumpPtrAllocator<GepNode> Allocator;
+ Mem = &Allocator;
+
+ collect();
+ common();
+
+ NodeToValueMap Loc;
+ computeNodePlacement(Loc);
+ materialize(Loc);
+ removeDeadCode();
+
+#ifdef XDEBUG
+ // Run this only when expensive checks are enabled.
+ verifyFunction(F);
+#endif
+ return true;
+}
+
+
+namespace llvm {
+ FunctionPass *createHexagonCommonGEP() {
+ return new HexagonCommonGEP();
+ }
+}
diff --git a/lib/Target/Hexagon/HexagonExpandCondsets.cpp b/lib/Target/Hexagon/HexagonExpandCondsets.cpp
index 37ed173a79cd..ce10aeadef94 100644
--- a/lib/Target/Hexagon/HexagonExpandCondsets.cpp
+++ b/lib/Target/Hexagon/HexagonExpandCondsets.cpp
@@ -1,3 +1,12 @@
+//===--- HexagonExpandCondsets.cpp ----------------------------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
// Replace mux instructions with the corresponding legal instructions.
// It is meant to work post-SSA, but still on virtual registers. It was
// originally placed between register coalescing and machine instruction
diff --git a/lib/Target/Hexagon/HexagonFrameLowering.cpp b/lib/Target/Hexagon/HexagonFrameLowering.cpp
index 868f87e18413..29283c81877e 100644
--- a/lib/Target/Hexagon/HexagonFrameLowering.cpp
+++ b/lib/Target/Hexagon/HexagonFrameLowering.cpp
@@ -864,13 +864,13 @@ static bool needToReserveScavengingSpillSlots(MachineFunction &MF,
// Check for an unused caller-saved register.
for ( ; *CallerSavedRegs; ++CallerSavedRegs) {
MCPhysReg FreeReg = *CallerSavedRegs;
- if (MRI.isPhysRegUsed(FreeReg))
+ if (!MRI.reg_nodbg_empty(FreeReg))
continue;
// Check aliased register usage.
bool IsCurrentRegUsed = false;
for (MCRegAliasIterator AI(FreeReg, &HRI, false); AI.isValid(); ++AI)
- if (MRI.isPhysRegUsed(*AI)) {
+ if (!MRI.reg_nodbg_empty(*AI)) {
IsCurrentRegUsed = true;
break;
}
@@ -959,8 +959,11 @@ bool HexagonFrameLowering::replacePredRegPseudoSpillCode(MachineFunction &MF)
}
-void HexagonFrameLowering::processFunctionBeforeCalleeSavedScan(
- MachineFunction &MF, RegScavenger* RS) const {
+void HexagonFrameLowering::determineCalleeSaves(MachineFunction &MF,
+ BitVector &SavedRegs,
+ RegScavenger *RS) const {
+ TargetFrameLowering::determineCalleeSaves(MF, SavedRegs, RS);
+
auto &HST = static_cast<const HexagonSubtarget&>(MF.getSubtarget());
auto &HRI = *HST.getRegisterInfo();
@@ -969,11 +972,9 @@ void HexagonFrameLowering::processFunctionBeforeCalleeSavedScan(
// If we have a function containing __builtin_eh_return we want to spill and
// restore all callee saved registers. Pretend that they are used.
if (HasEHReturn) {
- MachineRegisterInfo &MRI = MF.getRegInfo();
for (const MCPhysReg *CSRegs = HRI.getCalleeSavedRegs(&MF); *CSRegs;
++CSRegs)
- if (!MRI.isPhysRegUsed(*CSRegs))
- MRI.setPhysRegUsed(*CSRegs);
+ SavedRegs.set(*CSRegs);
}
const TargetRegisterClass &RC = Hexagon::IntRegsRegClass;
diff --git a/lib/Target/Hexagon/HexagonFrameLowering.h b/lib/Target/Hexagon/HexagonFrameLowering.h
index 89500cb85724..d39ee2c77195 100644
--- a/lib/Target/Hexagon/HexagonFrameLowering.h
+++ b/lib/Target/Hexagon/HexagonFrameLowering.h
@@ -45,7 +45,7 @@ public:
MachineBasicBlock &MBB, MachineBasicBlock::iterator I) const override;
void processFunctionBeforeFrameFinalized(MachineFunction &MF,
RegScavenger *RS = nullptr) const override;
- void processFunctionBeforeCalleeSavedScan(MachineFunction &MF,
+ void determineCalleeSaves(MachineFunction &MF, BitVector &SavedRegs,
RegScavenger *RS) const override;
bool targetHandlesStackFrameRounding() const override {
diff --git a/lib/Target/Hexagon/HexagonGenExtract.cpp b/lib/Target/Hexagon/HexagonGenExtract.cpp
new file mode 100644
index 000000000000..4d32208bd5aa
--- /dev/null
+++ b/lib/Target/Hexagon/HexagonGenExtract.cpp
@@ -0,0 +1,259 @@
+//===--- HexagonGenExtract.cpp --------------------------------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/CodeGen/MachineFunctionAnalysis.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/Dominators.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/IntrinsicInst.h"
+#include "llvm/IR/IRBuilder.h"
+#include "llvm/IR/PatternMatch.h"
+#include "llvm/Pass.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/MathExtras.h"
+#include "llvm/Support/raw_ostream.h"
+
+using namespace llvm;
+
+static cl::opt<unsigned> ExtractCutoff("extract-cutoff", cl::init(~0U),
+ cl::Hidden, cl::desc("Cutoff for generating \"extract\""
+ " instructions"));
+
+// This prevents generating extract instructions that have the offset of 0.
+// One of the reasons for "extract" is to put a sequence of bits in a regis-
+// ter, starting at offset 0 (so that these bits can then be used by an
+// "insert"). If the bits are already at offset 0, it is better not to gene-
+// rate "extract", since logical bit operations can be merged into compound
+// instructions (as opposed to "extract").
+static cl::opt<bool> NoSR0("extract-nosr0", cl::init(true), cl::Hidden,
+ cl::desc("No extract instruction with offset 0"));
+
+static cl::opt<bool> NeedAnd("extract-needand", cl::init(true), cl::Hidden,
+ cl::desc("Require & in extract patterns"));
+
+namespace llvm {
+ void initializeHexagonGenExtractPass(PassRegistry&);
+ FunctionPass *createHexagonGenExtract();
+}
+
+
+namespace {
+ class HexagonGenExtract : public FunctionPass {
+ public:
+ static char ID;
+ HexagonGenExtract() : FunctionPass(ID), ExtractCount(0) {
+ initializeHexagonGenExtractPass(*PassRegistry::getPassRegistry());
+ }
+ virtual const char *getPassName() const override {
+ return "Hexagon generate \"extract\" instructions";
+ }
+ virtual bool runOnFunction(Function &F) override;
+ virtual void getAnalysisUsage(AnalysisUsage &AU) const override {
+ AU.addRequired<DominatorTreeWrapperPass>();
+ AU.addPreserved<DominatorTreeWrapperPass>();
+ AU.addPreserved<MachineFunctionAnalysis>();
+ FunctionPass::getAnalysisUsage(AU);
+ }
+ private:
+ bool visitBlock(BasicBlock *B);
+ bool convert(Instruction *In);
+
+ unsigned ExtractCount;
+ DominatorTree *DT;
+ };
+
+ char HexagonGenExtract::ID = 0;
+}
+
+INITIALIZE_PASS_BEGIN(HexagonGenExtract, "hextract", "Hexagon generate "
+ "\"extract\" instructions", false, false)
+INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
+INITIALIZE_PASS_END(HexagonGenExtract, "hextract", "Hexagon generate "
+ "\"extract\" instructions", false, false)
+
+
+bool HexagonGenExtract::convert(Instruction *In) {
+ using namespace PatternMatch;
+ Value *BF = 0;
+ ConstantInt *CSL = 0, *CSR = 0, *CM = 0;
+ BasicBlock *BB = In->getParent();
+ LLVMContext &Ctx = BB->getContext();
+ bool LogicalSR;
+
+ // (and (shl (lshr x, #sr), #sl), #m)
+ LogicalSR = true;
+ bool Match = match(In, m_And(m_Shl(m_LShr(m_Value(BF), m_ConstantInt(CSR)),
+ m_ConstantInt(CSL)),
+ m_ConstantInt(CM)));
+
+ if (!Match) {
+ // (and (shl (ashr x, #sr), #sl), #m)
+ LogicalSR = false;
+ Match = match(In, m_And(m_Shl(m_AShr(m_Value(BF), m_ConstantInt(CSR)),
+ m_ConstantInt(CSL)),
+ m_ConstantInt(CM)));
+ }
+ if (!Match) {
+ // (and (shl x, #sl), #m)
+ LogicalSR = true;
+ CSR = ConstantInt::get(Type::getInt32Ty(Ctx), 0);
+ Match = match(In, m_And(m_Shl(m_Value(BF), m_ConstantInt(CSL)),
+ m_ConstantInt(CM)));
+ if (Match && NoSR0)
+ return false;
+ }
+ if (!Match) {
+ // (and (lshr x, #sr), #m)
+ LogicalSR = true;
+ CSL = ConstantInt::get(Type::getInt32Ty(Ctx), 0);
+ Match = match(In, m_And(m_LShr(m_Value(BF), m_ConstantInt(CSR)),
+ m_ConstantInt(CM)));
+ }
+ if (!Match) {
+ // (and (ashr x, #sr), #m)
+ LogicalSR = false;
+ CSL = ConstantInt::get(Type::getInt32Ty(Ctx), 0);
+ Match = match(In, m_And(m_AShr(m_Value(BF), m_ConstantInt(CSR)),
+ m_ConstantInt(CM)));
+ }
+ if (!Match) {
+ CM = 0;
+ // (shl (lshr x, #sr), #sl)
+ LogicalSR = true;
+ Match = match(In, m_Shl(m_LShr(m_Value(BF), m_ConstantInt(CSR)),
+ m_ConstantInt(CSL)));
+ }
+ if (!Match) {
+ CM = 0;
+ // (shl (ashr x, #sr), #sl)
+ LogicalSR = false;
+ Match = match(In, m_Shl(m_AShr(m_Value(BF), m_ConstantInt(CSR)),
+ m_ConstantInt(CSL)));
+ }
+ if (!Match)
+ return false;
+
+ Type *Ty = BF->getType();
+ if (!Ty->isIntegerTy())
+ return false;
+ unsigned BW = Ty->getPrimitiveSizeInBits();
+ if (BW != 32 && BW != 64)
+ return false;
+
+ uint32_t SR = CSR->getZExtValue();
+ uint32_t SL = CSL->getZExtValue();
+
+ if (!CM) {
+ // If there was no and, and the shift left did not remove all potential
+ // sign bits created by the shift right, then extractu cannot reproduce
+ // this value.
+ if (!LogicalSR && (SR > SL))
+ return false;
+ APInt A = APInt(BW, ~0ULL).lshr(SR).shl(SL);
+ CM = ConstantInt::get(Ctx, A);
+ }
+
+ // CM is the shifted-left mask. Shift it back right to remove the zero
+ // bits on least-significant positions.
+ APInt M = CM->getValue().lshr(SL);
+ uint32_t T = M.countTrailingOnes();
+
+ // During the shifts some of the bits will be lost. Calculate how many bits
+ // of the original value will remain after the shift right and shift left.
+ uint32_t U = BW - std::max(SL, SR);
+ // The width of the extracted field is the minimum of the original bits
+ // that remain after the shifts and the number of contiguous 1s in the mask.
+ uint32_t W = std::min(U, T);
+ if (W == 0)
+ return false;
+
+ // Check if the extracted bits are contained within the mask that the
+ // value is and-ed with. The extract operation will copy these bits, so
+ // the mask cannot have any holes in it that would clear any of the bits
+ // of the extracted field.
+ if (!LogicalSR) {
+ // If the shift right was arithmetic, it could have included some 1 bits.
+ // It is still ok to generate extract, but only if the mask eliminates
+ // those bits (i.e. M does not have any bits set beyond U).
+ APInt C = APInt::getHighBitsSet(BW, BW-U);
+ if (M.intersects(C) || !APIntOps::isMask(W, M))
+ return false;
+ } else {
+ // Check if M starts with a contiguous sequence of W one-bits. Get the
+ // low U bits of M (which eliminates the 0 bits shifted in on the left),
+ // and check if the result is APInt's "mask":
+ if (!APIntOps::isMask(W, M.getLoBits(U)))
+ return false;
+ }
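+ // Worked example (illustrative): for BW=32, SR=8, SL=4 and an and-mask of
+ // 0x0ff0, the mask shifted back right is 0x00ff, so T=8, U=32-max(4,8)=24
+ // and W=8; the value is then rebuilt as extractu(x, #8, #8) followed by a
+ // shift left by 4.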
+
+ IRBuilder<> IRB(BB, In);
+ Intrinsic::ID IntId = (BW == 32) ? Intrinsic::hexagon_S2_extractu
+ : Intrinsic::hexagon_S2_extractup;
+ Module *Mod = BB->getParent()->getParent();
+ Value *ExtF = Intrinsic::getDeclaration(Mod, IntId);
+ Value *NewIn = IRB.CreateCall(ExtF, {BF, IRB.getInt32(W), IRB.getInt32(SR)});
+ if (SL != 0)
+ NewIn = IRB.CreateShl(NewIn, SL, CSL->getName());
+ In->replaceAllUsesWith(NewIn);
+ return true;
+}
+
+
+bool HexagonGenExtract::visitBlock(BasicBlock *B) {
+ // Depth-first, bottom-up traversal.
+ DomTreeNode *DTN = DT->getNode(B);
+ typedef GraphTraits<DomTreeNode*> GTN;
+ typedef GTN::ChildIteratorType Iter;
+ for (Iter I = GTN::child_begin(DTN), E = GTN::child_end(DTN); I != E; ++I)
+ visitBlock((*I)->getBlock());
+
+ // Allow limiting the number of generated extracts for debugging purposes.
+ bool HasCutoff = ExtractCutoff.getPosition();
+ unsigned Cutoff = ExtractCutoff;
+
+ bool Changed = false;
+ BasicBlock::iterator I = std::prev(B->end()), NextI, Begin = B->begin();
+ while (true) {
+ if (HasCutoff && (ExtractCount >= Cutoff))
+ return Changed;
+ bool Last = (I == Begin);
+ if (!Last)
+ NextI = std::prev(I);
+ Instruction *In = &*I;
+ bool Done = convert(In);
+ if (HasCutoff && Done)
+ ExtractCount++;
+ Changed |= Done;
+ if (Last)
+ break;
+ I = NextI;
+ }
+ return Changed;
+}
+
+
+bool HexagonGenExtract::runOnFunction(Function &F) {
+ DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree();
+ bool Changed;
+
+ // Traverse the function bottom-up, to see super-expressions before their
+ // sub-expressions.
+ BasicBlock *Entry = GraphTraits<Function*>::getEntryNode(&F);
+ Changed = visitBlock(Entry);
+
+ return Changed;
+}
+
+
+FunctionPass *llvm::createHexagonGenExtract() {
+ return new HexagonGenExtract();
+}
diff --git a/lib/Target/Hexagon/HexagonGenInsert.cpp b/lib/Target/Hexagon/HexagonGenInsert.cpp
new file mode 100644
index 000000000000..096da949e77b
--- /dev/null
+++ b/lib/Target/Hexagon/HexagonGenInsert.cpp
@@ -0,0 +1,1598 @@
+//===--- HexagonGenInsert.cpp ---------------------------------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#define DEBUG_TYPE "hexinsert"
+
+#include "llvm/Pass.h"
+#include "llvm/PassRegistry.h"
+#include "llvm/ADT/BitVector.h"
+#include "llvm/ADT/DenseMap.h"
+#include "llvm/ADT/DenseSet.h"
+#include "llvm/ADT/PostOrderIterator.h"
+#include "llvm/CodeGen/MachineDominators.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Support/Timer.h"
+#include "llvm/Target/TargetMachine.h"
+#include "llvm/Target/TargetRegisterInfo.h"
+
+#include "Hexagon.h"
+#include "HexagonRegisterInfo.h"
+#include "HexagonTargetMachine.h"
+#include "HexagonBitTracker.h"
+
+#include <map>
+#include <vector>
+
+using namespace llvm;
+
+static cl::opt<unsigned> VRegIndexCutoff("insert-vreg-cutoff", cl::init(~0U),
+ cl::Hidden, cl::ZeroOrMore, cl::desc("Vreg# cutoff for insert generation."));
+// The distance cutoff is selected based on the precheckin-perf results:
+// cutoffs 20, 25, 35, and 40 are worse than 30.
+static cl::opt<unsigned> VRegDistCutoff("insert-dist-cutoff", cl::init(30U),
+ cl::Hidden, cl::ZeroOrMore, cl::desc("Vreg distance cutoff for insert "
+ "generation."));
+
+static cl::opt<bool> OptTiming("insert-timing", cl::init(false), cl::Hidden,
+ cl::ZeroOrMore, cl::desc("Enable timing of insert generation"));
+static cl::opt<bool> OptTimingDetail("insert-timing-detail", cl::init(false),
+ cl::Hidden, cl::ZeroOrMore, cl::desc("Enable detailed timing of insert "
+ "generation"));
+
+static cl::opt<bool> OptSelectAll0("insert-all0", cl::init(false), cl::Hidden,
+ cl::ZeroOrMore);
+static cl::opt<bool> OptSelectHas0("insert-has0", cl::init(false), cl::Hidden,
+ cl::ZeroOrMore);
+// Whether to construct constant values via "insert". Could eliminate constant
+// extenders, but often not practical.
+static cl::opt<bool> OptConst("insert-const", cl::init(false), cl::Hidden,
+ cl::ZeroOrMore);
+
+namespace {
+ // The preprocessor gets confused when the DEBUG macro is passed larger
+ // chunks of code. Use this function to detect debugging.
+ inline bool isDebug() {
+#ifndef NDEBUG
+ return ::llvm::DebugFlag && ::llvm::isCurrentDebugType(DEBUG_TYPE);
+#else
+ return false;
+#endif
+ }
+}
+
+
+namespace {
+ // Set of virtual registers, based on BitVector.
+ struct RegisterSet : private BitVector {
+ RegisterSet() : BitVector() {}
+ explicit RegisterSet(unsigned s, bool t = false) : BitVector(s, t) {}
+ RegisterSet(const RegisterSet &RS) : BitVector(RS) {}
+
+ using BitVector::clear;
+
+ unsigned find_first() const {
+ int First = BitVector::find_first();
+ if (First < 0)
+ return 0;
+ return x2v(First);
+ }
+
+ unsigned find_next(unsigned Prev) const {
+ int Next = BitVector::find_next(v2x(Prev));
+ if (Next < 0)
+ return 0;
+ return x2v(Next);
+ }
+
+ RegisterSet &insert(unsigned R) {
+ unsigned Idx = v2x(R);
+ ensure(Idx);
+ return static_cast<RegisterSet&>(BitVector::set(Idx));
+ }
+ RegisterSet &remove(unsigned R) {
+ unsigned Idx = v2x(R);
+ if (Idx >= size())
+ return *this;
+ return static_cast<RegisterSet&>(BitVector::reset(Idx));
+ }
+
+ RegisterSet &insert(const RegisterSet &Rs) {
+ return static_cast<RegisterSet&>(BitVector::operator|=(Rs));
+ }
+ RegisterSet &remove(const RegisterSet &Rs) {
+ return static_cast<RegisterSet&>(BitVector::reset(Rs));
+ }
+
+ reference operator[](unsigned R) {
+ unsigned Idx = v2x(R);
+ ensure(Idx);
+ return BitVector::operator[](Idx);
+ }
+ bool operator[](unsigned R) const {
+ unsigned Idx = v2x(R);
+ assert(Idx < size());
+ return BitVector::operator[](Idx);
+ }
+ bool has(unsigned R) const {
+ unsigned Idx = v2x(R);
+ if (Idx >= size())
+ return false;
+ return BitVector::test(Idx);
+ }
+
+ bool empty() const {
+ return !BitVector::any();
+ }
+ bool includes(const RegisterSet &Rs) const {
+ // A.BitVector::test(B) <=> A-B != {}
+ return !Rs.BitVector::test(*this);
+ }
+ bool intersects(const RegisterSet &Rs) const {
+ return BitVector::anyCommon(Rs);
+ }
+
+ private:
+ void ensure(unsigned Idx) {
+ if (size() <= Idx)
+ resize(std::max(Idx+1, 32U));
+ }
+ static inline unsigned v2x(unsigned v) {
+ return TargetRegisterInfo::virtReg2Index(v);
+ }
+ static inline unsigned x2v(unsigned x) {
+ return TargetRegisterInfo::index2VirtReg(x);
+ }
+ };
+
+
+ struct PrintRegSet {
+ PrintRegSet(const RegisterSet &S, const TargetRegisterInfo *RI)
+ : RS(S), TRI(RI) {}
+ friend raw_ostream &operator<< (raw_ostream &OS,
+ const PrintRegSet &P);
+ private:
+ const RegisterSet &RS;
+ const TargetRegisterInfo *TRI;
+ };
+
+ raw_ostream &operator<< (raw_ostream &OS, const PrintRegSet &P) {
+ OS << '{';
+ for (unsigned R = P.RS.find_first(); R; R = P.RS.find_next(R))
+ OS << ' ' << PrintReg(R, P.TRI);
+ OS << " }";
+ return OS;
+ }
+}
+
+
+namespace {
+ // A convenience class to associate unsigned numbers (such as virtual
+ // registers) with unsigned numbers.
+ struct UnsignedMap : public DenseMap<unsigned,unsigned> {
+ UnsignedMap() : BaseType() {}
+ private:
+ typedef DenseMap<unsigned,unsigned> BaseType;
+ };
+
+ // A utility to establish an ordering between virtual registers:
+ // VRegA < VRegB <=> RegisterOrdering[VRegA] < RegisterOrdering[VRegB]
+ // This is meant as a cache for the ordering of virtual registers defined
+ // by a potentially expensive comparison function, or obtained by a proce-
+ // dure that should not be repeated each time two registers are compared.
+ struct RegisterOrdering : public UnsignedMap {
+ RegisterOrdering() : UnsignedMap() {}
+ unsigned operator[](unsigned VR) const {
+ const_iterator F = find(VR);
+ assert(F != end());
+ return F->second;
+ }
+ // Add operator(), so that objects of this class can be used as
+ // comparators in std::sort et al.
+ bool operator() (unsigned VR1, unsigned VR2) const {
+ return operator[](VR1) < operator[](VR2);
+ }
+ };
+}
+
+
+namespace {
+ // Ordering of bit values. This class does not have operator[], but
+ // it supplies a comparison operator() for use in std:: algorithms.
+ // The order is as follows:
+ // - 0 < 1 < ref
+ // - ref1 < ref2, if ord(ref1.Reg) < ord(ref2.Reg),
+ // or ord(ref1.Reg) == ord(ref2.Reg), and ref1.Pos < ref2.Pos.
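+ // For instance, assuming ord(v5) < ord(v8) (register numbers made up),
+ // the resulting order would be: 0 < 1 < v5[3] < v5[7] < v8[0].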
+ struct BitValueOrdering {
+ BitValueOrdering(const RegisterOrdering &RB) : BaseOrd(RB) {}
+ bool operator() (const BitTracker::BitValue &V1,
+ const BitTracker::BitValue &V2) const;
+ const RegisterOrdering &BaseOrd;
+ };
+}
+
+
+bool BitValueOrdering::operator() (const BitTracker::BitValue &V1,
+ const BitTracker::BitValue &V2) const {
+ if (V1 == V2)
+ return false;
+ // V1==0 => true, V2==0 => false
+ if (V1.is(0) || V2.is(0))
+ return V1.is(0);
+ // Neither of V1,V2 is 0, and V1!=V2.
+ // V2==1 => false, V1==1 => true
+ if (V2.is(1) || V1.is(1))
+ return !V2.is(1);
+ // Both V1,V2 are refs.
+ unsigned Ind1 = BaseOrd[V1.RefI.Reg], Ind2 = BaseOrd[V2.RefI.Reg];
+ if (Ind1 != Ind2)
+ return Ind1 < Ind2;
+ // Both refer to the same register; if the positions were also equal, the
+ // bit values would be equal, which was ruled out above.
+ assert(V1.RefI.Pos != V2.RefI.Pos && "Bit values should be different");
+ return V1.RefI.Pos < V2.RefI.Pos;
+}
+
+
+namespace {
+ // Cache for the BitTracker's cell map. Map lookup has logarithmic
+ // complexity; this class memoizes the lookup results to reduce the
+ // access time for repeated lookups of the same cell.
+ struct CellMapShadow {
+ CellMapShadow(const BitTracker &T) : BT(T) {}
+ const BitTracker::RegisterCell &lookup(unsigned VR) {
+ unsigned RInd = TargetRegisterInfo::virtReg2Index(VR);
+ // Grow the vector to cover RInd, with some headroom, and to at least
+ // 32 elements.
+ if (RInd >= CVect.size())
+ CVect.resize(std::max(RInd+16, 32U), 0);
+ const BitTracker::RegisterCell *CP = CVect[RInd];
+ if (CP == 0)
+ CP = CVect[RInd] = &BT.lookup(VR);
+ return *CP;
+ }
+
+ const BitTracker &BT;
+
+ private:
+ typedef std::vector<const BitTracker::RegisterCell*> CellVectType;
+ CellVectType CVect;
+ };
+}
+
+
+namespace {
+ // Comparator class for lexicographic ordering of virtual registers
+ // according to the corresponding BitTracker::RegisterCell objects.
+ struct RegisterCellLexCompare {
+ RegisterCellLexCompare(const BitValueOrdering &BO, CellMapShadow &M)
+ : BitOrd(BO), CM(M) {}
+ bool operator() (unsigned VR1, unsigned VR2) const;
+ private:
+ const BitValueOrdering &BitOrd;
+ CellMapShadow &CM;
+ };
+
+ // Comparator class for lexicographic ordering of virtual registers
+ // according to the specified bits of the corresponding BitTracker::
+ // RegisterCell objects.
+ // Specifically, this class will be used to compare bit B of a register
+ // cell for a selected virtual register R with bit N of any register
+ // other than R.
+ struct RegisterCellBitCompareSel {
+ RegisterCellBitCompareSel(unsigned R, unsigned B, unsigned N,
+ const BitValueOrdering &BO, CellMapShadow &M)
+ : SelR(R), SelB(B), BitN(N), BitOrd(BO), CM(M) {}
+ bool operator() (unsigned VR1, unsigned VR2) const;
+ private:
+ const unsigned SelR, SelB;
+ const unsigned BitN;
+ const BitValueOrdering &BitOrd;
+ CellMapShadow &CM;
+ };
+}
+
+
+bool RegisterCellLexCompare::operator() (unsigned VR1, unsigned VR2) const {
+ // Ordering of registers, made up from two given orderings:
+ // - the ordering of the register numbers, and
+ // - the ordering of register cells.
+ // Def. R1 < R2 if:
+ // - cell(R1) < cell(R2), or
+ // - cell(R1) == cell(R2), and index(R1) < index(R2).
+ //
+ // For register cells, the ordering is lexicographic, with index 0 being
+ // the most significant.
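+ // If one cell is a prefix of the other, the shorter (narrower) cell
+ // compares as smaller.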
+ if (VR1 == VR2)
+ return false;
+
+ const BitTracker::RegisterCell &RC1 = CM.lookup(VR1), &RC2 = CM.lookup(VR2);
+ uint16_t W1 = RC1.width(), W2 = RC2.width();
+ for (uint16_t i = 0, w = std::min(W1, W2); i < w; ++i) {
+ const BitTracker::BitValue &V1 = RC1[i], &V2 = RC2[i];
+ if (V1 != V2)
+ return BitOrd(V1, V2);
+ }
+ // Cells are equal up until the common length.
+ if (W1 != W2)
+ return W1 < W2;
+
+ return BitOrd.BaseOrd[VR1] < BitOrd.BaseOrd[VR2];
+}
+
+
+bool RegisterCellBitCompareSel::operator() (unsigned VR1, unsigned VR2) const {
+ if (VR1 == VR2)
+ return false;
+ const BitTracker::RegisterCell &RC1 = CM.lookup(VR1);
+ const BitTracker::RegisterCell &RC2 = CM.lookup(VR2);
+ uint16_t W1 = RC1.width(), W2 = RC2.width();
+ uint16_t Bit1 = (VR1 == SelR) ? SelB : BitN;
+ uint16_t Bit2 = (VR2 == SelR) ? SelB : BitN;
+ // If Bit1 exceeds the width of VR1, then:
+ // - return false, if at the same time Bit2 exceeds VR2, or
+ // - return true, otherwise.
+ // (I.e. "a bit value that does not exist is less than any bit value
+ // that does exist".)
+ if (W1 <= Bit1)
+ return Bit2 < W2;
+ // If Bit1 is within VR1, but Bit2 is not within VR2, return false.
+ if (W2 <= Bit2)
+ return false;
+
+ const BitTracker::BitValue &V1 = RC1[Bit1], V2 = RC2[Bit2];
+ if (V1 != V2)
+ return BitOrd(V1, V2);
+ return false;
+}
+
+
+namespace {
+ class OrderedRegisterList {
+ typedef std::vector<unsigned> ListType;
+ public:
+ OrderedRegisterList(const RegisterOrdering &RO) : Ord(RO) {}
+ void insert(unsigned VR);
+ void remove(unsigned VR);
+ unsigned operator[](unsigned Idx) const {
+ assert(Idx < Seq.size());
+ return Seq[Idx];
+ }
+ unsigned size() const {
+ return Seq.size();
+ }
+
+ typedef ListType::iterator iterator;
+ typedef ListType::const_iterator const_iterator;
+ iterator begin() { return Seq.begin(); }
+ iterator end() { return Seq.end(); }
+ const_iterator begin() const { return Seq.begin(); }
+ const_iterator end() const { return Seq.end(); }
+
+ // Convenience function to convert an iterator to the corresponding index.
+ unsigned idx(iterator It) const { return It-begin(); }
+ private:
+ ListType Seq;
+ const RegisterOrdering &Ord;
+ };
+
+
+ struct PrintORL {
+ PrintORL(const OrderedRegisterList &L, const TargetRegisterInfo *RI)
+ : RL(L), TRI(RI) {}
+ friend raw_ostream &operator<< (raw_ostream &OS, const PrintORL &P);
+ private:
+ const OrderedRegisterList &RL;
+ const TargetRegisterInfo *TRI;
+ };
+
+ raw_ostream &operator<< (raw_ostream &OS, const PrintORL &P) {
+ OS << '(';
+ OrderedRegisterList::const_iterator B = P.RL.begin(), E = P.RL.end();
+ for (OrderedRegisterList::const_iterator I = B; I != E; ++I) {
+ if (I != B)
+ OS << ", ";
+ OS << PrintReg(*I, P.TRI);
+ }
+ OS << ')';
+ return OS;
+ }
+}
+
+
+void OrderedRegisterList::insert(unsigned VR) {
+ iterator L = std::lower_bound(Seq.begin(), Seq.end(), VR, Ord);
+ if (L == Seq.end())
+ Seq.push_back(VR);
+ else
+ Seq.insert(L, VR);
+}
+
+
+void OrderedRegisterList::remove(unsigned VR) {
+ iterator L = std::lower_bound(Seq.begin(), Seq.end(), VR, Ord);
+ assert(L != Seq.end());
+ Seq.erase(L);
+}
+
+
+namespace {
+ // A record of the insert form. The fields correspond to the operands
+ // of the "insert" instruction:
+ // ... = insert(SrcR, InsR, #Wdh, #Off)
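+ // The result equals SrcR, except for the Wdh bits starting at position
+ // Off, which are taken from the low Wdh bits of InsR. For example,
+ // (SrcR, InsR, #4, #8) replaces bits 8..11 of SrcR with bits 0..3 of InsR.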
+ struct IFRecord {
+ IFRecord(unsigned SR = 0, unsigned IR = 0, uint16_t W = 0, uint16_t O = 0)
+ : SrcR(SR), InsR(IR), Wdh(W), Off(O) {}
+ unsigned SrcR, InsR;
+ uint16_t Wdh, Off;
+ };
+
+ struct PrintIFR {
+ PrintIFR(const IFRecord &R, const TargetRegisterInfo *RI)
+ : IFR(R), TRI(RI) {}
+ private:
+ const IFRecord &IFR;
+ const TargetRegisterInfo *TRI;
+ friend raw_ostream &operator<< (raw_ostream &OS, const PrintIFR &P);
+ };
+
+ raw_ostream &operator<< (raw_ostream &OS, const PrintIFR &P) {
+ unsigned SrcR = P.IFR.SrcR, InsR = P.IFR.InsR;
+ OS << '(' << PrintReg(SrcR, P.TRI) << ',' << PrintReg(InsR, P.TRI)
+ << ",#" << P.IFR.Wdh << ",#" << P.IFR.Off << ')';
+ return OS;
+ }
+
+ typedef std::pair<IFRecord,RegisterSet> IFRecordWithRegSet;
+}
+
+
+namespace llvm {
+ void initializeHexagonGenInsertPass(PassRegistry&);
+ FunctionPass *createHexagonGenInsert();
+}
+
+
+namespace {
+ class HexagonGenInsert : public MachineFunctionPass {
+ public:
+ static char ID;
+ HexagonGenInsert() : MachineFunctionPass(ID), HII(0), HRI(0) {
+ initializeHexagonGenInsertPass(*PassRegistry::getPassRegistry());
+ }
+ virtual const char *getPassName() const {
+ return "Hexagon generate \"insert\" instructions";
+ }
+ virtual void getAnalysisUsage(AnalysisUsage &AU) const {
+ AU.addRequired<MachineDominatorTree>();
+ AU.addPreserved<MachineDominatorTree>();
+ MachineFunctionPass::getAnalysisUsage(AU);
+ }
+ virtual bool runOnMachineFunction(MachineFunction &MF);
+
+ private:
+ typedef DenseMap<std::pair<unsigned,unsigned>,unsigned> PairMapType;
+
+ void buildOrderingMF(RegisterOrdering &RO) const;
+ void buildOrderingBT(RegisterOrdering &RB, RegisterOrdering &RO) const;
+ bool isIntClass(const TargetRegisterClass *RC) const;
+ bool isConstant(unsigned VR) const;
+ bool isSmallConstant(unsigned VR) const;
+ bool isValidInsertForm(unsigned DstR, unsigned SrcR, unsigned InsR,
+ uint16_t L, uint16_t S) const;
+ bool findSelfReference(unsigned VR) const;
+ bool findNonSelfReference(unsigned VR) const;
+ void getInstrDefs(const MachineInstr *MI, RegisterSet &Defs) const;
+ void getInstrUses(const MachineInstr *MI, RegisterSet &Uses) const;
+ unsigned distance(const MachineBasicBlock *FromB,
+ const MachineBasicBlock *ToB, const UnsignedMap &RPO,
+ PairMapType &M) const;
+ unsigned distance(MachineBasicBlock::const_iterator FromI,
+ MachineBasicBlock::const_iterator ToI, const UnsignedMap &RPO,
+ PairMapType &M) const;
+ bool findRecordInsertForms(unsigned VR, OrderedRegisterList &AVs);
+ void collectInBlock(MachineBasicBlock *B, OrderedRegisterList &AVs);
+ void findRemovableRegisters(unsigned VR, IFRecord IF,
+ RegisterSet &RMs) const;
+ void computeRemovableRegisters();
+
+ void pruneEmptyLists();
+ void pruneCoveredSets(unsigned VR);
+ void pruneUsesTooFar(unsigned VR, const UnsignedMap &RPO, PairMapType &M);
+ void pruneRegCopies(unsigned VR);
+ void pruneCandidates();
+ void selectCandidates();
+ bool generateInserts();
+
+ bool removeDeadCode(MachineDomTreeNode *N);
+
+ // IFRecord coupled with a set of potentially removable registers:
+ typedef std::vector<IFRecordWithRegSet> IFListType;
+ typedef DenseMap<unsigned,IFListType> IFMapType; // vreg -> IFListType
+
+ void dump_map() const;
+
+ const HexagonInstrInfo *HII;
+ const HexagonRegisterInfo *HRI;
+
+ MachineFunction *MFN;
+ MachineRegisterInfo *MRI;
+ MachineDominatorTree *MDT;
+ CellMapShadow *CMS;
+
+ RegisterOrdering BaseOrd;
+ RegisterOrdering CellOrd;
+ IFMapType IFMap;
+ };
+
+ char HexagonGenInsert::ID = 0;
+}
+
+
+void HexagonGenInsert::dump_map() const {
+ typedef IFMapType::const_iterator iterator;
+ for (iterator I = IFMap.begin(), E = IFMap.end(); I != E; ++I) {
+ dbgs() << " " << PrintReg(I->first, HRI) << ":\n";
+ const IFListType &LL = I->second;
+ for (unsigned i = 0, n = LL.size(); i < n; ++i)
+ dbgs() << " " << PrintIFR(LL[i].first, HRI) << ", "
+ << PrintRegSet(LL[i].second, HRI) << '\n';
+ }
+}
+
+
+void HexagonGenInsert::buildOrderingMF(RegisterOrdering &RO) const {
+ unsigned Index = 0;
+ typedef MachineFunction::const_iterator mf_iterator;
+ for (mf_iterator A = MFN->begin(), Z = MFN->end(); A != Z; ++A) {
+ const MachineBasicBlock &B = *A;
+ if (!CMS->BT.reached(&B))
+ continue;
+ typedef MachineBasicBlock::const_iterator mb_iterator;
+ for (mb_iterator I = B.begin(), E = B.end(); I != E; ++I) {
+ const MachineInstr *MI = &*I;
+ for (unsigned i = 0, n = MI->getNumOperands(); i < n; ++i) {
+ const MachineOperand &MO = MI->getOperand(i);
+ if (MO.isReg() && MO.isDef()) {
+ unsigned R = MO.getReg();
+ assert(MO.getSubReg() == 0 && "Unexpected subregister in definition");
+ if (TargetRegisterInfo::isVirtualRegister(R))
+ RO.insert(std::make_pair(R, Index++));
+ }
+ }
+ }
+ }
+ // Since some virtual registers may have had their def and uses eliminated,
+ // they are no longer referenced in the code, and so they will not appear
+ // in the map.
+}
+
+
+void HexagonGenInsert::buildOrderingBT(RegisterOrdering &RB,
+ RegisterOrdering &RO) const {
+ // Create a vector of all virtual registers (collect them from the base
+ // ordering RB), and then sort it using the RegisterCell comparator.
+ BitValueOrdering BVO(RB);
+ RegisterCellLexCompare LexCmp(BVO, *CMS);
+ typedef std::vector<unsigned> SortableVectorType;
+ SortableVectorType VRs;
+ for (RegisterOrdering::iterator I = RB.begin(), E = RB.end(); I != E; ++I)
+ VRs.push_back(I->first);
+ std::sort(VRs.begin(), VRs.end(), LexCmp);
+ // Transfer the results to the outgoing register ordering.
+ for (unsigned i = 0, n = VRs.size(); i < n; ++i)
+ RO.insert(std::make_pair(VRs[i], i));
+}
+
+
+inline bool HexagonGenInsert::isIntClass(const TargetRegisterClass *RC) const {
+ return RC == &Hexagon::IntRegsRegClass || RC == &Hexagon::DoubleRegsRegClass;
+}
+
+
+bool HexagonGenInsert::isConstant(unsigned VR) const {
+ const BitTracker::RegisterCell &RC = CMS->lookup(VR);
+ uint16_t W = RC.width();
+ for (uint16_t i = 0; i < W; ++i) {
+ const BitTracker::BitValue &BV = RC[i];
+ if (BV.is(0) || BV.is(1))
+ continue;
+ return false;
+ }
+ return true;
+}
+
+
+bool HexagonGenInsert::isSmallConstant(unsigned VR) const {
+ const BitTracker::RegisterCell &RC = CMS->lookup(VR);
+ uint16_t W = RC.width();
+ if (W > 64)
+ return false;
+ uint64_t V = 0, B = 1;
+ for (uint16_t i = 0; i < W; ++i) {
+ const BitTracker::BitValue &BV = RC[i];
+ if (BV.is(1))
+ V |= B;
+ else if (!BV.is(0))
+ return false;
+ B <<= 1;
+ }
+
+ // For 32-bit registers, consider: Rd = #s16.
+ if (W == 32)
+ return isInt<16>(V);
+
+ // For 64-bit registers, it's Rdd = #s8 or Rdd = combine(#s8,#s8)
+ return isInt<8>(Lo_32(V)) && isInt<8>(Hi_32(V));
+}
+
+
+bool HexagonGenInsert::isValidInsertForm(unsigned DstR, unsigned SrcR,
+ unsigned InsR, uint16_t L, uint16_t S) const {
+ const TargetRegisterClass *DstRC = MRI->getRegClass(DstR);
+ const TargetRegisterClass *SrcRC = MRI->getRegClass(SrcR);
+ const TargetRegisterClass *InsRC = MRI->getRegClass(InsR);
+ // Only integer (32-/64-bit) register classes.
+ if (!isIntClass(DstRC) || !isIntClass(SrcRC) || !isIntClass(InsRC))
+ return false;
+ // The "source" register must be of the same class as DstR.
+ if (DstRC != SrcRC)
+ return false;
+ if (DstRC == InsRC)
+ return true;
+ // A 64-bit register can only be generated from other 64-bit registers.
+ if (DstRC == &Hexagon::DoubleRegsRegClass)
+ return false;
+ // Otherwise, the L and S cannot span a 32-bit word boundary.
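+ // For example, S=30 with L=4 would cover bits 30..33 and is rejected.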
+ if (S < 32 && S+L > 32)
+ return false;
+ return true;
+}
+
+
+bool HexagonGenInsert::findSelfReference(unsigned VR) const {
+ const BitTracker::RegisterCell &RC = CMS->lookup(VR);
+ for (uint16_t i = 0, w = RC.width(); i < w; ++i) {
+ const BitTracker::BitValue &V = RC[i];
+ if (V.Type == BitTracker::BitValue::Ref && V.RefI.Reg == VR)
+ return true;
+ }
+ return false;
+}
+
+
+bool HexagonGenInsert::findNonSelfReference(unsigned VR) const {
+ BitTracker::RegisterCell RC = CMS->lookup(VR);
+ for (uint16_t i = 0, w = RC.width(); i < w; ++i) {
+ const BitTracker::BitValue &V = RC[i];
+ if (V.Type == BitTracker::BitValue::Ref && V.RefI.Reg != VR)
+ return true;
+ }
+ return false;
+}
+
+
+void HexagonGenInsert::getInstrDefs(const MachineInstr *MI,
+ RegisterSet &Defs) const {
+ for (unsigned i = 0, n = MI->getNumOperands(); i < n; ++i) {
+ const MachineOperand &MO = MI->getOperand(i);
+ if (!MO.isReg() || !MO.isDef())
+ continue;
+ unsigned R = MO.getReg();
+ if (!TargetRegisterInfo::isVirtualRegister(R))
+ continue;
+ Defs.insert(R);
+ }
+}
+
+
+void HexagonGenInsert::getInstrUses(const MachineInstr *MI,
+ RegisterSet &Uses) const {
+ for (unsigned i = 0, n = MI->getNumOperands(); i < n; ++i) {
+ const MachineOperand &MO = MI->getOperand(i);
+ if (!MO.isReg() || !MO.isUse())
+ continue;
+ unsigned R = MO.getReg();
+ if (!TargetRegisterInfo::isVirtualRegister(R))
+ continue;
+ Uses.insert(R);
+ }
+}
+
+
+unsigned HexagonGenInsert::distance(const MachineBasicBlock *FromB,
+ const MachineBasicBlock *ToB, const UnsignedMap &RPO,
+ PairMapType &M) const {
+ // Forward distance from the end of a block to the beginning of it does
+ // not make sense. This function should not be called with FromB == ToB.
+ assert(FromB != ToB);
+
+ unsigned FromN = FromB->getNumber(), ToN = ToB->getNumber();
+ // If we have already computed it, return the cached result.
+ PairMapType::iterator F = M.find(std::make_pair(FromN, ToN));
+ if (F != M.end())
+ return F->second;
+ unsigned ToRPO = RPO.lookup(ToN);
+
+ unsigned MaxD = 0;
+ typedef MachineBasicBlock::const_pred_iterator pred_iterator;
+ for (pred_iterator I = ToB->pred_begin(), E = ToB->pred_end(); I != E; ++I) {
+ const MachineBasicBlock *PB = *I;
+ // Skip back edges. Also, if FromB is a predecessor of ToB, the distance
+ // along that path will be 0, and we don't need to do any calculations
+ // on it.
+ if (PB == FromB || RPO.lookup(PB->getNumber()) >= ToRPO)
+ continue;
+ unsigned D = PB->size() + distance(FromB, PB, RPO, M);
+ if (D > MaxD)
+ MaxD = D;
+ }
+
+ // Memoize the result for later lookup.
+ M.insert(std::make_pair(std::make_pair(FromN, ToN), MaxD));
+ return MaxD;
+}
+
+
+unsigned HexagonGenInsert::distance(MachineBasicBlock::const_iterator FromI,
+ MachineBasicBlock::const_iterator ToI, const UnsignedMap &RPO,
+ PairMapType &M) const {
+ const MachineBasicBlock *FB = FromI->getParent(), *TB = ToI->getParent();
+ if (FB == TB)
+ return std::distance(FromI, ToI);
+ unsigned D1 = std::distance(TB->begin(), ToI);
+ unsigned D2 = distance(FB, TB, RPO, M);
+ unsigned D3 = std::distance(FromI, FB->end());
+ return D1+D2+D3;
+}
+
+
+bool HexagonGenInsert::findRecordInsertForms(unsigned VR,
+ OrderedRegisterList &AVs) {
+ if (isDebug()) {
+ dbgs() << LLVM_FUNCTION_NAME << ": " << PrintReg(VR, HRI)
+ << " AVs: " << PrintORL(AVs, HRI) << "\n";
+ }
+ if (AVs.size() == 0)
+ return false;
+
+ typedef OrderedRegisterList::iterator iterator;
+ BitValueOrdering BVO(BaseOrd);
+ const BitTracker::RegisterCell &RC = CMS->lookup(VR);
+ uint16_t W = RC.width();
+
+ typedef std::pair<unsigned,uint16_t> RSRecord; // (reg,shift)
+ typedef std::vector<RSRecord> RSListType;
+ // Have a map, with key being the matching prefix length, and the value
+ // being the list of pairs (R,S), where R's prefix matches VR at S.
+ // (DenseMap<uint16_t,RSListType> fails to instantiate.)
+ typedef DenseMap<unsigned,RSListType> LRSMapType;
+ LRSMapType LM;
+
+ // Conceptually, rotate the cell RC right (i.e. towards the LSB) by S,
+ // and find matching prefixes from AVs with the rotated RC. Such a prefix
+ // would match a string of bits (of length L) in RC starting at S.
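+ // For example, if RC is {0,1,1,0,...} (bit 0 first) and a register A has
+ // the cell {1,1,0,...}, then A's prefix of length 3 matches RC at S=1:
+ // A[0..2] == RC[1..3].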
+ for (uint16_t S = 0; S < W; ++S) {
+ iterator B = AVs.begin(), E = AVs.end();
+ // The registers in AVs are ordered according to the lexical order of
+ // the corresponding register cells. This means that the range of regis-
+ // ters in AVs that match a prefix of length L+1 will be contained in
+ // the range that matches a prefix of length L. This means that we can
+ // keep narrowing the search space as the prefix length goes up. This
+ // helps reduce the overall complexity of the search.
+ uint16_t L;
+ for (L = 0; L < W-S; ++L) {
+ // Compare against VR's bits starting at S, which emulates rotation
+ // of VR by S.
+ RegisterCellBitCompareSel RCB(VR, S+L, L, BVO, *CMS);
+ iterator NewB = std::lower_bound(B, E, VR, RCB);
+ iterator NewE = std::upper_bound(NewB, E, VR, RCB);
+ // For the registers that are eliminated from the next range, L is
+ // the longest prefix matching VR at position S (their prefixes
+ // differ from VR at S+L). If L>0, record this information for later
+ // use.
+ if (L > 0) {
+ for (iterator I = B; I != NewB; ++I)
+ LM[L].push_back(std::make_pair(*I, S));
+ for (iterator I = NewE; I != E; ++I)
+ LM[L].push_back(std::make_pair(*I, S));
+ }
+ B = NewB, E = NewE;
+ if (B == E)
+ break;
+ }
+ // Record the final register range. If this range is non-empty, then
+ // L=W-S.
+ assert(B == E || L == W-S);
+ if (B != E) {
+ for (iterator I = B; I != E; ++I)
+ LM[L].push_back(std::make_pair(*I, S));
+ // If B!=E, then we found a range of registers whose prefixes cover the
+ // rest of VR from position S. There is no need to further advance S.
+ break;
+ }
+ }
+
+ if (isDebug()) {
+ dbgs() << "Prefixes matching register " << PrintReg(VR, HRI) << "\n";
+ for (LRSMapType::iterator I = LM.begin(), E = LM.end(); I != E; ++I) {
+ dbgs() << " L=" << I->first << ':';
+ const RSListType &LL = I->second;
+ for (unsigned i = 0, n = LL.size(); i < n; ++i)
+ dbgs() << " (" << PrintReg(LL[i].first, HRI) << ",@"
+ << LL[i].second << ')';
+ dbgs() << '\n';
+ }
+ }
+
+
+ bool Recorded = false;
+
+ for (iterator I = AVs.begin(), E = AVs.end(); I != E; ++I) {
+ unsigned SrcR = *I;
+ int FDi = -1, LDi = -1; // First/last different bit.
+ const BitTracker::RegisterCell &AC = CMS->lookup(SrcR);
+ uint16_t AW = AC.width();
+ for (uint16_t i = 0, w = std::min(W, AW); i < w; ++i) {
+ if (RC[i] == AC[i])
+ continue;
+ if (FDi == -1)
+ FDi = i;
+ LDi = i;
+ }
+ if (FDi == -1)
+ continue; // TODO (future): Record identical registers.
+ // Look for a register whose prefix could patch the range [FD..LD]
+ // where VR and SrcR differ.
+ uint16_t FD = FDi, LD = LDi; // Switch to unsigned type.
+ uint16_t MinL = LD-FD+1;
+ for (uint16_t L = MinL; L < W; ++L) {
+ LRSMapType::iterator F = LM.find(L);
+ if (F == LM.end())
+ continue;
+ RSListType &LL = F->second;
+ for (unsigned i = 0, n = LL.size(); i < n; ++i) {
+ uint16_t S = LL[i].second;
+ // MinL is the minimum length of the prefix. Any length above MinL
+ // allows some flexibility as to where the prefix can start:
+ // given the extra length EL=L-MinL, the prefix must start between
+ // max(0,FD-EL) and FD.
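+ // E.g. with FD=3, LD=5, MinL=3: a prefix of length 5 (EL=2) may start
+ // anywhere in [1..3].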
+ if (S > FD) // Starts too late.
+ continue;
+ uint16_t EL = L-MinL;
+ uint16_t LowS = (EL < FD) ? FD-EL : 0;
+ if (S < LowS) // Starts too early.
+ continue;
+ unsigned InsR = LL[i].first;
+ if (!isValidInsertForm(VR, SrcR, InsR, L, S))
+ continue;
+ if (isDebug()) {
+ dbgs() << PrintReg(VR, HRI) << " = insert(" << PrintReg(SrcR, HRI)
+ << ',' << PrintReg(InsR, HRI) << ",#" << L << ",#"
+ << S << ")\n";
+ }
+ IFRecordWithRegSet RR(IFRecord(SrcR, InsR, L, S), RegisterSet());
+ IFMap[VR].push_back(RR);
+ Recorded = true;
+ }
+ }
+ }
+
+ return Recorded;
+}
+
+
+void HexagonGenInsert::collectInBlock(MachineBasicBlock *B,
+ OrderedRegisterList &AVs) {
+ if (isDebug())
+ dbgs() << "visiting block BB#" << B->getNumber() << "\n";
+
+ // First, check if this block is reachable at all. If not, the bit tracker
+ // will not have any information about registers in it.
+ if (!CMS->BT.reached(B))
+ return;
+
+ bool DoConst = OptConst;
+ // Keep a separate set of registers defined in this block, so that we
+ // can remove them from the list of available registers once all DT
+ // successors have been processed.
+ RegisterSet BlockDefs, InsDefs;
+ for (MachineBasicBlock::iterator I = B->begin(), E = B->end(); I != E; ++I) {
+ MachineInstr *MI = &*I;
+ InsDefs.clear();
+ getInstrDefs(MI, InsDefs);
+ // Leave copies and REG_SEQUENCEs alone. They are more transparent
+ // than "insert".
+ bool Skip = MI->isCopy() || MI->isRegSequence();
+
+ if (!Skip) {
+ // Visit all defined registers, and attempt to find the corresponding
+ // "insert" representations.
+ for (unsigned VR = InsDefs.find_first(); VR; VR = InsDefs.find_next(VR)) {
+ // Do not collect registers that are known to be compile-time cons-
+ // tants, unless requested.
+ if (!DoConst && isConstant(VR))
+ continue;
+ // If VR's cell contains a reference to VR, then VR cannot be defined
+ // via "insert". If VR is a constant that can be generated in a single
+ // instruction (without constant extenders), generating it via insert
+ // makes no sense.
+ if (findSelfReference(VR) || isSmallConstant(VR))
+ continue;
+
+ findRecordInsertForms(VR, AVs);
+ }
+ }
+
+ // Insert the defined registers into the list of available registers
+ // after they have been processed.
+ for (unsigned VR = InsDefs.find_first(); VR; VR = InsDefs.find_next(VR))
+ AVs.insert(VR);
+ BlockDefs.insert(InsDefs);
+ }
+
+ MachineDomTreeNode *N = MDT->getNode(B);
+ typedef GraphTraits<MachineDomTreeNode*> GTN;
+ typedef GTN::ChildIteratorType ChildIter;
+ for (ChildIter I = GTN::child_begin(N), E = GTN::child_end(N); I != E; ++I) {
+ MachineBasicBlock *SB = (*I)->getBlock();
+ collectInBlock(SB, AVs);
+ }
+
+ for (unsigned VR = BlockDefs.find_first(); VR; VR = BlockDefs.find_next(VR))
+ AVs.remove(VR);
+}
+
+
+void HexagonGenInsert::findRemovableRegisters(unsigned VR, IFRecord IF,
+ RegisterSet &RMs) const {
+ // For a given register VR and an insert form, find the registers that are
+ // used by the current definition of VR, and which would no longer be
+ // needed for it after the definition of VR is replaced with the insert
+ // form. These are the registers that could potentially become dead.
+ RegisterSet Regs[2];
+
+ unsigned S = 0; // Register set selector.
+ Regs[S].insert(VR);
+
+ while (!Regs[S].empty()) {
+ // Breadth-first search.
+ unsigned OtherS = 1-S;
+ Regs[OtherS].clear();
+ for (unsigned R = Regs[S].find_first(); R; R = Regs[S].find_next(R)) {
+ Regs[S].remove(R);
+ if (R == IF.SrcR || R == IF.InsR)
+ continue;
+ // Check if a given register has bits that are references to any other
+ // registers. This is to detect situations where the instruction that
+ // defines register R takes register Q as an operand, but R itself does
+ // not contain any bits from Q. Loads are examples of how this could
+ // happen:
+ // R = load Q
+ // In this case (assuming we do not have any knowledge about the loaded
+ // value), we must not treat R as a "conveyance" of the bits from Q.
+ // (The information in BT about R's bits would have them as constants,
+ // in case of zero-extending loads, or refs to R.)
+ if (!findNonSelfReference(R))
+ continue;
+ RMs.insert(R);
+ const MachineInstr *DefI = MRI->getVRegDef(R);
+ assert(DefI);
+ // Do not iterate past PHI nodes to avoid infinite loops. This can
+ // make the final set a bit less accurate, but the removable register
+ // sets are an approximation anyway.
+ if (DefI->isPHI())
+ continue;
+ getInstrUses(DefI, Regs[OtherS]);
+ }
+ S = OtherS;
+ }
+ // The register VR is added to the list as a side-effect of the algorithm,
+ // but it is not "potentially removable". A potentially removable register
+ // is one that may become unused (dead) after conversion to the insert form
+ // IF, and obviously VR (or its replacement) will not become dead by apply-
+ // ing IF.
+ RMs.remove(VR);
+}
+
+
+void HexagonGenInsert::computeRemovableRegisters() {
+ for (IFMapType::iterator I = IFMap.begin(), E = IFMap.end(); I != E; ++I) {
+ IFListType &LL = I->second;
+ for (unsigned i = 0, n = LL.size(); i < n; ++i)
+ findRemovableRegisters(I->first, LL[i].first, LL[i].second);
+ }
+}
+
+
+void HexagonGenInsert::pruneEmptyLists() {
+ // Remove all entries from the map, where the register has no insert forms
+ // associated with it.
+ typedef SmallVector<IFMapType::iterator,16> IterListType;
+ IterListType Prune;
+ for (IFMapType::iterator I = IFMap.begin(), E = IFMap.end(); I != E; ++I) {
+ if (I->second.size() == 0)
+ Prune.push_back(I);
+ }
+ for (unsigned i = 0, n = Prune.size(); i < n; ++i)
+ IFMap.erase(Prune[i]);
+}
+
+
+void HexagonGenInsert::pruneCoveredSets(unsigned VR) {
+ IFMapType::iterator F = IFMap.find(VR);
+ assert(F != IFMap.end());
+ IFListType &LL = F->second;
+
+ // First, examine the IF candidates for register VR whose removable-regis-
+ // ter sets are empty. This means that a given candidate will not help eli-
+ // minate any registers, but since "insert" is not a constant-extendable
+ // instruction, using such a candidate may reduce code size if the defini-
+ // tion of VR is constant-extended.
+ // If there exists a candidate with a non-empty set, the ones with empty
+ // sets will not be used and can be removed.
+ MachineInstr *DefVR = MRI->getVRegDef(VR);
+ bool DefEx = HII->isConstExtended(DefVR);
+ bool HasNE = false;
+ for (unsigned i = 0, n = LL.size(); i < n; ++i) {
+ if (LL[i].second.empty())
+ continue;
+ HasNE = true;
+ break;
+ }
+ if (!DefEx || HasNE) {
+ // The definition of VR is not constant-extended, or there is a candidate
+ // with a non-empty set. Remove all candidates with empty sets.
+ auto IsEmpty = [] (const IFRecordWithRegSet &IR) -> bool {
+ return IR.second.empty();
+ };
+ auto End = std::remove_if(LL.begin(), LL.end(), IsEmpty);
+ if (End != LL.end())
+ LL.erase(End, LL.end());
+ } else {
+ // The definition of VR is constant-extended, and all candidates have
+ // empty removable-register sets. Pick the maximum candidate, and remove
+ // all others. The "maximum" does not have any special meaning here, it
+ // is only so that the candidate that will remain on the list is selec-
+ // ted deterministically.
+ IFRecord MaxIF = LL[0].first;
+ for (unsigned i = 1, n = LL.size(); i < n; ++i) {
+ // If LL[MaxI] < LL[i], then MaxI = i.
+ const IFRecord &IF = LL[i].first;
+ unsigned M0 = BaseOrd[MaxIF.SrcR], M1 = BaseOrd[MaxIF.InsR];
+ unsigned R0 = BaseOrd[IF.SrcR], R1 = BaseOrd[IF.InsR];
+ if (M0 > R0)
+ continue;
+ if (M0 == R0) {
+ if (M1 > R1)
+ continue;
+ if (M1 == R1) {
+ if (MaxIF.Wdh > IF.Wdh)
+ continue;
+ if (MaxIF.Wdh == IF.Wdh && MaxIF.Off >= IF.Off)
+ continue;
+ }
+ }
+ // MaxIF < IF.
+ MaxIF = IF;
+ }
+ // Remove everything except the maximum candidate. All register sets
+ // are empty, so no need to preserve anything.
+ LL.clear();
+ LL.push_back(std::make_pair(MaxIF, RegisterSet()));
+ }
+
+ // Now, remove those whose sets of potentially removable registers are
+ // contained in another IF candidate for VR. For example, given these
+ // candidates for vreg45,
+ // %vreg45:
+ // (%vreg44,%vreg41,#9,#8), { %vreg42 }
+ // (%vreg43,%vreg41,#9,#8), { %vreg42 %vreg44 }
+ // remove the first one, since it is contained in the second one.
+ for (unsigned i = 0, n = LL.size(); i < n; ) {
+ const RegisterSet &RMi = LL[i].second;
+ unsigned j = 0;
+ while (j < n) {
+ if (j != i && LL[j].second.includes(RMi))
+ break;
+ j++;
+ }
+ if (j == n) { // RMi not contained in anything else.
+ i++;
+ continue;
+ }
+ LL.erase(LL.begin()+i);
+ n = LL.size();
+ }
+}
+
+
+void HexagonGenInsert::pruneUsesTooFar(unsigned VR, const UnsignedMap &RPO,
+ PairMapType &M) {
+ IFMapType::iterator F = IFMap.find(VR);
+ assert(F != IFMap.end());
+ IFListType &LL = F->second;
+ unsigned Cutoff = VRegDistCutoff;
+ const MachineInstr *DefV = MRI->getVRegDef(VR);
+
+ for (unsigned i = LL.size(); i > 0; --i) {
+ unsigned SR = LL[i-1].first.SrcR, IR = LL[i-1].first.InsR;
+ const MachineInstr *DefS = MRI->getVRegDef(SR);
+ const MachineInstr *DefI = MRI->getVRegDef(IR);
+ unsigned DSV = distance(DefS, DefV, RPO, M);
+ if (DSV < Cutoff) {
+ unsigned DIV = distance(DefI, DefV, RPO, M);
+ if (DIV < Cutoff)
+ continue;
+ }
+ LL.erase(LL.begin()+(i-1));
+ }
+}
+
+
+void HexagonGenInsert::pruneRegCopies(unsigned VR) {
+ IFMapType::iterator F = IFMap.find(VR);
+ assert(F != IFMap.end());
+ IFListType &LL = F->second;
+
+ auto IsCopy = [] (const IFRecordWithRegSet &IR) -> bool {
+ return IR.first.Wdh == 32 && (IR.first.Off == 0 || IR.first.Off == 32);
+ };
+ auto End = std::remove_if(LL.begin(), LL.end(), IsCopy);
+ if (End != LL.end())
+ LL.erase(End, LL.end());
+}
+
+
+void HexagonGenInsert::pruneCandidates() {
+ // Remove candidates that are not beneficial, regardless of the final
+ // selection method.
+ // First, remove candidates whose potentially removable set is a subset
+ // of another candidate's set.
+ for (IFMapType::iterator I = IFMap.begin(), E = IFMap.end(); I != E; ++I)
+ pruneCoveredSets(I->first);
+
+ UnsignedMap RPO;
+ typedef ReversePostOrderTraversal<const MachineFunction*> RPOTType;
+ RPOTType RPOT(MFN);
+ unsigned RPON = 0;
+ for (RPOTType::rpo_iterator I = RPOT.begin(), E = RPOT.end(); I != E; ++I)
+ RPO[(*I)->getNumber()] = RPON++;
+
+ PairMapType Memo; // Memoization map for distance calculation.
+ // Remove candidates that would use registers defined too far away.
+ for (IFMapType::iterator I = IFMap.begin(), E = IFMap.end(); I != E; ++I)
+ pruneUsesTooFar(I->first, RPO, Memo);
+
+ pruneEmptyLists();
+
+ for (IFMapType::iterator I = IFMap.begin(), E = IFMap.end(); I != E; ++I)
+ pruneRegCopies(I->first);
+}
+
+
+namespace {
+ // Class for comparing IF candidates for registers that have more than
+ // one of them. The smaller the candidate, according to this ordering,
+ // the better.
+ // First, compare the number of zeros in the associated potentially remova-
+ // ble register sets. "Zero" indicates that the register is very likely to
+ // become dead after this transformation.
+ // Second, compare "averages", i.e. use-count per size. The lower wins.
+ // After that, it does not really matter which one is smaller. Resolve
+ // the tie in some deterministic way.
+ struct IFOrdering {
+ IFOrdering(const UnsignedMap &UC, const RegisterOrdering &BO)
+ : UseC(UC), BaseOrd(BO) {}
+ bool operator() (const IFRecordWithRegSet &A,
+ const IFRecordWithRegSet &B) const;
+ private:
+ void stats(const RegisterSet &Rs, unsigned &Size, unsigned &Zero,
+ unsigned &Sum) const;
+ const UnsignedMap &UseC;
+ const RegisterOrdering &BaseOrd;
+ };
+}
+
+
+bool IFOrdering::operator() (const IFRecordWithRegSet &A,
+ const IFRecordWithRegSet &B) const {
+ unsigned SizeA = 0, ZeroA = 0, SumA = 0;
+ unsigned SizeB = 0, ZeroB = 0, SumB = 0;
+ stats(A.second, SizeA, ZeroA, SumA);
+ stats(B.second, SizeB, ZeroB, SumB);
+
+ // We will pick the minimum element. The more zeros, the better.
+ if (ZeroA != ZeroB)
+ return ZeroA > ZeroB;
+ // Compare SumA/SizeA with SumB/SizeB, lower is better.
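+ // Cross-multiply to avoid integer division (and division by zero for
+ // empty sets).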
+ uint64_t AvgA = SumA*SizeB, AvgB = SumB*SizeA;
+ if (AvgA != AvgB)
+ return AvgA < AvgB;
+
+ // The sets compare identical so far. Resort to comparing the IF records.
+ // The actual values don't matter, this is only for determinism.
+ unsigned OSA = BaseOrd[A.first.SrcR], OSB = BaseOrd[B.first.SrcR];
+ if (OSA != OSB)
+ return OSA < OSB;
+ unsigned OIA = BaseOrd[A.first.InsR], OIB = BaseOrd[B.first.InsR];
+ if (OIA != OIB)
+ return OIA < OIB;
+ if (A.first.Wdh != B.first.Wdh)
+ return A.first.Wdh < B.first.Wdh;
+ return A.first.Off < B.first.Off;
+}
+
+
+void IFOrdering::stats(const RegisterSet &Rs, unsigned &Size, unsigned &Zero,
+ unsigned &Sum) const {
+ for (unsigned R = Rs.find_first(); R; R = Rs.find_next(R)) {
+ UnsignedMap::const_iterator F = UseC.find(R);
+ assert(F != UseC.end());
+ unsigned UC = F->second;
+ if (UC == 0)
+ Zero++;
+ Sum += UC;
+ Size++;
+ }
+}
+
+
+void HexagonGenInsert::selectCandidates() {
+ // Some registers may have multiple valid candidates. Pick the best one
+ // (or decide not to use any).
+
+ // Compute the "removability" measure of R:
+ // For each potentially removable register R, record the number of regis-
+ // ters with IF candidates, where R appears in at least one set.
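+ // For example, if R appears in a removable set for two different
+ // registers that have candidates, RemC[R] will be 2.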
+ RegisterSet AllRMs;
+ UnsignedMap UseC, RemC;
+ IFMapType::iterator End = IFMap.end();
+
+ for (IFMapType::iterator I = IFMap.begin(); I != End; ++I) {
+ const IFListType &LL = I->second;
+ RegisterSet TT;
+ for (unsigned i = 0, n = LL.size(); i < n; ++i)
+ TT.insert(LL[i].second);
+ for (unsigned R = TT.find_first(); R; R = TT.find_next(R))
+ RemC[R]++;
+ AllRMs.insert(TT);
+ }
+
+ for (unsigned R = AllRMs.find_first(); R; R = AllRMs.find_next(R)) {
+ typedef MachineRegisterInfo::use_nodbg_iterator use_iterator;
+ typedef SmallSet<const MachineInstr*,16> InstrSet;
+ InstrSet UIs;
+ // Count as the number of instructions in which R is used, not the
+ // number of operands.
+ use_iterator E = MRI->use_nodbg_end();
+ for (use_iterator I = MRI->use_nodbg_begin(R); I != E; ++I)
+ UIs.insert(I->getParent());
+ unsigned C = UIs.size();
+ // Calculate a measure, which is the number of instructions using R,
+ // minus the "removability" count computed earlier.
+ unsigned D = RemC[R];
+ UseC[R] = (C > D) ? C-D : 0; // max(C-D,0), i.e. "doz" (difference or zero)
+ }
+
+
+ bool SelectAll0 = OptSelectAll0, SelectHas0 = OptSelectHas0;
+ if (!SelectAll0 && !SelectHas0)
+ SelectAll0 = true;
+
+ // The smaller the number UseC for a given register R, the "less used"
+ // R is aside from the opportunities for removal offered by generating
+ // "insert" instructions.
+ // Iterate over the IF map, and for those registers that have multiple
+ // candidates, pick the minimum one according to IFOrdering.
+ IFOrdering IFO(UseC, BaseOrd);
+ for (IFMapType::iterator I = IFMap.begin(); I != End; ++I) {
+ IFListType &LL = I->second;
+ if (LL.empty())
+ continue;
+ // Get the minimum element, remember it and clear the list. If the
+ // element found is adequate, we will put it back on the list, other-
+ // wise the list will remain empty, and the entry for this register
+ // will be removed (i.e. this register will not be replaced by insert).
+ IFListType::iterator MinI = std::min_element(LL.begin(), LL.end(), IFO);
+ assert(MinI != LL.end());
+ IFRecordWithRegSet M = *MinI;
+ LL.clear();
+
+ // We want to make sure that this replacement will have a chance to be
+ // beneficial, and that means that we want to have indication that some
+ // register will be removed. The most likely registers to be eliminated
+ // are the use operands in the definition of I->first. Accept/reject a
+ // candidate based on how many of its uses it can potentially eliminate.
+
+ RegisterSet Us;
+ const MachineInstr *DefI = MRI->getVRegDef(I->first);
+ getInstrUses(DefI, Us);
+ bool Accept = false;
+
+ if (SelectAll0) {
+ bool All0 = true;
+ for (unsigned R = Us.find_first(); R; R = Us.find_next(R)) {
+ if (UseC[R] == 0)
+ continue;
+ All0 = false;
+ break;
+ }
+ Accept = All0;
+ } else if (SelectHas0) {
+ bool Has0 = false;
+ for (unsigned R = Us.find_first(); R; R = Us.find_next(R)) {
+ if (UseC[R] != 0)
+ continue;
+ Has0 = true;
+ break;
+ }
+ Accept = Has0;
+ }
+ if (Accept)
+ LL.push_back(M);
+ }
+
+ // Remove candidates that add uses of removable registers, unless the
+ // removable registers are among replacement candidates.
+ // Recompute the removable registers, since some candidates may have
+ // been eliminated.
+ AllRMs.clear();
+ for (IFMapType::iterator I = IFMap.begin(); I != End; ++I) {
+ const IFListType &LL = I->second;
+ if (LL.size() > 0)
+ AllRMs.insert(LL[0].second);
+ }
+ for (IFMapType::iterator I = IFMap.begin(); I != End; ++I) {
+ IFListType &LL = I->second;
+ if (LL.size() == 0)
+ continue;
+ unsigned SR = LL[0].first.SrcR, IR = LL[0].first.InsR;
+ if (AllRMs[SR] || AllRMs[IR])
+ LL.clear();
+ }
+
+ pruneEmptyLists();
+}
+
+
+bool HexagonGenInsert::generateInserts() {
+ // Create a new register for each one from IFMap, and store them in the
+ // map.
+ UnsignedMap RegMap;
+ for (IFMapType::iterator I = IFMap.begin(), E = IFMap.end(); I != E; ++I) {
+ unsigned VR = I->first;
+ const TargetRegisterClass *RC = MRI->getRegClass(VR);
+ unsigned NewVR = MRI->createVirtualRegister(RC);
+ RegMap[VR] = NewVR;
+ }
+
+ // We can generate the "insert" instructions using potentially stale re-
+ // gisters: SrcR and InsR for a given VR may be among other registers that
+ // are also replaced. This is fine, we will do the mass "rauw" a bit later.
+ for (IFMapType::iterator I = IFMap.begin(), E = IFMap.end(); I != E; ++I) {
+ MachineInstr *MI = MRI->getVRegDef(I->first);
+ MachineBasicBlock &B = *MI->getParent();
+ DebugLoc DL = MI->getDebugLoc();
+ unsigned NewR = RegMap[I->first];
+ bool R32 = MRI->getRegClass(NewR) == &Hexagon::IntRegsRegClass;
+ const MCInstrDesc &D = R32 ? HII->get(Hexagon::S2_insert)
+ : HII->get(Hexagon::S2_insertp);
+ IFRecord IF = I->second[0].first;
+ unsigned Wdh = IF.Wdh, Off = IF.Off;
+ unsigned InsS = 0;
+ if (R32 && MRI->getRegClass(IF.InsR) == &Hexagon::DoubleRegsRegClass) {
+ InsS = Hexagon::subreg_loreg;
+ if (Off >= 32) {
+ InsS = Hexagon::subreg_hireg;
+ Off -= 32;
+ }
+ }
+ // Advance to the proper location for inserting instructions. This could
+ // be B.end().
+ MachineBasicBlock::iterator At = MI;
+ if (MI->isPHI())
+ At = B.getFirstNonPHI();
+
+ BuildMI(B, At, DL, D, NewR)
+ .addReg(IF.SrcR)
+ .addReg(IF.InsR, 0, InsS)
+ .addImm(Wdh)
+ .addImm(Off);
+
+ MRI->clearKillFlags(IF.SrcR);
+ MRI->clearKillFlags(IF.InsR);
+ }
+
+ for (IFMapType::iterator I = IFMap.begin(), E = IFMap.end(); I != E; ++I) {
+ MachineInstr *DefI = MRI->getVRegDef(I->first);
+ MRI->replaceRegWith(I->first, RegMap[I->first]);
+ DefI->eraseFromParent();
+ }
+
+ return true;
+}
+
+
+bool HexagonGenInsert::removeDeadCode(MachineDomTreeNode *N) {
+ bool Changed = false;
+ typedef GraphTraits<MachineDomTreeNode*> GTN;
+ for (auto I = GTN::child_begin(N), E = GTN::child_end(N); I != E; ++I)
+ Changed |= removeDeadCode(*I);
+
+ MachineBasicBlock *B = N->getBlock();
+ std::vector<MachineInstr*> Instrs;
+ for (auto I = B->rbegin(), E = B->rend(); I != E; ++I)
+ Instrs.push_back(&*I);
+
+ for (auto I = Instrs.begin(), E = Instrs.end(); I != E; ++I) {
+ MachineInstr *MI = *I;
+ unsigned Opc = MI->getOpcode();
+ // Do not touch lifetime markers. This is why the target-independent DCE
+ // cannot be used.
+ if (Opc == TargetOpcode::LIFETIME_START ||
+ Opc == TargetOpcode::LIFETIME_END)
+ continue;
+ bool Store = false;
+ if (MI->isInlineAsm() || !MI->isSafeToMove(nullptr, Store))
+ continue;
+
+ bool AllDead = true;
+ SmallVector<unsigned,2> Regs;
+ for (ConstMIOperands Op(MI); Op.isValid(); ++Op) {
+ if (!Op->isReg() || !Op->isDef())
+ continue;
+ unsigned R = Op->getReg();
+ if (!TargetRegisterInfo::isVirtualRegister(R) ||
+ !MRI->use_nodbg_empty(R)) {
+ AllDead = false;
+ break;
+ }
+ Regs.push_back(R);
+ }
+ if (!AllDead)
+ continue;
+
+ B->erase(MI);
+ for (unsigned I = 0, N = Regs.size(); I != N; ++I)
+ MRI->markUsesInDebugValueAsUndef(Regs[I]);
+ Changed = true;
+ }
+
+ return Changed;
+}
+
+
+bool HexagonGenInsert::runOnMachineFunction(MachineFunction &MF) {
+ bool Timing = OptTiming, TimingDetail = Timing && OptTimingDetail;
+ bool Changed = false;
+ TimerGroup __G("hexinsert");
+ NamedRegionTimer __T("hexinsert", Timing && !TimingDetail);
+
+ // Sanity check: one, but not both.
+ assert(!OptSelectAll0 || !OptSelectHas0);
+
+ IFMap.clear();
+ BaseOrd.clear();
+ CellOrd.clear();
+
+ const auto &ST = MF.getSubtarget<HexagonSubtarget>();
+ HII = ST.getInstrInfo();
+ HRI = ST.getRegisterInfo();
+ MFN = &MF;
+ MRI = &MF.getRegInfo();
+ MDT = &getAnalysis<MachineDominatorTree>();
+
+ // Clean up before any further processing, so that dead code does not
+ // get used in a newly generated "insert" instruction. Have a custom
+ // version of DCE that preserves lifetime markers. Without it, merging
+ // of stack objects can fail to recognize and merge disjoint objects
+ // leading to unnecessary stack growth.
+ Changed |= removeDeadCode(MDT->getRootNode());
+
+ const HexagonEvaluator HE(*HRI, *MRI, *HII, MF);
+ BitTracker BTLoc(HE, MF);
+ BTLoc.trace(isDebug());
+ BTLoc.run();
+ CellMapShadow MS(BTLoc);
+ CMS = &MS;
+
+ buildOrderingMF(BaseOrd);
+ buildOrderingBT(BaseOrd, CellOrd);
+
+ if (isDebug()) {
+ dbgs() << "Cell ordering:\n";
+ for (RegisterOrdering::iterator I = CellOrd.begin(), E = CellOrd.end();
+ I != E; ++I) {
+ unsigned VR = I->first, Pos = I->second;
+ dbgs() << PrintReg(VR, HRI) << " -> " << Pos << "\n";
+ }
+ }
+
+ // Collect candidates for conversion into the insert forms.
+ MachineBasicBlock *RootB = MDT->getRoot();
+ OrderedRegisterList AvailR(CellOrd);
+
+ {
+ NamedRegionTimer _T("collection", "hexinsert", TimingDetail);
+ collectInBlock(RootB, AvailR);
+ // Complete the information gathered in IFMap.
+ computeRemovableRegisters();
+ }
+
+ if (isDebug()) {
+ dbgs() << "Candidates after collection:\n";
+ dump_map();
+ }
+
+ if (IFMap.empty())
+ return false;
+
+ {
+ NamedRegionTimer _T("pruning", "hexinsert", TimingDetail);
+ pruneCandidates();
+ }
+
+ if (isDebug()) {
+ dbgs() << "Candidates after pruning:\n";
+ dump_map();
+ }
+
+ if (IFMap.empty())
+ return false;
+
+ {
+ NamedRegionTimer _T("selection", "hexinsert", TimingDetail);
+ selectCandidates();
+ }
+
+ if (isDebug()) {
+ dbgs() << "Candidates after selection:\n";
+ dump_map();
+ }
+
+ // Filter out vregs beyond the cutoff.
+ if (VRegIndexCutoff.getPosition()) {
+ unsigned Cutoff = VRegIndexCutoff;
+ typedef SmallVector<IFMapType::iterator,16> IterListType;
+ IterListType Out;
+ for (IFMapType::iterator I = IFMap.begin(), E = IFMap.end(); I != E; ++I) {
+ unsigned Idx = TargetRegisterInfo::virtReg2Index(I->first);
+ if (Idx >= Cutoff)
+ Out.push_back(I);
+ }
+ for (unsigned i = 0, n = Out.size(); i < n; ++i)
+ IFMap.erase(Out[i]);
+ }
+
+ {
+ NamedRegionTimer _T("generation", "hexinsert", TimingDetail);
+ Changed = generateInserts();
+ }
+
+ return Changed;
+}
+
+
+FunctionPass *llvm::createHexagonGenInsert() {
+ return new HexagonGenInsert();
+}
+
+
+//===----------------------------------------------------------------------===//
+// Public Constructor Functions
+//===----------------------------------------------------------------------===//
+
+INITIALIZE_PASS_BEGIN(HexagonGenInsert, "hexinsert",
+ "Hexagon generate \"insert\" instructions", false, false)
+INITIALIZE_PASS_DEPENDENCY(MachineDominatorTree)
+INITIALIZE_PASS_END(HexagonGenInsert, "hexinsert",
+ "Hexagon generate \"insert\" instructions", false, false)
diff --git a/lib/Target/Hexagon/HexagonGenPredicate.cpp b/lib/Target/Hexagon/HexagonGenPredicate.cpp
new file mode 100644
index 000000000000..6905c4f6d125
--- /dev/null
+++ b/lib/Target/Hexagon/HexagonGenPredicate.cpp
@@ -0,0 +1,525 @@
+//===--- HexagonGenPredicate.cpp ------------------------------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#define DEBUG_TYPE "gen-pred"
+
+#include "llvm/ADT/SetVector.h"
+#include "llvm/CodeGen/Passes.h"
+#include "llvm/CodeGen/MachineDominators.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineLoopInfo.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Target/TargetMachine.h"
+#include "llvm/Target/TargetInstrInfo.h"
+#include "HexagonTargetMachine.h"
+
+#include <functional>
+#include <queue>
+#include <set>
+#include <vector>
+
+using namespace llvm;
+
+namespace llvm {
+ void initializeHexagonGenPredicatePass(PassRegistry& Registry);
+ FunctionPass *createHexagonGenPredicate();
+}
+
+namespace {
+ struct Register {
+ unsigned R, S;
+ Register(unsigned r = 0, unsigned s = 0) : R(r), S(s) {}
+ Register(const MachineOperand &MO) : R(MO.getReg()), S(MO.getSubReg()) {}
+ bool operator== (const Register &Reg) const {
+ return R == Reg.R && S == Reg.S;
+ }
+ bool operator< (const Register &Reg) const {
+ return R < Reg.R || (R == Reg.R && S < Reg.S);
+ }
+ };
+ struct PrintRegister {
+ PrintRegister(Register R, const TargetRegisterInfo &I) : Reg(R), TRI(I) {}
+ friend raw_ostream &operator<< (raw_ostream &OS, const PrintRegister &PR);
+ private:
+ Register Reg;
+ const TargetRegisterInfo &TRI;
+ };
+ raw_ostream &operator<< (raw_ostream &OS, const PrintRegister &PR)
+ LLVM_ATTRIBUTE_UNUSED;
+ raw_ostream &operator<< (raw_ostream &OS, const PrintRegister &PR) {
+ return OS << PrintReg(PR.Reg.R, &PR.TRI, PR.Reg.S);
+ }
+
+ class HexagonGenPredicate : public MachineFunctionPass {
+ public:
+ static char ID;
+ HexagonGenPredicate() : MachineFunctionPass(ID), TII(0), TRI(0), MRI(0) {
+ initializeHexagonGenPredicatePass(*PassRegistry::getPassRegistry());
+ }
+ virtual const char *getPassName() const {
+ return "Hexagon generate predicate operations";
+ }
+ virtual void getAnalysisUsage(AnalysisUsage &AU) const {
+ AU.addRequired<MachineDominatorTree>();
+ AU.addPreserved<MachineDominatorTree>();
+ MachineFunctionPass::getAnalysisUsage(AU);
+ }
+ virtual bool runOnMachineFunction(MachineFunction &MF);
+
+ private:
+ typedef SetVector<MachineInstr*> VectOfInst;
+ typedef std::set<Register> SetOfReg;
+ typedef std::map<Register,Register> RegToRegMap;
+
+ const HexagonInstrInfo *TII;
+ const HexagonRegisterInfo *TRI;
+ MachineRegisterInfo *MRI;
+ SetOfReg PredGPRs;
+ VectOfInst PUsers;
+ RegToRegMap G2P;
+
+ bool isPredReg(unsigned R);
+ void collectPredicateGPR(MachineFunction &MF);
+ void processPredicateGPR(const Register &Reg);
+ unsigned getPredForm(unsigned Opc);
+ bool isConvertibleToPredForm(const MachineInstr *MI);
+ bool isScalarCmp(unsigned Opc);
+ bool isScalarPred(Register PredReg);
+ Register getPredRegFor(const Register &Reg);
+ bool convertToPredForm(MachineInstr *MI);
+ bool eliminatePredCopies(MachineFunction &MF);
+ };
+
+ char HexagonGenPredicate::ID = 0;
+}
+
+INITIALIZE_PASS_BEGIN(HexagonGenPredicate, "hexagon-gen-pred",
+ "Hexagon generate predicate operations", false, false)
+INITIALIZE_PASS_DEPENDENCY(MachineDominatorTree)
+INITIALIZE_PASS_END(HexagonGenPredicate, "hexagon-gen-pred",
+ "Hexagon generate predicate operations", false, false)
+
+bool HexagonGenPredicate::isPredReg(unsigned R) {
+ if (!TargetRegisterInfo::isVirtualRegister(R))
+ return false;
+ const TargetRegisterClass *RC = MRI->getRegClass(R);
+ return RC == &Hexagon::PredRegsRegClass;
+}
+
+
+unsigned HexagonGenPredicate::getPredForm(unsigned Opc) {
+ using namespace Hexagon;
+
+ switch (Opc) {
+ case A2_and:
+ case A2_andp:
+ return C2_and;
+ case A4_andn:
+ case A4_andnp:
+ return C2_andn;
+ case M4_and_and:
+ return C4_and_and;
+ case M4_and_andn:
+ return C4_and_andn;
+ case M4_and_or:
+ return C4_and_or;
+
+ case A2_or:
+ case A2_orp:
+ return C2_or;
+ case A4_orn:
+ case A4_ornp:
+ return C2_orn;
+ case M4_or_and:
+ return C4_or_and;
+ case M4_or_andn:
+ return C4_or_andn;
+ case M4_or_or:
+ return C4_or_or;
+
+ case A2_xor:
+ case A2_xorp:
+ return C2_xor;
+
+ case C2_tfrrp:
+ return COPY;
+ }
+ // The opcode corresponding to 0 is TargetOpcode::PHI. We can use 0 here
+ // to denote "none", but we need to make sure that none of the valid opcodes
+ // that we return will ever be 0.
+ assert(PHI == 0 && "Use different value for <none>");
+ return 0;
+}
+
+
+bool HexagonGenPredicate::isConvertibleToPredForm(const MachineInstr *MI) {
+ unsigned Opc = MI->getOpcode();
+ if (getPredForm(Opc) != 0)
+ return true;
+
+ // Comparisons against 0 are also convertible. This does not apply to
+ // A4_rcmpeqi or A4_rcmpneqi, since they produce values 0 or 1, which
+ // may not match the value that the predicate register would have if
+ // it was converted to a predicate form.
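+ // For example, C2_cmpeqi against 0, applied to a register holding a
+ // predicate value, is turned into a predicate negation (C2_not) in
+ // convertToPredForm.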
+ switch (Opc) {
+ case Hexagon::C2_cmpeqi:
+ case Hexagon::C4_cmpneqi:
+ if (MI->getOperand(2).isImm() && MI->getOperand(2).getImm() == 0)
+ return true;
+ break;
+ }
+ return false;
+}
+
+
+void HexagonGenPredicate::collectPredicateGPR(MachineFunction &MF) {
+ for (MachineFunction::iterator A = MF.begin(), Z = MF.end(); A != Z; ++A) {
+ MachineBasicBlock &B = *A;
+ for (MachineBasicBlock::iterator I = B.begin(), E = B.end(); I != E; ++I) {
+ MachineInstr *MI = &*I;
+ unsigned Opc = MI->getOpcode();
+ switch (Opc) {
+ case Hexagon::C2_tfrpr:
+ case TargetOpcode::COPY:
+ if (isPredReg(MI->getOperand(1).getReg())) {
+ Register RD = MI->getOperand(0);
+ if (TargetRegisterInfo::isVirtualRegister(RD.R))
+ PredGPRs.insert(RD);
+ }
+ break;
+ }
+ }
+ }
+}
+
+
+void HexagonGenPredicate::processPredicateGPR(const Register &Reg) {
+ DEBUG(dbgs() << LLVM_FUNCTION_NAME << ": "
+ << PrintReg(Reg.R, TRI, Reg.S) << "\n");
+ typedef MachineRegisterInfo::use_iterator use_iterator;
+ use_iterator I = MRI->use_begin(Reg.R), E = MRI->use_end();
+ if (I == E) {
+ DEBUG(dbgs() << "Dead reg: " << PrintReg(Reg.R, TRI, Reg.S) << '\n');
+ MachineInstr *DefI = MRI->getVRegDef(Reg.R);
+ DefI->eraseFromParent();
+ return;
+ }
+
+ for (; I != E; ++I) {
+ MachineInstr *UseI = I->getParent();
+ if (isConvertibleToPredForm(UseI))
+ PUsers.insert(UseI);
+ }
+}
+
+
+Register HexagonGenPredicate::getPredRegFor(const Register &Reg) {
+ // Create a predicate register for a given Reg. The newly created register
+ // will have its value copied from Reg, so that it can be later used as
+ // an operand in other instructions.
+ assert(TargetRegisterInfo::isVirtualRegister(Reg.R));
+ RegToRegMap::iterator F = G2P.find(Reg);
+ if (F != G2P.end())
+ return F->second;
+
+ DEBUG(dbgs() << LLVM_FUNCTION_NAME << ": " << PrintRegister(Reg, *TRI));
+ MachineInstr *DefI = MRI->getVRegDef(Reg.R);
+ assert(DefI);
+ unsigned Opc = DefI->getOpcode();
+ if (Opc == Hexagon::C2_tfrpr || Opc == TargetOpcode::COPY) {
+ assert(DefI->getOperand(0).isDef() && DefI->getOperand(1).isUse());
+ Register PR = DefI->getOperand(1);
+ G2P.insert(std::make_pair(Reg, PR));
+ DEBUG(dbgs() << " -> " << PrintRegister(PR, *TRI) << '\n');
+ return PR;
+ }
+
+ MachineBasicBlock &B = *DefI->getParent();
+ DebugLoc DL = DefI->getDebugLoc();
+ const TargetRegisterClass *PredRC = &Hexagon::PredRegsRegClass;
+ unsigned NewPR = MRI->createVirtualRegister(PredRC);
+
+ // For convertible instructions, do not modify them, so that they can
+ // be converted later. Generate a copy from Reg to NewPR.
+ if (isConvertibleToPredForm(DefI)) {
+ MachineBasicBlock::iterator DefIt = DefI;
+ BuildMI(B, std::next(DefIt), DL, TII->get(TargetOpcode::COPY), NewPR)
+ .addReg(Reg.R, 0, Reg.S);
+ G2P.insert(std::make_pair(Reg, Register(NewPR)));
+ DEBUG(dbgs() << " -> !" << PrintRegister(Register(NewPR), *TRI) << '\n');
+ return Register(NewPR);
+ }
+
+ llvm_unreachable("Invalid argument");
+}
+
+
+bool HexagonGenPredicate::isScalarCmp(unsigned Opc) {
+ switch (Opc) {
+ case Hexagon::C2_cmpeq:
+ case Hexagon::C2_cmpgt:
+ case Hexagon::C2_cmpgtu:
+ case Hexagon::C2_cmpeqp:
+ case Hexagon::C2_cmpgtp:
+ case Hexagon::C2_cmpgtup:
+ case Hexagon::C2_cmpeqi:
+ case Hexagon::C2_cmpgti:
+ case Hexagon::C2_cmpgtui:
+ case Hexagon::C2_cmpgei:
+ case Hexagon::C2_cmpgeui:
+ case Hexagon::C4_cmpneqi:
+ case Hexagon::C4_cmpltei:
+ case Hexagon::C4_cmplteui:
+ case Hexagon::C4_cmpneq:
+ case Hexagon::C4_cmplte:
+ case Hexagon::C4_cmplteu:
+ case Hexagon::A4_cmpbeq:
+ case Hexagon::A4_cmpbeqi:
+ case Hexagon::A4_cmpbgtu:
+ case Hexagon::A4_cmpbgtui:
+ case Hexagon::A4_cmpbgt:
+ case Hexagon::A4_cmpbgti:
+ case Hexagon::A4_cmpheq:
+ case Hexagon::A4_cmphgt:
+ case Hexagon::A4_cmphgtu:
+ case Hexagon::A4_cmpheqi:
+ case Hexagon::A4_cmphgti:
+ case Hexagon::A4_cmphgtui:
+ return true;
+ }
+ return false;
+}
+
+
+bool HexagonGenPredicate::isScalarPred(Register PredReg) {
+ std::queue<Register> WorkQ;
+ WorkQ.push(PredReg);
+
+ while (!WorkQ.empty()) {
+ Register PR = WorkQ.front();
+ WorkQ.pop();
+ const MachineInstr *DefI = MRI->getVRegDef(PR.R);
+ if (!DefI)
+ return false;
+ unsigned DefOpc = DefI->getOpcode();
+ switch (DefOpc) {
+ case TargetOpcode::COPY: {
+ const TargetRegisterClass *PredRC = &Hexagon::PredRegsRegClass;
+ if (MRI->getRegClass(PR.R) != PredRC)
+ return false;
+ // If it is a copy between two predicate registers, fall through.
+ }
+ case Hexagon::C2_and:
+ case Hexagon::C2_andn:
+ case Hexagon::C4_and_and:
+ case Hexagon::C4_and_andn:
+ case Hexagon::C4_and_or:
+ case Hexagon::C2_or:
+ case Hexagon::C2_orn:
+ case Hexagon::C4_or_and:
+ case Hexagon::C4_or_andn:
+ case Hexagon::C4_or_or:
+ case Hexagon::C4_or_orn:
+ case Hexagon::C2_xor:
+ // Add operands to the queue.
+ for (ConstMIOperands Mo(DefI); Mo.isValid(); ++Mo)
+ if (Mo->isReg() && Mo->isUse())
+ WorkQ.push(Register(Mo->getReg()));
+ break;
+
+ // All non-vector compares are ok, everything else is bad.
+ default:
+ return isScalarCmp(DefOpc);
+ }
+ }
+
+ return true;
+}
+
+
+bool HexagonGenPredicate::convertToPredForm(MachineInstr *MI) {
+ DEBUG(dbgs() << LLVM_FUNCTION_NAME << ": " << MI << " " << *MI);
+
+ unsigned Opc = MI->getOpcode();
+ assert(isConvertibleToPredForm(MI));
+ unsigned NumOps = MI->getNumOperands();
+ for (unsigned i = 0; i < NumOps; ++i) {
+ MachineOperand &MO = MI->getOperand(i);
+ if (!MO.isReg() || !MO.isUse())
+ continue;
+ Register Reg(MO);
+ if (Reg.S && Reg.S != Hexagon::subreg_loreg)
+ return false;
+ if (!PredGPRs.count(Reg))
+ return false;
+ }
+
+ MachineBasicBlock &B = *MI->getParent();
+ DebugLoc DL = MI->getDebugLoc();
+
+ unsigned NewOpc = getPredForm(Opc);
+ // Special case for comparisons against 0.
+ if (NewOpc == 0) {
+ switch (Opc) {
+ case Hexagon::C2_cmpeqi:
+ NewOpc = Hexagon::C2_not;
+ break;
+ case Hexagon::C4_cmpneqi:
+ NewOpc = TargetOpcode::COPY;
+ break;
+ default:
+ return false;
+ }
+
+ // If it's a scalar predicate register, then all bits in it are
+ // the same. Otherwise, to determine whether all bits are 0 or not
+ // we would need to use any8.
+ Register PR = getPredRegFor(MI->getOperand(1));
+ if (!isScalarPred(PR))
+ return false;
+ // This will skip the immediate argument when creating the predicate
+ // version of the instruction.
+ NumOps = 2;
+ }
+
+ // Some sanity: check that def is in operand #0.
+ MachineOperand &Op0 = MI->getOperand(0);
+ assert(Op0.isDef());
+ Register OutR(Op0);
+
+ // Don't use getPredRegFor, since it will create an association between
+ // the argument and a created predicate register (i.e. it will insert a
+ // copy if a new predicate register is created).
+ const TargetRegisterClass *PredRC = &Hexagon::PredRegsRegClass;
+ Register NewPR = MRI->createVirtualRegister(PredRC);
+ MachineInstrBuilder MIB = BuildMI(B, MI, DL, TII->get(NewOpc), NewPR.R);
+
+ // Add predicate counterparts of the GPRs.
+ for (unsigned i = 1; i < NumOps; ++i) {
+ Register GPR = MI->getOperand(i);
+ Register Pred = getPredRegFor(GPR);
+ MIB.addReg(Pred.R, 0, Pred.S);
+ }
+ DEBUG(dbgs() << "generated: " << *MIB);
+
+ // Generate a copy-out: NewGPR = NewPR, and replace all uses of OutR
+ // with NewGPR.
+ const TargetRegisterClass *RC = MRI->getRegClass(OutR.R);
+ unsigned NewOutR = MRI->createVirtualRegister(RC);
+ BuildMI(B, MI, DL, TII->get(TargetOpcode::COPY), NewOutR)
+ .addReg(NewPR.R, 0, NewPR.S);
+ MRI->replaceRegWith(OutR.R, NewOutR);
+ MI->eraseFromParent();
+
+ // If the processed instruction was C2_tfrrp (i.e. Rn = Pm; Pk = Rn),
+ // then the output will be a predicate register. Do not visit the
+ // users of it.
+ if (!isPredReg(NewOutR)) {
+ Register R(NewOutR);
+ PredGPRs.insert(R);
+ processPredicateGPR(R);
+ }
+ return true;
+}
+
+
+bool HexagonGenPredicate::eliminatePredCopies(MachineFunction &MF) {
+ DEBUG(dbgs() << LLVM_FUNCTION_NAME << "\n");
+ const TargetRegisterClass *PredRC = &Hexagon::PredRegsRegClass;
+ bool Changed = false;
+ VectOfInst Erase;
+
+ // First, replace copies
+ // IntR = PredR1
+ // PredR2 = IntR
+ // with
+ // PredR2 = PredR1
+ // Such sequences can arise when a copy-into-predicate is generated from
+ // a GPR holding the result of a convertible instruction. After the
+ // convertible instruction is converted, its predicate result will be
+ // copied back into the original GPR.
+
+ for (MachineFunction::iterator A = MF.begin(), Z = MF.end(); A != Z; ++A) {
+ MachineBasicBlock &B = *A;
+ for (MachineBasicBlock::iterator I = B.begin(), E = B.end(); I != E; ++I) {
+ if (I->getOpcode() != TargetOpcode::COPY)
+ continue;
+ Register DR = I->getOperand(0);
+ Register SR = I->getOperand(1);
+ if (!TargetRegisterInfo::isVirtualRegister(DR.R))
+ continue;
+ if (!TargetRegisterInfo::isVirtualRegister(SR.R))
+ continue;
+ if (MRI->getRegClass(DR.R) != PredRC)
+ continue;
+ if (MRI->getRegClass(SR.R) != PredRC)
+ continue;
+ assert(!DR.S && !SR.S && "Unexpected subregister");
+ MRI->replaceRegWith(DR.R, SR.R);
+ Erase.insert(I);
+ Changed = true;
+ }
+ }
+
+ for (VectOfInst::iterator I = Erase.begin(), E = Erase.end(); I != E; ++I)
+ (*I)->eraseFromParent();
+
+ return Changed;
+}
+
+
+bool HexagonGenPredicate::runOnMachineFunction(MachineFunction &MF) {
+ TII = MF.getSubtarget<HexagonSubtarget>().getInstrInfo();
+ TRI = MF.getSubtarget<HexagonSubtarget>().getRegisterInfo();
+ MRI = &MF.getRegInfo();
+ PredGPRs.clear();
+ PUsers.clear();
+ G2P.clear();
+
+ bool Changed = false;
+ collectPredicateGPR(MF);
+ for (SetOfReg::iterator I = PredGPRs.begin(), E = PredGPRs.end(); I != E; ++I)
+ processPredicateGPR(*I);
+
+ bool Again;
+ do {
+ Again = false;
+ VectOfInst Processed, Copy;
+
+ typedef VectOfInst::iterator iterator;
+ Copy = PUsers;
+ for (iterator I = Copy.begin(), E = Copy.end(); I != E; ++I) {
+ MachineInstr *MI = *I;
+ bool Done = convertToPredForm(MI);
+ if (Done) {
+ Processed.insert(MI);
+ Again = true;
+ }
+ }
+ Changed |= Again;
+
+ auto Done = [Processed] (MachineInstr *MI) -> bool {
+ return Processed.count(MI);
+ };
+ PUsers.remove_if(Done);
+ } while (Again);
+
+ Changed |= eliminatePredCopies(MF);
+ return Changed;
+}
+
+
+FunctionPass *llvm::createHexagonGenPredicate() {
+ return new HexagonGenPredicate();
+}
+
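Illustrative only, not part of the patch: the kind of source this pass targets. Both comparisons below produce predicate registers, and the bitwise '&' on their zero-extended results is the sort of GPR operation that convertToPredForm is meant to rewrite into a C2_and on predicate registers, with eliminatePredCopies then cleaning up the remaining transfer copies.

// Hypothetical example; the function name is illustrative.
int both_zero(int a, int b) {
  return (a == 0) & (b == 0);
}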
diff --git a/lib/Target/Hexagon/HexagonISelLowering.cpp b/lib/Target/Hexagon/HexagonISelLowering.cpp
index 6e9e69f5a2c7..c739afb70c15 100644
--- a/lib/Target/Hexagon/HexagonISelLowering.cpp
+++ b/lib/Target/Hexagon/HexagonISelLowering.cpp
@@ -459,6 +459,7 @@ HexagonTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
bool IsStructRet = (Outs.empty()) ? false : Outs[0].Flags.isSRet();
MachineFunction &MF = DAG.getMachineFunction();
+ auto PtrVT = getPointerTy(MF.getDataLayout());
// Check for varargs.
int NumNamedVarArgParams = -1;
@@ -515,8 +516,8 @@ HexagonTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
SmallVector<SDValue, 8> MemOpChains;
auto &HRI = *Subtarget.getRegisterInfo();
- SDValue StackPtr = DAG.getCopyFromReg(Chain, dl, HRI.getStackRegister(),
- getPointerTy());
+ SDValue StackPtr =
+ DAG.getCopyFromReg(Chain, dl, HRI.getStackRegister(), PtrVT);
// Walk the register/memloc assignments, inserting copies/loads.
for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
@@ -574,7 +575,7 @@ HexagonTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, MemOpChains);
if (!isTailCall) {
- SDValue C = DAG.getConstant(NumBytes, dl, getPointerTy(), true);
+ SDValue C = DAG.getConstant(NumBytes, dl, PtrVT, true);
Chain = DAG.getCALLSEQ_START(Chain, C, dl);
}
@@ -615,13 +616,13 @@ HexagonTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
if (flag_aligned_memcpy) {
const char *MemcpyName =
"__hexagon_memcpy_likely_aligned_min32bytes_mult8bytes";
- Callee = DAG.getTargetExternalSymbol(MemcpyName, getPointerTy());
+ Callee = DAG.getTargetExternalSymbol(MemcpyName, PtrVT);
flag_aligned_memcpy = false;
} else if (GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee)) {
- Callee = DAG.getTargetGlobalAddress(G->getGlobal(), dl, getPointerTy());
+ Callee = DAG.getTargetGlobalAddress(G->getGlobal(), dl, PtrVT);
} else if (ExternalSymbolSDNode *S =
dyn_cast<ExternalSymbolSDNode>(Callee)) {
- Callee = DAG.getTargetExternalSymbol(S->getSymbol(), getPointerTy());
+ Callee = DAG.getTargetExternalSymbol(S->getSymbol(), PtrVT);
}
// Returns a chain & a flag for retval copy to use.
@@ -811,8 +812,8 @@ LowerBR_JT(SDValue Op, SelectionDAG &DAG) const
BlockAddress::get(const_cast<BasicBlock *>(MBB->getBasicBlock()));
}
- SDValue JumpTableBase = DAG.getNode(HexagonISD::JT, dl,
- getPointerTy(), TargetJT);
+ SDValue JumpTableBase = DAG.getNode(
+ HexagonISD::JT, dl, getPointerTy(DAG.getDataLayout()), TargetJT);
SDValue ShiftIndex = DAG.getNode(ISD::SHL, dl, MVT::i32, Index,
DAG.getConstant(2, dl, MVT::i32));
SDValue JTAddress = DAG.getNode(ISD::ADD, dl, MVT::i32, JumpTableBase,
@@ -1231,16 +1232,17 @@ SDValue HexagonTargetLowering::LowerGLOBALADDRESS(SDValue Op,
const GlobalValue *GV = cast<GlobalAddressSDNode>(Op)->getGlobal();
int64_t Offset = cast<GlobalAddressSDNode>(Op)->getOffset();
SDLoc dl(Op);
- Result = DAG.getTargetGlobalAddress(GV, dl, getPointerTy(), Offset);
+ auto PtrVT = getPointerTy(DAG.getDataLayout());
+ Result = DAG.getTargetGlobalAddress(GV, dl, PtrVT, Offset);
const HexagonTargetObjectFile *TLOF =
static_cast<const HexagonTargetObjectFile *>(
getTargetMachine().getObjFileLowering());
if (TLOF->IsGlobalInSmallSection(GV, getTargetMachine())) {
- return DAG.getNode(HexagonISD::CONST32_GP, dl, getPointerTy(), Result);
+ return DAG.getNode(HexagonISD::CONST32_GP, dl, PtrVT, Result);
}
- return DAG.getNode(HexagonISD::CONST32, dl, getPointerTy(), Result);
+ return DAG.getNode(HexagonISD::CONST32, dl, PtrVT, Result);
}
// Specifies that for loads and stores VT can be promoted to PromotedLdStVT.
@@ -1261,7 +1263,8 @@ HexagonTargetLowering::LowerBlockAddress(SDValue Op, SelectionDAG &DAG) const {
const BlockAddress *BA = cast<BlockAddressSDNode>(Op)->getBlockAddress();
SDValue BA_SD = DAG.getTargetBlockAddress(BA, MVT::i32);
SDLoc dl(Op);
- return DAG.getNode(HexagonISD::CONST32_GP, dl, getPointerTy(), BA_SD);
+ return DAG.getNode(HexagonISD::CONST32_GP, dl,
+ getPointerTy(DAG.getDataLayout()), BA_SD);
}
//===----------------------------------------------------------------------===//
@@ -2254,6 +2257,7 @@ HexagonTargetLowering::LowerEH_RETURN(SDValue Op, SelectionDAG &DAG) const {
SDValue Offset = Op.getOperand(1);
SDValue Handler = Op.getOperand(2);
SDLoc dl(Op);
+ auto PtrVT = getPointerTy(DAG.getDataLayout());
// Mark function as containing a call to EH_RETURN.
HexagonMachineFunctionInfo *FuncInfo =
@@ -2262,9 +2266,9 @@ HexagonTargetLowering::LowerEH_RETURN(SDValue Op, SelectionDAG &DAG) const {
unsigned OffsetReg = Hexagon::R28;
- SDValue StoreAddr = DAG.getNode(ISD::ADD, dl, getPointerTy(),
- DAG.getRegister(Hexagon::R30, getPointerTy()),
- DAG.getIntPtrConstant(4, dl));
+ SDValue StoreAddr =
+ DAG.getNode(ISD::ADD, dl, PtrVT, DAG.getRegister(Hexagon::R30, PtrVT),
+ DAG.getIntPtrConstant(4, dl));
Chain = DAG.getStore(Chain, dl, Handler, StoreAddr, MachinePointerInfo(),
false, false, 0);
Chain = DAG.getCopyToReg(Chain, dl, OffsetReg, Offset);
@@ -2338,8 +2342,7 @@ HexagonTargetLowering::EmitInstrWithCustomInserter(MachineInstr *MI,
std::pair<unsigned, const TargetRegisterClass *>
HexagonTargetLowering::getRegForInlineAsmConstraint(
- const TargetRegisterInfo *TRI, const std::string &Constraint,
- MVT VT) const {
+ const TargetRegisterInfo *TRI, StringRef Constraint, MVT VT) const {
if (Constraint.size() == 1) {
switch (Constraint[0]) {
case 'r': // R0-R31
@@ -2372,8 +2375,8 @@ bool HexagonTargetLowering::isFPImmLegal(const APFloat &Imm, EVT VT) const {
/// isLegalAddressingMode - Return true if the addressing mode represented by
/// AM is legal for this target, for a load/store of the specified type.
-bool HexagonTargetLowering::isLegalAddressingMode(const AddrMode &AM,
- Type *Ty,
+bool HexagonTargetLowering::isLegalAddressingMode(const DataLayout &DL,
+ const AddrMode &AM, Type *Ty,
unsigned AS) const {
// Allows a signed-extended 11-bit immediate field.
if (AM.BaseOffs <= -(1LL << 13) || AM.BaseOffs >= (1LL << 13)-1)
@@ -2463,3 +2466,45 @@ bool llvm::isPositiveHalfWord(SDNode *N) {
return true;
}
}
+
+Value *HexagonTargetLowering::emitLoadLinked(IRBuilder<> &Builder, Value *Addr,
+ AtomicOrdering Ord) const {
+ BasicBlock *BB = Builder.GetInsertBlock();
+ Module *M = BB->getParent()->getParent();
+ Type *Ty = cast<PointerType>(Addr->getType())->getElementType();
+ unsigned SZ = Ty->getPrimitiveSizeInBits();
+ assert((SZ == 32 || SZ == 64) && "Only 32/64-bit atomic loads supported");
+ Intrinsic::ID IntID = (SZ == 32) ? Intrinsic::hexagon_L2_loadw_locked
+ : Intrinsic::hexagon_L4_loadd_locked;
+ Value *Fn = Intrinsic::getDeclaration(M, IntID);
+ return Builder.CreateCall(Fn, Addr, "larx");
+}
+
+/// Perform a store-conditional operation to Addr. Return the status of the
+/// store. This should be 0 if the store succeeded, non-zero otherwise.
+Value *HexagonTargetLowering::emitStoreConditional(IRBuilder<> &Builder,
+ Value *Val, Value *Addr, AtomicOrdering Ord) const {
+ BasicBlock *BB = Builder.GetInsertBlock();
+ Module *M = BB->getParent()->getParent();
+ Type *Ty = Val->getType();
+ unsigned SZ = Ty->getPrimitiveSizeInBits();
+ assert((SZ == 32 || SZ == 64) && "Only 32/64-bit atomic stores supported");
+ Intrinsic::ID IntID = (SZ == 32) ? Intrinsic::hexagon_S2_storew_locked
+ : Intrinsic::hexagon_S4_stored_locked;
+ Value *Fn = Intrinsic::getDeclaration(M, IntID);
+ Value *Call = Builder.CreateCall(Fn, {Addr, Val}, "stcx");
+ Value *Cmp = Builder.CreateICmpEQ(Call, Builder.getInt32(0), "");
+ Value *Ext = Builder.CreateZExt(Cmp, Type::getInt32Ty(M->getContext()));
+ return Ext;
+}
+
+bool HexagonTargetLowering::shouldExpandAtomicLoadInIR(LoadInst *LI) const {
+ // Do not expand loads and stores that don't exceed 64 bits.
+ return LI->getType()->getPrimitiveSizeInBits() > 64;
+}
+
+bool HexagonTargetLowering::shouldExpandAtomicStoreInIR(StoreInst *SI) const {
+ // Do not expand loads and stores that don't exceed 64 bits.
+ return SI->getValueOperand()->getType()->getPrimitiveSizeInBits() > 64;
+}
+
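A small usage illustration, not part of the patch: with hasLoadLinkedStoreConditional() returning true and shouldExpandAtomicRMWInIR() requesting LLSC expansion (see the header changes below), an atomic RMW such as the fetch_add here is what AtomicExpandPass, scheduled via createAtomicExpandPass in addIRPasses further down, rewrites into a retry loop built from the emitLoadLinked and emitStoreConditional hooks above.

#include <atomic>

// Illustrative only: expands to an L2_loadw_locked / S2_storew_locked retry
// loop when compiled for Hexagon with this patch applied.
int bump(std::atomic<int> &Counter) {
  return Counter.fetch_add(1, std::memory_order_seq_cst);
}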
diff --git a/lib/Target/Hexagon/HexagonISelLowering.h b/lib/Target/Hexagon/HexagonISelLowering.h
index b80e8477eb7b..2642abffaddd 100644
--- a/lib/Target/Hexagon/HexagonISelLowering.h
+++ b/lib/Target/Hexagon/HexagonISelLowering.h
@@ -165,7 +165,8 @@ bool isPositiveHalfWord(SDNode *N);
SDValue LowerVASTART(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerConstantPool(SDValue Op, SelectionDAG &DAG) const;
- EVT getSetCCResultType(LLVMContext &C, EVT VT) const override {
+ EVT getSetCCResultType(const DataLayout &, LLVMContext &C,
+ EVT VT) const override {
if (!VT.isVector())
return MVT::i1;
else
@@ -179,11 +180,10 @@ bool isPositiveHalfWord(SDNode *N);
std::pair<unsigned, const TargetRegisterClass *>
getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI,
- const std::string &Constraint,
- MVT VT) const override;
+ StringRef Constraint, MVT VT) const override;
- unsigned getInlineAsmMemConstraint(
- const std::string &ConstraintCode) const override {
+ unsigned
+ getInlineAsmMemConstraint(StringRef ConstraintCode) const override {
if (ConstraintCode == "o")
return InlineAsm::Constraint_o;
else if (ConstraintCode == "v")
@@ -198,8 +198,8 @@ bool isPositiveHalfWord(SDNode *N);
/// The type may be VoidTy, in which case only return true if the addressing
/// mode is legal for a load/store of any legal type.
/// TODO: Handle pre/postinc as well.
- bool isLegalAddressingMode(const AddrMode &AM, Type *Ty,
- unsigned AS) const override;
+ bool isLegalAddressingMode(const DataLayout &DL, const AddrMode &AM,
+ Type *Ty, unsigned AS) const override;
bool isFPImmLegal(const APFloat &Imm, EVT VT) const override;
/// isLegalICmpImmediate - Return true if the specified immediate is legal
@@ -207,6 +207,21 @@ bool isPositiveHalfWord(SDNode *N);
/// compare a register against the immediate without having to materialize
/// the immediate into a register.
bool isLegalICmpImmediate(int64_t Imm) const override;
+
+ // Handling of atomic RMW instructions.
+ bool hasLoadLinkedStoreConditional() const override {
+ return true;
+ }
+ Value *emitLoadLinked(IRBuilder<> &Builder, Value *Addr,
+ AtomicOrdering Ord) const override;
+ Value *emitStoreConditional(IRBuilder<> &Builder, Value *Val,
+ Value *Addr, AtomicOrdering Ord) const override;
+ bool shouldExpandAtomicLoadInIR(LoadInst *LI) const override;
+ bool shouldExpandAtomicStoreInIR(StoreInst *SI) const override;
+ AtomicRMWExpansionKind shouldExpandAtomicRMWInIR(AtomicRMWInst *AI)
+ const override {
+ return AtomicRMWExpansionKind::LLSC;
+ }
};
} // end namespace llvm
diff --git a/lib/Target/Hexagon/HexagonRegisterInfo.cpp b/lib/Target/Hexagon/HexagonRegisterInfo.cpp
index 8f255a08f534..f6bb4a045438 100644
--- a/lib/Target/Hexagon/HexagonRegisterInfo.cpp
+++ b/lib/Target/Hexagon/HexagonRegisterInfo.cpp
@@ -221,7 +221,7 @@ unsigned HexagonRegisterInfo::getRARegister() const {
unsigned HexagonRegisterInfo::getFrameRegister(const MachineFunction
&MF) const {
- const TargetFrameLowering *TFI = MF.getSubtarget().getFrameLowering();
+ const HexagonFrameLowering *TFI = getFrameLowering(MF);
if (TFI->hasFP(MF))
return Hexagon::R30;
return Hexagon::R29;
@@ -240,7 +240,8 @@ unsigned HexagonRegisterInfo::getStackRegister() const {
bool
HexagonRegisterInfo::useFPForScavengingIndex(const MachineFunction &MF) const {
- return MF.getSubtarget().getFrameLowering()->hasFP(MF);
+ const HexagonFrameLowering *TFI = getFrameLowering(MF);
+ return TFI->hasFP(MF);
}
diff --git a/lib/Target/Hexagon/HexagonSelectionDAGInfo.cpp b/lib/Target/Hexagon/HexagonSelectionDAGInfo.cpp
index b5db997eb1b8..276cc69eed0f 100644
--- a/lib/Target/Hexagon/HexagonSelectionDAGInfo.cpp
+++ b/lib/Target/Hexagon/HexagonSelectionDAGInfo.cpp
@@ -18,12 +18,6 @@ using namespace llvm;
bool llvm::flag_aligned_memcpy;
-HexagonSelectionDAGInfo::HexagonSelectionDAGInfo(const DataLayout &DL)
- : TargetSelectionDAGInfo(&DL) {}
-
-HexagonSelectionDAGInfo::~HexagonSelectionDAGInfo() {
-}
-
SDValue
HexagonSelectionDAGInfo::
EmitTargetCodeForMemcpy(SelectionDAG &DAG, SDLoc dl, SDValue Chain,
diff --git a/lib/Target/Hexagon/HexagonSelectionDAGInfo.h b/lib/Target/Hexagon/HexagonSelectionDAGInfo.h
index 8ac2e43f9294..80ac5d7bd9e2 100644
--- a/lib/Target/Hexagon/HexagonSelectionDAGInfo.h
+++ b/lib/Target/Hexagon/HexagonSelectionDAGInfo.h
@@ -20,8 +20,6 @@ namespace llvm {
class HexagonSelectionDAGInfo : public TargetSelectionDAGInfo {
public:
- explicit HexagonSelectionDAGInfo(const DataLayout &DL);
- ~HexagonSelectionDAGInfo();
SDValue EmitTargetCodeForMemcpy(SelectionDAG &DAG, SDLoc dl,
SDValue Chain,
diff --git a/lib/Target/Hexagon/HexagonSubtarget.cpp b/lib/Target/Hexagon/HexagonSubtarget.cpp
index fe6c4f4298b5..cd482b3e3af1 100644
--- a/lib/Target/Hexagon/HexagonSubtarget.cpp
+++ b/lib/Target/Hexagon/HexagonSubtarget.cpp
@@ -74,7 +74,7 @@ HexagonSubtarget::HexagonSubtarget(const Triple &TT, StringRef CPU,
StringRef FS, const TargetMachine &TM)
: HexagonGenSubtargetInfo(TT, CPU, FS), CPUString(CPU),
InstrInfo(initializeSubtargetDependencies(CPU, FS)), TLInfo(TM, *this),
- TSInfo(*TM.getDataLayout()), FrameLowering() {
+ FrameLowering() {
// Initialize scheduling itinerary for the specified CPU.
InstrItins = getInstrItineraryForCPU(CPUString);
diff --git a/lib/Target/Hexagon/HexagonTargetMachine.cpp b/lib/Target/Hexagon/HexagonTargetMachine.cpp
index a173a8087832..b50442969a29 100644
--- a/lib/Target/Hexagon/HexagonTargetMachine.cpp
+++ b/lib/Target/Hexagon/HexagonTargetMachine.cpp
@@ -37,6 +37,18 @@ static cl::opt<bool> EnableExpandCondsets("hexagon-expand-condsets",
cl::init(true), cl::Hidden, cl::ZeroOrMore,
cl::desc("Early expansion of MUX"));
+static cl::opt<bool> EnableGenInsert("hexagon-insert", cl::init(true),
+ cl::Hidden, cl::desc("Generate \"insert\" instructions"));
+
+static cl::opt<bool> EnableCommGEP("hexagon-commgep", cl::init(true),
+ cl::Hidden, cl::ZeroOrMore, cl::desc("Enable commoning of GEP instructions"));
+
+static cl::opt<bool> EnableGenExtract("hexagon-extract", cl::init(true),
+ cl::Hidden, cl::desc("Generate \"extract\" instructions"));
+
+static cl::opt<bool> EnableGenPred("hexagon-gen-pred", cl::init(true),
+ cl::Hidden, cl::desc("Enable conversion of arithmetic operations to "
+ "predicate instructions"));
/// HexagonTargetMachineModule - Note that this is used on hosts that
/// cannot link in a library unless there are references into the
@@ -60,23 +72,23 @@ SchedCustomRegistry("hexagon", "Run Hexagon's custom scheduler",
createVLIWMachineSched);
namespace llvm {
- FunctionPass *createHexagonExpandCondsets();
- FunctionPass *createHexagonISelDag(HexagonTargetMachine &TM,
- CodeGenOpt::Level OptLevel);
- FunctionPass *createHexagonDelaySlotFillerPass(const TargetMachine &TM);
- FunctionPass *createHexagonFPMoverPass(const TargetMachine &TM);
- FunctionPass *createHexagonRemoveExtendArgs(const HexagonTargetMachine &TM);
FunctionPass *createHexagonCFGOptimizer();
-
- FunctionPass *createHexagonSplitConst32AndConst64();
+ FunctionPass *createHexagonCommonGEP();
+ FunctionPass *createHexagonCopyToCombine();
+ FunctionPass *createHexagonExpandCondsets();
FunctionPass *createHexagonExpandPredSpillCode();
- FunctionPass *createHexagonHardwareLoops();
- FunctionPass *createHexagonPeephole();
FunctionPass *createHexagonFixupHwLoops();
+ FunctionPass *createHexagonGenExtract();
+ FunctionPass *createHexagonGenInsert();
+ FunctionPass *createHexagonGenPredicate();
+ FunctionPass *createHexagonHardwareLoops();
+ FunctionPass *createHexagonISelDag(HexagonTargetMachine &TM,
+ CodeGenOpt::Level OptLevel);
FunctionPass *createHexagonNewValueJump();
- FunctionPass *createHexagonCopyToCombine();
FunctionPass *createHexagonPacketizer();
- FunctionPass *createHexagonNewValueJump();
+ FunctionPass *createHexagonPeephole();
+ FunctionPass *createHexagonRemoveExtendArgs(const HexagonTargetMachine &TM);
+ FunctionPass *createHexagonSplitConst32AndConst64();
} // end namespace llvm;
/// HexagonTargetMachine ctor - Create an ILP32 architecture model.
@@ -122,6 +134,7 @@ public:
return createVLIWMachineSched(C);
}
+ void addIRPasses() override;
bool addInstSelector() override;
void addPreRegAlloc() override;
void addPostRegAlloc() override;
@@ -134,6 +147,20 @@ TargetPassConfig *HexagonTargetMachine::createPassConfig(PassManagerBase &PM) {
return new HexagonPassConfig(this, PM);
}
+void HexagonPassConfig::addIRPasses() {
+ TargetPassConfig::addIRPasses();
+ bool NoOpt = (getOptLevel() == CodeGenOpt::None);
+
+ addPass(createAtomicExpandPass(TM));
+ if (!NoOpt) {
+ if (EnableCommGEP)
+ addPass(createHexagonCommonGEP());
+ // Replace certain combinations of shifts and ands with extracts.
+ if (EnableGenExtract)
+ addPass(createHexagonGenExtract());
+ }
+}
+
bool HexagonPassConfig::addInstSelector() {
HexagonTargetMachine &TM = getHexagonTargetMachine();
bool NoOpt = (getOptLevel() == CodeGenOpt::None);
@@ -144,8 +171,13 @@ bool HexagonPassConfig::addInstSelector() {
addPass(createHexagonISelDag(TM, getOptLevel()));
if (!NoOpt) {
+ // Create logical operations on predicate registers.
+ if (EnableGenPred)
+ addPass(createHexagonGenPredicate(), false);
addPass(createHexagonPeephole());
printAndVerify("After hexagon peephole pass");
+ if (EnableGenInsert)
+ addPass(createHexagonGenInsert(), false);
}
return false;
diff --git a/lib/Target/Hexagon/LLVMBuild.txt b/lib/Target/Hexagon/LLVMBuild.txt
index 8259055b3f41..9d288af0214a 100644
--- a/lib/Target/Hexagon/LLVMBuild.txt
+++ b/lib/Target/Hexagon/LLVMBuild.txt
@@ -39,4 +39,5 @@ required_libraries =
SelectionDAG
Support
Target
+ TransformUtils
add_to_library_groups = Hexagon
diff --git a/lib/Target/Hexagon/MCTargetDesc/HexagonMCTargetDesc.cpp b/lib/Target/Hexagon/MCTargetDesc/HexagonMCTargetDesc.cpp
index 83ce0abd835e..53305d85fd80 100644
--- a/lib/Target/Hexagon/MCTargetDesc/HexagonMCTargetDesc.cpp
+++ b/lib/Target/Hexagon/MCTargetDesc/HexagonMCTargetDesc.cpp
@@ -46,7 +46,7 @@ MCInstrInfo *llvm::createHexagonMCInstrInfo() {
return X;
}
-static MCRegisterInfo *createHexagonMCRegisterInfo(StringRef TT) {
+static MCRegisterInfo *createHexagonMCRegisterInfo(const Triple &TT) {
MCRegisterInfo *X = new MCRegisterInfo();
InitHexagonMCRegisterInfo(X, Hexagon::R0);
return X;
@@ -54,9 +54,7 @@ static MCRegisterInfo *createHexagonMCRegisterInfo(StringRef TT) {
static MCSubtargetInfo *
createHexagonMCSubtargetInfo(const Triple &TT, StringRef CPU, StringRef FS) {
- MCSubtargetInfo *X = new MCSubtargetInfo();
- InitHexagonMCSubtargetInfo(X, TT, CPU, FS);
- return X;
+ return createHexagonMCSubtargetInfoImpl(TT, CPU, FS);
}
namespace {
@@ -151,7 +149,8 @@ static MCAsmInfo *createHexagonMCAsmInfo(const MCRegisterInfo &MRI,
return MAI;
}
-static MCCodeGenInfo *createHexagonMCCodeGenInfo(StringRef TT, Reloc::Model RM,
+static MCCodeGenInfo *createHexagonMCCodeGenInfo(const Triple &TT,
+ Reloc::Model RM,
CodeModel::Model CM,
CodeGenOpt::Level OL) {
MCCodeGenInfo *X = new MCCodeGenInfo();
diff --git a/lib/Target/MSP430/CMakeLists.txt b/lib/Target/MSP430/CMakeLists.txt
index a8f9b52746ad..3f377631c016 100644
--- a/lib/Target/MSP430/CMakeLists.txt
+++ b/lib/Target/MSP430/CMakeLists.txt
@@ -18,7 +18,6 @@ add_llvm_target(MSP430CodeGen
MSP430RegisterInfo.cpp
MSP430Subtarget.cpp
MSP430TargetMachine.cpp
- MSP430SelectionDAGInfo.cpp
MSP430AsmPrinter.cpp
MSP430MCInstLower.cpp
)
diff --git a/lib/Target/MSP430/MCTargetDesc/MSP430MCTargetDesc.cpp b/lib/Target/MSP430/MCTargetDesc/MSP430MCTargetDesc.cpp
index be445c56389a..807d1129b5fc 100644
--- a/lib/Target/MSP430/MCTargetDesc/MSP430MCTargetDesc.cpp
+++ b/lib/Target/MSP430/MCTargetDesc/MSP430MCTargetDesc.cpp
@@ -37,7 +37,7 @@ static MCInstrInfo *createMSP430MCInstrInfo() {
return X;
}
-static MCRegisterInfo *createMSP430MCRegisterInfo(StringRef TT) {
+static MCRegisterInfo *createMSP430MCRegisterInfo(const Triple &TT) {
MCRegisterInfo *X = new MCRegisterInfo();
InitMSP430MCRegisterInfo(X, MSP430::PC);
return X;
@@ -45,12 +45,11 @@ static MCRegisterInfo *createMSP430MCRegisterInfo(StringRef TT) {
static MCSubtargetInfo *
createMSP430MCSubtargetInfo(const Triple &TT, StringRef CPU, StringRef FS) {
- MCSubtargetInfo *X = new MCSubtargetInfo();
- InitMSP430MCSubtargetInfo(X, TT, CPU, FS);
- return X;
+ return createMSP430MCSubtargetInfoImpl(TT, CPU, FS);
}
-static MCCodeGenInfo *createMSP430MCCodeGenInfo(StringRef TT, Reloc::Model RM,
+static MCCodeGenInfo *createMSP430MCCodeGenInfo(const Triple &TT,
+ Reloc::Model RM,
CodeModel::Model CM,
CodeGenOpt::Level OL) {
MCCodeGenInfo *X = new MCCodeGenInfo();
diff --git a/lib/Target/MSP430/MSP430ISelDAGToDAG.cpp b/lib/Target/MSP430/MSP430ISelDAGToDAG.cpp
index 5ce5013d898c..8a01334ee2dd 100644
--- a/lib/Target/MSP430/MSP430ISelDAGToDAG.cpp
+++ b/lib/Target/MSP430/MSP430ISelDAGToDAG.cpp
@@ -254,10 +254,11 @@ bool MSP430DAGToDAGISel::SelectAddr(SDValue N,
AM.Base.Reg = CurDAG->getRegister(0, VT);
}
- Base = (AM.BaseType == MSP430ISelAddressMode::FrameIndexBase) ?
- CurDAG->getTargetFrameIndex(AM.Base.FrameIndex,
- getTargetLowering()->getPointerTy()) :
- AM.Base.Reg;
+ Base = (AM.BaseType == MSP430ISelAddressMode::FrameIndexBase)
+ ? CurDAG->getTargetFrameIndex(
+ AM.Base.FrameIndex,
+ getTargetLowering()->getPointerTy(CurDAG->getDataLayout()))
+ : AM.Base.Reg;
if (AM.GV)
Disp = CurDAG->getTargetGlobalAddress(AM.GV, SDLoc(N),
diff --git a/lib/Target/MSP430/MSP430ISelLowering.cpp b/lib/Target/MSP430/MSP430ISelLowering.cpp
index bc51741a836f..29bc8b33988a 100644
--- a/lib/Target/MSP430/MSP430ISelLowering.cpp
+++ b/lib/Target/MSP430/MSP430ISelLowering.cpp
@@ -213,7 +213,7 @@ SDValue MSP430TargetLowering::LowerOperation(SDValue Op,
/// getConstraintType - Given a constraint letter, return the type of
/// constraint it is for this target.
TargetLowering::ConstraintType
-MSP430TargetLowering::getConstraintType(const std::string &Constraint) const {
+MSP430TargetLowering::getConstraintType(StringRef Constraint) const {
if (Constraint.size() == 1) {
switch (Constraint[0]) {
case 'r':
@@ -227,8 +227,7 @@ MSP430TargetLowering::getConstraintType(const std::string &Constraint) const {
std::pair<unsigned, const TargetRegisterClass *>
MSP430TargetLowering::getRegForInlineAsmConstraint(
- const TargetRegisterInfo *TRI, const std::string &Constraint,
- MVT VT) const {
+ const TargetRegisterInfo *TRI, StringRef Constraint, MVT VT) const {
if (Constraint.size() == 1) {
// GCC Constraint Letters
switch (Constraint[0]) {
@@ -494,7 +493,7 @@ MSP430TargetLowering::LowerCCCArguments(SDValue Chain,
if (Flags.isByVal()) {
int FI = MFI->CreateFixedObject(Flags.getByValSize(),
VA.getLocMemOffset(), true);
- InVal = DAG.getFrameIndex(FI, getPointerTy());
+ InVal = DAG.getFrameIndex(FI, getPointerTy(DAG.getDataLayout()));
} else {
// Load the argument to a virtual register
unsigned ObjSize = VA.getLocVT().getSizeInBits()/8;
@@ -592,10 +591,10 @@ MSP430TargetLowering::LowerCCCCallTo(SDValue Chain, SDValue Callee,
// Get a count of how many bytes are to be pushed on the stack.
unsigned NumBytes = CCInfo.getNextStackOffset();
+ auto PtrVT = getPointerTy(DAG.getDataLayout());
- Chain = DAG.getCALLSEQ_START(Chain, DAG.getConstant(NumBytes, dl,
- getPointerTy(), true),
- dl);
+ Chain = DAG.getCALLSEQ_START(Chain,
+ DAG.getConstant(NumBytes, dl, PtrVT, true), dl);
SmallVector<std::pair<unsigned, SDValue>, 4> RegsToPass;
SmallVector<SDValue, 12> MemOpChains;
@@ -630,12 +629,11 @@ MSP430TargetLowering::LowerCCCCallTo(SDValue Chain, SDValue Callee,
assert(VA.isMemLoc());
if (!StackPtr.getNode())
- StackPtr = DAG.getCopyFromReg(Chain, dl, MSP430::SP, getPointerTy());
+ StackPtr = DAG.getCopyFromReg(Chain, dl, MSP430::SP, PtrVT);
- SDValue PtrOff = DAG.getNode(ISD::ADD, dl, getPointerTy(),
- StackPtr,
- DAG.getIntPtrConstant(VA.getLocMemOffset(),
- dl));
+ SDValue PtrOff =
+ DAG.getNode(ISD::ADD, dl, PtrVT, StackPtr,
+ DAG.getIntPtrConstant(VA.getLocMemOffset(), dl));
SDValue MemOp;
ISD::ArgFlagsTy Flags = Outs[i].Flags;
@@ -700,11 +698,8 @@ MSP430TargetLowering::LowerCCCCallTo(SDValue Chain, SDValue Callee,
InFlag = Chain.getValue(1);
// Create the CALLSEQ_END node.
- Chain = DAG.getCALLSEQ_END(Chain,
- DAG.getConstant(NumBytes, dl, getPointerTy(),
- true),
- DAG.getConstant(0, dl, getPointerTy(), true),
- InFlag, dl);
+ Chain = DAG.getCALLSEQ_END(Chain, DAG.getConstant(NumBytes, dl, PtrVT, true),
+ DAG.getConstant(0, dl, PtrVT, true), InFlag, dl);
InFlag = Chain.getValue(1);
// Handle result values, copying them out of physregs into vregs that we
@@ -788,30 +783,31 @@ SDValue MSP430TargetLowering::LowerGlobalAddress(SDValue Op,
SelectionDAG &DAG) const {
const GlobalValue *GV = cast<GlobalAddressSDNode>(Op)->getGlobal();
int64_t Offset = cast<GlobalAddressSDNode>(Op)->getOffset();
+ auto PtrVT = getPointerTy(DAG.getDataLayout());
// Create the TargetGlobalAddress node, folding in the constant offset.
- SDValue Result = DAG.getTargetGlobalAddress(GV, SDLoc(Op),
- getPointerTy(), Offset);
- return DAG.getNode(MSP430ISD::Wrapper, SDLoc(Op),
- getPointerTy(), Result);
+ SDValue Result = DAG.getTargetGlobalAddress(GV, SDLoc(Op), PtrVT, Offset);
+ return DAG.getNode(MSP430ISD::Wrapper, SDLoc(Op), PtrVT, Result);
}
SDValue MSP430TargetLowering::LowerExternalSymbol(SDValue Op,
SelectionDAG &DAG) const {
SDLoc dl(Op);
const char *Sym = cast<ExternalSymbolSDNode>(Op)->getSymbol();
- SDValue Result = DAG.getTargetExternalSymbol(Sym, getPointerTy());
+ auto PtrVT = getPointerTy(DAG.getDataLayout());
+ SDValue Result = DAG.getTargetExternalSymbol(Sym, PtrVT);
- return DAG.getNode(MSP430ISD::Wrapper, dl, getPointerTy(), Result);
+ return DAG.getNode(MSP430ISD::Wrapper, dl, PtrVT, Result);
}
SDValue MSP430TargetLowering::LowerBlockAddress(SDValue Op,
SelectionDAG &DAG) const {
SDLoc dl(Op);
+ auto PtrVT = getPointerTy(DAG.getDataLayout());
const BlockAddress *BA = cast<BlockAddressSDNode>(Op)->getBlockAddress();
- SDValue Result = DAG.getTargetBlockAddress(BA, getPointerTy());
+ SDValue Result = DAG.getTargetBlockAddress(BA, PtrVT);
- return DAG.getNode(MSP430ISD::Wrapper, dl, getPointerTy(), Result);
+ return DAG.getNode(MSP430ISD::Wrapper, dl, PtrVT, Result);
}
static SDValue EmitCMP(SDValue &LHS, SDValue &RHS, SDValue &TargetCC,
@@ -1024,16 +1020,17 @@ MSP430TargetLowering::getReturnAddressFrameIndex(SelectionDAG &DAG) const {
MachineFunction &MF = DAG.getMachineFunction();
MSP430MachineFunctionInfo *FuncInfo = MF.getInfo<MSP430MachineFunctionInfo>();
int ReturnAddrIndex = FuncInfo->getRAIndex();
+ auto PtrVT = getPointerTy(MF.getDataLayout());
if (ReturnAddrIndex == 0) {
// Set up a frame object for the return address.
- uint64_t SlotSize = getDataLayout()->getPointerSize();
+ uint64_t SlotSize = MF.getDataLayout().getPointerSize();
ReturnAddrIndex = MF.getFrameInfo()->CreateFixedObject(SlotSize, -SlotSize,
true);
FuncInfo->setRAIndex(ReturnAddrIndex);
}
- return DAG.getFrameIndex(ReturnAddrIndex, getPointerTy());
+ return DAG.getFrameIndex(ReturnAddrIndex, PtrVT);
}
SDValue MSP430TargetLowering::LowerRETURNADDR(SDValue Op,
@@ -1046,21 +1043,21 @@ SDValue MSP430TargetLowering::LowerRETURNADDR(SDValue Op,
unsigned Depth = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
SDLoc dl(Op);
+ auto PtrVT = getPointerTy(DAG.getDataLayout());
if (Depth > 0) {
SDValue FrameAddr = LowerFRAMEADDR(Op, DAG);
SDValue Offset =
- DAG.getConstant(getDataLayout()->getPointerSize(), dl, MVT::i16);
- return DAG.getLoad(getPointerTy(), dl, DAG.getEntryNode(),
- DAG.getNode(ISD::ADD, dl, getPointerTy(),
- FrameAddr, Offset),
+ DAG.getConstant(DAG.getDataLayout().getPointerSize(), dl, MVT::i16);
+ return DAG.getLoad(PtrVT, dl, DAG.getEntryNode(),
+ DAG.getNode(ISD::ADD, dl, PtrVT, FrameAddr, Offset),
MachinePointerInfo(), false, false, false, 0);
}
// Just load the return address.
SDValue RetAddrFI = getReturnAddressFrameIndex(DAG);
- return DAG.getLoad(getPointerTy(), dl, DAG.getEntryNode(),
- RetAddrFI, MachinePointerInfo(), false, false, false, 0);
+ return DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), RetAddrFI,
+ MachinePointerInfo(), false, false, false, 0);
}
SDValue MSP430TargetLowering::LowerFRAMEADDR(SDValue Op,
@@ -1084,10 +1081,11 @@ SDValue MSP430TargetLowering::LowerVASTART(SDValue Op,
SelectionDAG &DAG) const {
MachineFunction &MF = DAG.getMachineFunction();
MSP430MachineFunctionInfo *FuncInfo = MF.getInfo<MSP430MachineFunctionInfo>();
+ auto PtrVT = getPointerTy(DAG.getDataLayout());
// Frame index of first vararg argument
- SDValue FrameIndex = DAG.getFrameIndex(FuncInfo->getVarArgsFrameIndex(),
- getPointerTy());
+ SDValue FrameIndex =
+ DAG.getFrameIndex(FuncInfo->getVarArgsFrameIndex(), PtrVT);
const Value *SV = cast<SrcValueSDNode>(Op.getOperand(2))->getValue();
// Create a store of the frame index to the location operand
@@ -1099,9 +1097,9 @@ SDValue MSP430TargetLowering::LowerVASTART(SDValue Op,
SDValue MSP430TargetLowering::LowerJumpTable(SDValue Op,
SelectionDAG &DAG) const {
JumpTableSDNode *JT = cast<JumpTableSDNode>(Op);
- SDValue Result = DAG.getTargetJumpTable(JT->getIndex(), getPointerTy());
- return DAG.getNode(MSP430ISD::Wrapper, SDLoc(JT),
- getPointerTy(), Result);
+ auto PtrVT = getPointerTy(DAG.getDataLayout());
+ SDValue Result = DAG.getTargetJumpTable(JT->getIndex(), PtrVT);
+ return DAG.getNode(MSP430ISD::Wrapper, SDLoc(JT), PtrVT, Result);
}
/// getPostIndexedAddressParts - returns true by value, base pointer and
diff --git a/lib/Target/MSP430/MSP430ISelLowering.h b/lib/Target/MSP430/MSP430ISelLowering.h
index 80d3ae175fb1..2d63852c185b 100644
--- a/lib/Target/MSP430/MSP430ISelLowering.h
+++ b/lib/Target/MSP430/MSP430ISelLowering.h
@@ -72,7 +72,9 @@ namespace llvm {
explicit MSP430TargetLowering(const TargetMachine &TM,
const MSP430Subtarget &STI);
- MVT getScalarShiftAmountTy(EVT LHSTy) const override { return MVT::i8; }
+ MVT getScalarShiftAmountTy(const DataLayout &, EVT) const override {
+ return MVT::i8;
+ }
/// LowerOperation - Provide custom lowering hooks for some operations.
SDValue LowerOperation(SDValue Op, SelectionDAG &DAG) const override;
@@ -96,11 +98,10 @@ namespace llvm {
SDValue getReturnAddressFrameIndex(SelectionDAG &DAG) const;
TargetLowering::ConstraintType
- getConstraintType(const std::string &Constraint) const override;
+ getConstraintType(StringRef Constraint) const override;
std::pair<unsigned, const TargetRegisterClass *>
getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI,
- const std::string &Constraint,
- MVT VT) const override;
+ StringRef Constraint, MVT VT) const override;
/// isTruncateFree - Return true if it's free to truncate a value of type
/// Ty1 to type Ty2. e.g. On msp430 it's free to truncate a i16 value in
diff --git a/lib/Target/MSP430/MSP430RegisterInfo.cpp b/lib/Target/MSP430/MSP430RegisterInfo.cpp
index 614467bcd248..2fb82e535e8d 100644
--- a/lib/Target/MSP430/MSP430RegisterInfo.cpp
+++ b/lib/Target/MSP430/MSP430RegisterInfo.cpp
@@ -37,7 +37,7 @@ MSP430RegisterInfo::MSP430RegisterInfo()
const MCPhysReg*
MSP430RegisterInfo::getCalleeSavedRegs(const MachineFunction *MF) const {
- const TargetFrameLowering *TFI = MF->getSubtarget().getFrameLowering();
+ const MSP430FrameLowering *TFI = getFrameLowering(*MF);
const Function* F = MF->getFunction();
static const MCPhysReg CalleeSavedRegs[] = {
MSP430::FP, MSP430::R5, MSP430::R6, MSP430::R7,
@@ -73,7 +73,7 @@ MSP430RegisterInfo::getCalleeSavedRegs(const MachineFunction *MF) const {
BitVector MSP430RegisterInfo::getReservedRegs(const MachineFunction &MF) const {
BitVector Reserved(getNumRegs());
- const TargetFrameLowering *TFI = MF.getSubtarget().getFrameLowering();
+ const MSP430FrameLowering *TFI = getFrameLowering(MF);
// Mark 4 special registers with subregisters as reserved.
Reserved.set(MSP430::PCB);
@@ -109,7 +109,7 @@ MSP430RegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator II,
MachineInstr &MI = *II;
MachineBasicBlock &MBB = *MI.getParent();
MachineFunction &MF = *MBB.getParent();
- const TargetFrameLowering *TFI = MF.getSubtarget().getFrameLowering();
+ const MSP430FrameLowering *TFI = getFrameLowering(MF);
DebugLoc dl = MI.getDebugLoc();
int FrameIndex = MI.getOperand(FIOperandNum).getIndex();
@@ -156,7 +156,6 @@ MSP430RegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator II,
}
unsigned MSP430RegisterInfo::getFrameRegister(const MachineFunction &MF) const {
- const TargetFrameLowering *TFI = MF.getSubtarget().getFrameLowering();
-
+ const MSP430FrameLowering *TFI = getFrameLowering(MF);
return TFI->hasFP(MF) ? MSP430::FP : MSP430::SP;
}
diff --git a/lib/Target/MSP430/MSP430SelectionDAGInfo.cpp b/lib/Target/MSP430/MSP430SelectionDAGInfo.cpp
deleted file mode 100644
index 3897ef684d4d..000000000000
--- a/lib/Target/MSP430/MSP430SelectionDAGInfo.cpp
+++ /dev/null
@@ -1,23 +0,0 @@
-//===-- MSP430SelectionDAGInfo.cpp - MSP430 SelectionDAG Info -------------===//
-//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
-//
-//===----------------------------------------------------------------------===//
-//
-// This file implements the MSP430SelectionDAGInfo class.
-//
-//===----------------------------------------------------------------------===//
-
-#include "MSP430TargetMachine.h"
-using namespace llvm;
-
-#define DEBUG_TYPE "msp430-selectiondag-info"
-
-MSP430SelectionDAGInfo::MSP430SelectionDAGInfo(const DataLayout &DL)
- : TargetSelectionDAGInfo(&DL) {}
-
-MSP430SelectionDAGInfo::~MSP430SelectionDAGInfo() {
-}
diff --git a/lib/Target/MSP430/MSP430SelectionDAGInfo.h b/lib/Target/MSP430/MSP430SelectionDAGInfo.h
deleted file mode 100644
index 61a6b19111db..000000000000
--- a/lib/Target/MSP430/MSP430SelectionDAGInfo.h
+++ /dev/null
@@ -1,31 +0,0 @@
-//===-- MSP430SelectionDAGInfo.h - MSP430 SelectionDAG Info -----*- C++ -*-===//
-//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
-//
-//===----------------------------------------------------------------------===//
-//
-// This file defines the MSP430 subclass for TargetSelectionDAGInfo.
-//
-//===----------------------------------------------------------------------===//
-
-#ifndef LLVM_LIB_TARGET_MSP430_MSP430SELECTIONDAGINFO_H
-#define LLVM_LIB_TARGET_MSP430_MSP430SELECTIONDAGINFO_H
-
-#include "llvm/Target/TargetSelectionDAGInfo.h"
-
-namespace llvm {
-
-class MSP430TargetMachine;
-
-class MSP430SelectionDAGInfo : public TargetSelectionDAGInfo {
-public:
- explicit MSP430SelectionDAGInfo(const DataLayout &DL);
- ~MSP430SelectionDAGInfo();
-};
-
-}
-
-#endif
diff --git a/lib/Target/MSP430/MSP430Subtarget.cpp b/lib/Target/MSP430/MSP430Subtarget.cpp
index 6374f41c00ea..6216348e4d71 100644
--- a/lib/Target/MSP430/MSP430Subtarget.cpp
+++ b/lib/Target/MSP430/MSP430Subtarget.cpp
@@ -34,5 +34,4 @@ MSP430Subtarget::initializeSubtargetDependencies(StringRef CPU, StringRef FS) {
MSP430Subtarget::MSP430Subtarget(const Triple &TT, const std::string &CPU,
const std::string &FS, const TargetMachine &TM)
: MSP430GenSubtargetInfo(TT, CPU, FS), FrameLowering(),
- InstrInfo(initializeSubtargetDependencies(CPU, FS)), TLInfo(TM, *this),
- TSInfo(*TM.getDataLayout()) {}
+ InstrInfo(initializeSubtargetDependencies(CPU, FS)), TLInfo(TM, *this) {}
diff --git a/lib/Target/MSP430/MSP430Subtarget.h b/lib/Target/MSP430/MSP430Subtarget.h
index 81f6f027d45c..ff2656d26dd2 100644
--- a/lib/Target/MSP430/MSP430Subtarget.h
+++ b/lib/Target/MSP430/MSP430Subtarget.h
@@ -18,8 +18,8 @@
#include "MSP430ISelLowering.h"
#include "MSP430InstrInfo.h"
#include "MSP430RegisterInfo.h"
-#include "MSP430SelectionDAGInfo.h"
#include "llvm/IR/DataLayout.h"
+#include "llvm/Target/TargetSelectionDAGInfo.h"
#include "llvm/Target/TargetSubtargetInfo.h"
#include <string>
@@ -35,7 +35,7 @@ class MSP430Subtarget : public MSP430GenSubtargetInfo {
MSP430FrameLowering FrameLowering;
MSP430InstrInfo InstrInfo;
MSP430TargetLowering TLInfo;
- MSP430SelectionDAGInfo TSInfo;
+ TargetSelectionDAGInfo TSInfo;
public:
/// This constructor initializes the data members to match that
@@ -60,7 +60,7 @@ public:
const MSP430TargetLowering *getTargetLowering() const override {
return &TLInfo;
}
- const MSP430SelectionDAGInfo *getSelectionDAGInfo() const override {
+ const TargetSelectionDAGInfo *getSelectionDAGInfo() const override {
return &TSInfo;
}
};
diff --git a/lib/Target/Mips/AsmParser/MipsAsmParser.cpp b/lib/Target/Mips/AsmParser/MipsAsmParser.cpp
index f14156dbfa2b..5107d2ae58c3 100644
--- a/lib/Target/Mips/AsmParser/MipsAsmParser.cpp
+++ b/lib/Target/Mips/AsmParser/MipsAsmParser.cpp
@@ -1727,37 +1727,59 @@ bool MipsAsmParser::expandInstruction(MCInst &Inst, SMLoc IDLoc,
}
namespace {
-template <unsigned ShiftAmount>
+void emitRX(unsigned Opcode, unsigned DstReg, MCOperand Imm, SMLoc IDLoc,
+ SmallVectorImpl<MCInst> &Instructions) {
+ MCInst tmpInst;
+ tmpInst.setOpcode(Opcode);
+ tmpInst.addOperand(MCOperand::createReg(DstReg));
+ tmpInst.addOperand(Imm);
+ tmpInst.setLoc(IDLoc);
+ Instructions.push_back(tmpInst);
+}
+
+void emitRI(unsigned Opcode, unsigned DstReg, int16_t Imm, SMLoc IDLoc,
+ SmallVectorImpl<MCInst> &Instructions) {
+ emitRX(Opcode, DstReg, MCOperand::createImm(Imm), IDLoc, Instructions);
+}
+
+
+void emitRRX(unsigned Opcode, unsigned DstReg, unsigned SrcReg, MCOperand Imm,
+ SMLoc IDLoc, SmallVectorImpl<MCInst> &Instructions) {
+ MCInst tmpInst;
+ tmpInst.setOpcode(Opcode);
+ tmpInst.addOperand(MCOperand::createReg(DstReg));
+ tmpInst.addOperand(MCOperand::createReg(SrcReg));
+ tmpInst.addOperand(Imm);
+ tmpInst.setLoc(IDLoc);
+ Instructions.push_back(tmpInst);
+}
+
+void emitRRR(unsigned Opcode, unsigned DstReg, unsigned SrcReg,
+ unsigned SrcReg2, SMLoc IDLoc,
+ SmallVectorImpl<MCInst> &Instructions) {
+ emitRRX(Opcode, DstReg, SrcReg, MCOperand::createReg(SrcReg2), IDLoc,
+ Instructions);
+}
+
+void emitRRI(unsigned Opcode, unsigned DstReg, unsigned SrcReg, int16_t Imm,
+ SMLoc IDLoc, SmallVectorImpl<MCInst> &Instructions) {
+ emitRRX(Opcode, DstReg, SrcReg, MCOperand::createImm(Imm), IDLoc,
+ Instructions);
+}
+
+template <int16_t ShiftAmount>
void createLShiftOri(MCOperand Operand, unsigned RegNo, SMLoc IDLoc,
SmallVectorImpl<MCInst> &Instructions) {
- MCInst tmpInst;
- if (ShiftAmount >= 32) {
- tmpInst.setOpcode(Mips::DSLL32);
- tmpInst.addOperand(MCOperand::createReg(RegNo));
- tmpInst.addOperand(MCOperand::createReg(RegNo));
- tmpInst.addOperand(MCOperand::createImm(ShiftAmount - 32));
- tmpInst.setLoc(IDLoc);
- Instructions.push_back(tmpInst);
- tmpInst.clear();
- } else if (ShiftAmount > 0) {
- tmpInst.setOpcode(Mips::DSLL);
- tmpInst.addOperand(MCOperand::createReg(RegNo));
- tmpInst.addOperand(MCOperand::createReg(RegNo));
- tmpInst.addOperand(MCOperand::createImm(ShiftAmount));
- tmpInst.setLoc(IDLoc);
- Instructions.push_back(tmpInst);
- tmpInst.clear();
- }
+ if (ShiftAmount >= 32)
+ emitRRI(Mips::DSLL32, RegNo, RegNo, ShiftAmount - 32, IDLoc, Instructions);
+ else if (ShiftAmount > 0)
+ emitRRI(Mips::DSLL, RegNo, RegNo, ShiftAmount, IDLoc, Instructions);
+
// There's no need for an ORi if the immediate is 0.
if (Operand.isImm() && Operand.getImm() == 0)
return;
- tmpInst.setOpcode(Mips::ORi);
- tmpInst.addOperand(MCOperand::createReg(RegNo));
- tmpInst.addOperand(MCOperand::createReg(RegNo));
- tmpInst.addOperand(Operand);
- tmpInst.setLoc(IDLoc);
- Instructions.push_back(tmpInst);
+ emitRRX(Mips::ORi, RegNo, RegNo, Operand, IDLoc, Instructions);
}
template <unsigned ShiftAmount>
@@ -1818,12 +1840,22 @@ bool MipsAsmParser::loadImmediate(int64_t ImmValue, unsigned DstReg,
return true;
}
+ if (Is32BitImm) {
+ if (isInt<32>(ImmValue) || isUInt<32>(ImmValue)) {
+ // Sign extend up to 64-bit so that the predicates match the hardware
+ // behaviour. In particular, isInt<16>(0xffff8000) and similar should be
+ // true.
+ ImmValue = SignExtend64<32>(ImmValue);
+ } else {
+ Error(IDLoc, "instruction requires a 32-bit immediate");
+ return true;
+ }
+ }
+
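// Editorial sketch, not part of the patch: why the SignExtend64<32> above
// matters for the range checks that follow. Read as a plain 64-bit value,
// 0xffff8000 fails isInt<16>(); after sign extension it is -32768 and takes
// the single-ADDiu path below.
static void checkLiRangeExample() {
  int64_t Raw = 0xffff8000;
  assert(!isInt<16>(Raw) && "raw 64-bit value is outside the 16-bit range");
  assert(isInt<16>(SignExtend64<32>(Raw)) && "sign-extended value fits");
  (void)Raw;
}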
bool UseSrcReg = false;
if (SrcReg != Mips::NoRegister)
UseSrcReg = true;
- MCInst tmpInst;
-
unsigned TmpReg = DstReg;
if (UseSrcReg && (DstReg == SrcReg)) {
// At this point we need AT to perform the expansions and we exit if it is
@@ -1834,29 +1866,26 @@ bool MipsAsmParser::loadImmediate(int64_t ImmValue, unsigned DstReg,
TmpReg = ATReg;
}
- tmpInst.setLoc(IDLoc);
// FIXME: gas has a special case for values that are 000...1111, which
// becomes a li -1 and then a dsrl
- if (0 <= ImmValue && ImmValue <= 65535) {
- // For unsigned and positive signed 16-bit values (0 <= j <= 65535):
- // li d,j => ori d,$zero,j
- if (!UseSrcReg)
- SrcReg = isGP64bit() ? Mips::ZERO_64 : Mips::ZERO;
- tmpInst.setOpcode(Mips::ORi);
- tmpInst.addOperand(MCOperand::createReg(DstReg));
- tmpInst.addOperand(MCOperand::createReg(SrcReg));
- tmpInst.addOperand(MCOperand::createImm(ImmValue));
- Instructions.push_back(tmpInst);
- } else if (ImmValue < 0 && ImmValue >= -32768) {
- // For negative signed 16-bit values (-32768 <= j < 0):
+ if (isInt<16>(ImmValue)) {
// li d,j => addiu d,$zero,j
if (!UseSrcReg)
SrcReg = Mips::ZERO;
- tmpInst.setOpcode(Mips::ADDiu);
- tmpInst.addOperand(MCOperand::createReg(DstReg));
- tmpInst.addOperand(MCOperand::createReg(SrcReg));
- tmpInst.addOperand(MCOperand::createImm(ImmValue));
- Instructions.push_back(tmpInst);
+ emitRRI(Mips::ADDiu, DstReg, SrcReg, ImmValue, IDLoc, Instructions);
+ } else if (isUInt<16>(ImmValue)) {
+ // li d,j => ori d,$zero,j
+ unsigned TmpReg = DstReg;
+ if (SrcReg == DstReg) {
+ unsigned ATReg = getATReg(IDLoc);
+ if (!ATReg)
+ return true;
+ TmpReg = ATReg;
+ }
+
+ emitRRI(Mips::ORi, TmpReg, Mips::ZERO, ImmValue, IDLoc, Instructions);
+ if (UseSrcReg)
+ emitRRR(Mips::ADDu, DstReg, TmpReg, SrcReg, IDLoc, Instructions);
} else if (isInt<32>(ImmValue) || isUInt<32>(ImmValue)) {
warnIfNoMacro(IDLoc);
@@ -1869,30 +1898,16 @@ bool MipsAsmParser::loadImmediate(int64_t ImmValue, unsigned DstReg,
if (!Is32BitImm && !isInt<32>(ImmValue)) {
// For DLI, expand to an ORi instead of a LUi to avoid sign-extending the
// upper 32 bits.
- tmpInst.setOpcode(Mips::ORi);
- tmpInst.addOperand(MCOperand::createReg(TmpReg));
- tmpInst.addOperand(MCOperand::createReg(Mips::ZERO));
- tmpInst.addOperand(MCOperand::createImm(Bits31To16));
- tmpInst.setLoc(IDLoc);
- Instructions.push_back(tmpInst);
- // Move the value to the upper 16 bits by doing a 16-bit left shift.
- createLShiftOri<16>(0, TmpReg, IDLoc, Instructions);
- } else {
- tmpInst.setOpcode(Mips::LUi);
- tmpInst.addOperand(MCOperand::createReg(TmpReg));
- tmpInst.addOperand(MCOperand::createImm(Bits31To16));
- Instructions.push_back(tmpInst);
- }
+ emitRRI(Mips::ORi, TmpReg, Mips::ZERO, Bits31To16, IDLoc, Instructions);
+ emitRRI(Mips::DSLL, TmpReg, TmpReg, 16, IDLoc, Instructions);
+ } else
+ emitRI(Mips::LUi, TmpReg, Bits31To16, IDLoc, Instructions);
createLShiftOri<0>(Bits15To0, TmpReg, IDLoc, Instructions);
if (UseSrcReg)
createAddu(DstReg, TmpReg, SrcReg, !Is32BitImm, Instructions);
} else if ((ImmValue & (0xffffLL << 48)) == 0) {
- if (Is32BitImm) {
- Error(IDLoc, "instruction requires a 32-bit immediate");
- return true;
- }
warnIfNoMacro(IDLoc);
// <------- lo32 ------>
@@ -1912,10 +1927,7 @@ bool MipsAsmParser::loadImmediate(int64_t ImmValue, unsigned DstReg,
uint16_t Bits31To16 = (ImmValue >> 16) & 0xffff;
uint16_t Bits15To0 = ImmValue & 0xffff;
- tmpInst.setOpcode(Mips::LUi);
- tmpInst.addOperand(MCOperand::createReg(TmpReg));
- tmpInst.addOperand(MCOperand::createImm(Bits47To32));
- Instructions.push_back(tmpInst);
+ emitRI(Mips::LUi, TmpReg, Bits47To32, IDLoc, Instructions);
createLShiftOri<0>(Bits31To16, TmpReg, IDLoc, Instructions);
createLShiftOri<16>(Bits15To0, TmpReg, IDLoc, Instructions);
@@ -1923,10 +1935,6 @@ bool MipsAsmParser::loadImmediate(int64_t ImmValue, unsigned DstReg,
createAddu(DstReg, TmpReg, SrcReg, !Is32BitImm, Instructions);
} else {
- if (Is32BitImm) {
- Error(IDLoc, "instruction requires a 32-bit immediate");
- return true;
- }
warnIfNoMacro(IDLoc);
// <------- hi32 ------> <------- lo32 ------>
@@ -1948,10 +1956,7 @@ bool MipsAsmParser::loadImmediate(int64_t ImmValue, unsigned DstReg,
uint16_t Bits31To16 = (ImmValue >> 16) & 0xffff;
uint16_t Bits15To0 = ImmValue & 0xffff;
- tmpInst.setOpcode(Mips::LUi);
- tmpInst.addOperand(MCOperand::createReg(TmpReg));
- tmpInst.addOperand(MCOperand::createImm(Bits63To48));
- Instructions.push_back(tmpInst);
+ emitRI(Mips::LUi, TmpReg, Bits63To48, IDLoc, Instructions);
createLShiftOri<0>(Bits47To32, TmpReg, IDLoc, Instructions);
// When Bits31To16 is 0, do a left shift of 32 bits instead of doing
@@ -2096,8 +2101,8 @@ bool MipsAsmParser::loadAndAddSymbolAddress(
tmpInst.addOperand(MCOperand::createExpr(HiExpr));
Instructions.push_back(tmpInst);
- createLShiftOri<0>(MCOperand::createExpr(LoExpr), TmpReg, SMLoc(),
- Instructions);
+ emitRRX(Mips::ADDiu, TmpReg, TmpReg, MCOperand::createExpr(LoExpr), SMLoc(),
+ Instructions);
}
if (UseSrcReg)
@@ -2708,12 +2713,8 @@ void MipsAsmParser::createNop(bool hasShortDelaySlot, SMLoc IDLoc,
void MipsAsmParser::createAddu(unsigned DstReg, unsigned SrcReg,
unsigned TrgReg, bool Is64Bit,
SmallVectorImpl<MCInst> &Instructions) {
- MCInst AdduInst;
- AdduInst.setOpcode(Is64Bit ? Mips::DADDu : Mips::ADDu);
- AdduInst.addOperand(MCOperand::createReg(DstReg));
- AdduInst.addOperand(MCOperand::createReg(SrcReg));
- AdduInst.addOperand(MCOperand::createReg(TrgReg));
- Instructions.push_back(AdduInst);
+ emitRRR(Is64Bit ? Mips::DADDu : Mips::ADDu, DstReg, SrcReg, TrgReg, SMLoc(),
+ Instructions);
}
unsigned MipsAsmParser::checkTargetMatchPredicate(MCInst &Inst) {
diff --git a/lib/Target/Mips/CMakeLists.txt b/lib/Target/Mips/CMakeLists.txt
index 36ba8e559e0b..bde843afd3d2 100644
--- a/lib/Target/Mips/CMakeLists.txt
+++ b/lib/Target/Mips/CMakeLists.txt
@@ -46,7 +46,6 @@ add_llvm_target(MipsCodeGen
MipsSubtarget.cpp
MipsTargetMachine.cpp
MipsTargetObjectFile.cpp
- MipsSelectionDAGInfo.cpp
)
add_subdirectory(InstPrinter)
diff --git a/lib/Target/Mips/MCTargetDesc/MipsMCTargetDesc.cpp b/lib/Target/Mips/MCTargetDesc/MipsMCTargetDesc.cpp
index 9bdf8235a2b4..949ee1474f96 100644
--- a/lib/Target/Mips/MCTargetDesc/MipsMCTargetDesc.cpp
+++ b/lib/Target/Mips/MCTargetDesc/MipsMCTargetDesc.cpp
@@ -59,7 +59,7 @@ static MCInstrInfo *createMipsMCInstrInfo() {
return X;
}
-static MCRegisterInfo *createMipsMCRegisterInfo(StringRef TT) {
+static MCRegisterInfo *createMipsMCRegisterInfo(const Triple &TT) {
MCRegisterInfo *X = new MCRegisterInfo();
InitMipsMCRegisterInfo(X, Mips::RA);
return X;
@@ -68,9 +68,7 @@ static MCRegisterInfo *createMipsMCRegisterInfo(StringRef TT) {
static MCSubtargetInfo *createMipsMCSubtargetInfo(const Triple &TT,
StringRef CPU, StringRef FS) {
CPU = MIPS_MC::selectMipsCPU(TT, CPU);
- MCSubtargetInfo *X = new MCSubtargetInfo();
- InitMipsMCSubtargetInfo(X, TT, CPU, FS);
- return X;
+ return createMipsMCSubtargetInfoImpl(TT, CPU, FS);
}
static MCAsmInfo *createMipsMCAsmInfo(const MCRegisterInfo &MRI,
@@ -84,7 +82,7 @@ static MCAsmInfo *createMipsMCAsmInfo(const MCRegisterInfo &MRI,
return MAI;
}
-static MCCodeGenInfo *createMipsMCCodeGenInfo(StringRef TT, Reloc::Model RM,
+static MCCodeGenInfo *createMipsMCCodeGenInfo(const Triple &TT, Reloc::Model RM,
CodeModel::Model CM,
CodeGenOpt::Level OL) {
MCCodeGenInfo *X = new MCCodeGenInfo();
diff --git a/lib/Target/Mips/Mips16FrameLowering.cpp b/lib/Target/Mips/Mips16FrameLowering.cpp
index db2a924a99f9..46cc99c62393 100644
--- a/lib/Target/Mips/Mips16FrameLowering.cpp
+++ b/lib/Target/Mips/Mips16FrameLowering.cpp
@@ -152,18 +152,19 @@ Mips16FrameLowering::hasReservedCallFrame(const MachineFunction &MF) const {
return isInt<15>(MFI->getMaxCallFrameSize()) && !MFI->hasVarSizedObjects();
}
-void Mips16FrameLowering::
-processFunctionBeforeCalleeSavedScan(MachineFunction &MF,
- RegScavenger *RS) const {
+void Mips16FrameLowering::determineCalleeSaves(MachineFunction &MF,
+ BitVector &SavedRegs,
+ RegScavenger *RS) const {
+ TargetFrameLowering::determineCalleeSaves(MF, SavedRegs, RS);
const Mips16InstrInfo &TII =
*static_cast<const Mips16InstrInfo *>(STI.getInstrInfo());
const MipsRegisterInfo &RI = TII.getRegisterInfo();
const BitVector Reserved = RI.getReservedRegs(MF);
bool SaveS2 = Reserved[Mips::S2];
if (SaveS2)
- MF.getRegInfo().setPhysRegUsed(Mips::S2);
+ SavedRegs.set(Mips::S2);
if (hasFP(MF))
- MF.getRegInfo().setPhysRegUsed(Mips::S0);
+ SavedRegs.set(Mips::S0);
}
const MipsFrameLowering *
diff --git a/lib/Target/Mips/Mips16FrameLowering.h b/lib/Target/Mips/Mips16FrameLowering.h
index f281c927c1c4..b48ed4641ea7 100644
--- a/lib/Target/Mips/Mips16FrameLowering.h
+++ b/lib/Target/Mips/Mips16FrameLowering.h
@@ -38,8 +38,8 @@ public:
bool hasReservedCallFrame(const MachineFunction &MF) const override;
- void processFunctionBeforeCalleeSavedScan(MachineFunction &MF,
- RegScavenger *RS) const override;
+ void determineCalleeSaves(MachineFunction &MF, BitVector &SavedRegs,
+ RegScavenger *RS) const override;
};
} // End llvm namespace
diff --git a/lib/Target/Mips/Mips16ISelDAGToDAG.cpp b/lib/Target/Mips/Mips16ISelDAGToDAG.cpp
index 7b6a2a154471..bce2c1eb4485 100644
--- a/lib/Target/Mips/Mips16ISelDAGToDAG.cpp
+++ b/lib/Target/Mips/Mips16ISelDAGToDAG.cpp
@@ -120,13 +120,13 @@ void Mips16DAGToDAGISel::processFunctionAfterISel(MachineFunction &MF) {
SDValue Mips16DAGToDAGISel::getMips16SPAliasReg() {
unsigned Mips16SPAliasReg =
MF->getInfo<MipsFunctionInfo>()->getMips16SPAliasReg();
- return CurDAG->getRegister(Mips16SPAliasReg,
- getTargetLowering()->getPointerTy());
+ auto PtrVT = getTargetLowering()->getPointerTy(CurDAG->getDataLayout());
+ return CurDAG->getRegister(Mips16SPAliasReg, PtrVT);
}
void Mips16DAGToDAGISel::getMips16SPRefReg(SDNode *Parent, SDValue &AliasReg) {
- SDValue AliasFPReg = CurDAG->getRegister(Mips::S0,
- getTargetLowering()->getPointerTy());
+ auto PtrVT = getTargetLowering()->getPointerTy(CurDAG->getDataLayout());
+ SDValue AliasFPReg = CurDAG->getRegister(Mips::S0, PtrVT);
if (Parent) {
switch (Parent->getOpcode()) {
case ISD::LOAD: {
@@ -155,7 +155,7 @@ void Mips16DAGToDAGISel::getMips16SPRefReg(SDNode *Parent, SDValue &AliasReg) {
}
}
}
- AliasReg = CurDAG->getRegister(Mips::SP, getTargetLowering()->getPointerTy());
+ AliasReg = CurDAG->getRegister(Mips::SP, PtrVT);
return;
}
diff --git a/lib/Target/Mips/Mips16ISelLowering.cpp b/lib/Target/Mips/Mips16ISelLowering.cpp
index 846e3c964f44..3522cbb1f36a 100644
--- a/lib/Target/Mips/Mips16ISelLowering.cpp
+++ b/lib/Target/Mips/Mips16ISelLowering.cpp
@@ -502,7 +502,8 @@ getOpndList(SmallVectorImpl<SDValue> &Ops,
unsigned V0Reg = Mips::V0;
if (NeedMips16Helper) {
RegsToPass.push_front(std::make_pair(V0Reg, Callee));
- JumpTarget = DAG.getExternalSymbol(Mips16HelperFunction, getPointerTy());
+ JumpTarget = DAG.getExternalSymbol(Mips16HelperFunction,
+ getPointerTy(DAG.getDataLayout()));
ExternalSymbolSDNode *S = cast<ExternalSymbolSDNode>(JumpTarget);
JumpTarget = getAddrGlobal(S, CLI.DL, JumpTarget.getValueType(), DAG,
MipsII::MO_GOT, Chain,
diff --git a/lib/Target/Mips/MipsFastISel.cpp b/lib/Target/Mips/MipsFastISel.cpp
index c2651b82d285..e2f6fcc17726 100644
--- a/lib/Target/Mips/MipsFastISel.cpp
+++ b/lib/Target/Mips/MipsFastISel.cpp
@@ -267,7 +267,7 @@ unsigned MipsFastISel::emitLogicalOp(unsigned ISDOpc, MVT RetVT,
}
unsigned MipsFastISel::fastMaterializeAlloca(const AllocaInst *AI) {
- assert(TLI.getValueType(AI->getType(), true) == MVT::i32 &&
+ assert(TLI.getValueType(DL, AI->getType(), true) == MVT::i32 &&
"Alloca should always return a pointer.");
DenseMap<const AllocaInst *, int>::iterator SI =
@@ -382,7 +382,7 @@ unsigned MipsFastISel::materializeExternalCallSym(MCSymbol *Sym) {
// Materialize a constant into a register, and return the register
// number (or zero if we failed to handle it).
unsigned MipsFastISel::fastMaterializeConstant(const Constant *C) {
- EVT CEVT = TLI.getValueType(C->getType(), true);
+ EVT CEVT = TLI.getValueType(DL, C->getType(), true);
// Only handle simple types.
if (!CEVT.isSimple())
@@ -507,12 +507,13 @@ bool MipsFastISel::computeCallAddress(const Value *V, Address &Addr) {
break;
case Instruction::IntToPtr:
// Look past no-op inttoptrs if its operand is in the same BB.
- if (TLI.getValueType(U->getOperand(0)->getType()) == TLI.getPointerTy())
+ if (TLI.getValueType(DL, U->getOperand(0)->getType()) ==
+ TLI.getPointerTy(DL))
return computeCallAddress(U->getOperand(0), Addr);
break;
case Instruction::PtrToInt:
// Look past no-op ptrtoints if its operand is in the same BB.
- if (TLI.getValueType(U->getType()) == TLI.getPointerTy())
+ if (TLI.getValueType(DL, U->getType()) == TLI.getPointerTy(DL))
return computeCallAddress(U->getOperand(0), Addr);
break;
}
@@ -532,7 +533,7 @@ bool MipsFastISel::computeCallAddress(const Value *V, Address &Addr) {
}
bool MipsFastISel::isTypeLegal(Type *Ty, MVT &VT) {
- EVT evt = TLI.getValueType(Ty, true);
+ EVT evt = TLI.getValueType(DL, Ty, true);
// Only handle simple types.
if (evt == MVT::Other || !evt.isSimple())
return false;
@@ -931,8 +932,8 @@ bool MipsFastISel::selectFPExt(const Instruction *I) {
if (UnsupportedFPMode)
return false;
Value *Src = I->getOperand(0);
- EVT SrcVT = TLI.getValueType(Src->getType(), true);
- EVT DestVT = TLI.getValueType(I->getType(), true);
+ EVT SrcVT = TLI.getValueType(DL, Src->getType(), true);
+ EVT DestVT = TLI.getValueType(DL, I->getType(), true);
if (SrcVT != MVT::f32 || DestVT != MVT::f64)
return false;
@@ -998,8 +999,8 @@ bool MipsFastISel::selectFPTrunc(const Instruction *I) {
if (UnsupportedFPMode)
return false;
Value *Src = I->getOperand(0);
- EVT SrcVT = TLI.getValueType(Src->getType(), true);
- EVT DestVT = TLI.getValueType(I->getType(), true);
+ EVT SrcVT = TLI.getValueType(DL, Src->getType(), true);
+ EVT DestVT = TLI.getValueType(DL, I->getType(), true);
if (SrcVT != MVT::f64 || DestVT != MVT::f32)
return false;
@@ -1415,7 +1416,8 @@ bool MipsFastISel::selectRet(const Instruction *I) {
if (Ret->getNumOperands() > 0) {
CallingConv::ID CC = F.getCallingConv();
SmallVector<ISD::OutputArg, 4> Outs;
- GetReturnInfo(F.getReturnType(), F.getAttributes(), Outs, TLI);
+ GetReturnInfo(F.getReturnType(), F.getAttributes(), Outs, TLI, DL);
+
// Analyze operands of the call, assigning locations to each operand.
SmallVector<CCValAssign, 16> ValLocs;
MipsCCState CCInfo(CC, F.isVarArg(), *FuncInfo.MF, ValLocs,
@@ -1449,7 +1451,7 @@ bool MipsFastISel::selectRet(const Instruction *I) {
if (!MRI.getRegClass(SrcReg)->contains(DestReg))
return false;
- EVT RVEVT = TLI.getValueType(RV->getType());
+ EVT RVEVT = TLI.getValueType(DL, RV->getType());
if (!RVEVT.isSimple())
return false;
@@ -1493,8 +1495,8 @@ bool MipsFastISel::selectTrunc(const Instruction *I) {
Value *Op = I->getOperand(0);
EVT SrcVT, DestVT;
- SrcVT = TLI.getValueType(Op->getType(), true);
- DestVT = TLI.getValueType(I->getType(), true);
+ SrcVT = TLI.getValueType(DL, Op->getType(), true);
+ DestVT = TLI.getValueType(DL, I->getType(), true);
if (SrcVT != MVT::i32 && SrcVT != MVT::i16 && SrcVT != MVT::i8)
return false;
@@ -1521,8 +1523,8 @@ bool MipsFastISel::selectIntExt(const Instruction *I) {
return false;
EVT SrcEVT, DestEVT;
- SrcEVT = TLI.getValueType(SrcTy, true);
- DestEVT = TLI.getValueType(DestTy, true);
+ SrcEVT = TLI.getValueType(DL, SrcTy, true);
+ DestEVT = TLI.getValueType(DL, DestTy, true);
if (!SrcEVT.isSimple())
return false;
if (!DestEVT.isSimple())
@@ -1620,7 +1622,7 @@ unsigned MipsFastISel::emitIntExt(MVT SrcVT, unsigned SrcReg, MVT DestVT,
}
bool MipsFastISel::selectDivRem(const Instruction *I, unsigned ISDOpcode) {
- EVT DestEVT = TLI.getValueType(I->getType(), true);
+ EVT DestEVT = TLI.getValueType(DL, I->getType(), true);
if (!DestEVT.isSimple())
return false;
@@ -1685,7 +1687,7 @@ bool MipsFastISel::selectShift(const Instruction *I) {
if (!TempReg)
return false;
- MVT Op0MVT = TLI.getValueType(Op0->getType(), true).getSimpleVT();
+ MVT Op0MVT = TLI.getValueType(DL, Op0->getType(), true).getSimpleVT();
bool IsZExt = Opcode == Instruction::LShr;
if (!emitIntExt(Op0MVT, Op0Reg, MVT::i32, TempReg, IsZExt))
return false;
@@ -1803,7 +1805,7 @@ unsigned MipsFastISel::getRegEnsuringSimpleIntegerWidening(const Value *V,
unsigned VReg = getRegForValue(V);
if (VReg == 0)
return 0;
- MVT VMVT = TLI.getValueType(V->getType(), true).getSimpleVT();
+ MVT VMVT = TLI.getValueType(DL, V->getType(), true).getSimpleVT();
if ((VMVT == MVT::i8) || (VMVT == MVT::i16)) {
unsigned TempReg = createResultReg(&Mips::GPR32RegClass);
if (!emitIntExt(VMVT, VReg, MVT::i32, TempReg, IsUnsigned))
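// Illustrative sketch, not part of the patch: the pattern applied throughout
// MipsFastISel.cpp. TargetLowering no longer looks the DataLayout up itself, so
// every type query passes it in explicitly; FastISel already caches the current
// function's layout as the member `DL`. `checkSimpleType` is a hypothetical
// helper written only for this example.
static bool checkSimpleType(const TargetLowering &TLI, const DataLayout &DL,
                            Type *Ty, MVT &VT) {
  EVT Evt = TLI.getValueType(DL, Ty, /*AllowUnknown=*/true);
  if (Evt == MVT::Other || !Evt.isSimple())
    return false;
  VT = Evt.getSimpleVT();
  return true;
}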
diff --git a/lib/Target/Mips/MipsISelDAGToDAG.cpp b/lib/Target/Mips/MipsISelDAGToDAG.cpp
index 2c9868ac051d..06502397b6b8 100644
--- a/lib/Target/Mips/MipsISelDAGToDAG.cpp
+++ b/lib/Target/Mips/MipsISelDAGToDAG.cpp
@@ -59,8 +59,9 @@ bool MipsDAGToDAGISel::runOnMachineFunction(MachineFunction &MF) {
/// GOT address into a register.
SDNode *MipsDAGToDAGISel::getGlobalBaseReg() {
unsigned GlobalBaseReg = MF->getInfo<MipsFunctionInfo>()->getGlobalBaseReg();
- return CurDAG->getRegister(GlobalBaseReg,
- getTargetLowering()->getPointerTy()).getNode();
+ return CurDAG->getRegister(GlobalBaseReg, getTargetLowering()->getPointerTy(
+ CurDAG->getDataLayout()))
+ .getNode();
}
/// ComplexPattern used on MipsInstrInfo
diff --git a/lib/Target/Mips/MipsISelLowering.cpp b/lib/Target/Mips/MipsISelLowering.cpp
index 67ddcc4dacb9..fbebb9abb4cc 100644
--- a/lib/Target/Mips/MipsISelLowering.cpp
+++ b/lib/Target/Mips/MipsISelLowering.cpp
@@ -466,7 +466,8 @@ MipsTargetLowering::createFastISel(FunctionLoweringInfo &funcInfo,
return Mips::createFastISel(funcInfo, libInfo);
}
-EVT MipsTargetLowering::getSetCCResultType(LLVMContext &, EVT VT) const {
+EVT MipsTargetLowering::getSetCCResultType(const DataLayout &, LLVMContext &,
+ EVT VT) const {
if (!VT.isVector())
return MVT::i32;
return VT.changeVectorElementTypeToInteger();
@@ -1579,9 +1580,10 @@ SDValue MipsTargetLowering::lowerBR_JT(SDValue Op, SelectionDAG &DAG) const {
SDValue Table = Op.getOperand(1);
SDValue Index = Op.getOperand(2);
SDLoc DL(Op);
- EVT PTy = getPointerTy();
+ auto &TD = DAG.getDataLayout();
+ EVT PTy = getPointerTy(TD);
unsigned EntrySize =
- DAG.getMachineFunction().getJumpTableInfo()->getEntrySize(*getDataLayout());
+ DAG.getMachineFunction().getJumpTableInfo()->getEntrySize(TD);
Index = DAG.getNode(ISD::MUL, DL, PTy, Index,
DAG.getConstant(EntrySize, DL, PTy));
@@ -1647,10 +1649,10 @@ lowerSELECT_CC(SDValue Op, SelectionDAG &DAG) const
{
SDLoc DL(Op);
EVT Ty = Op.getOperand(0).getValueType();
- SDValue Cond = DAG.getNode(ISD::SETCC, DL,
- getSetCCResultType(*DAG.getContext(), Ty),
- Op.getOperand(0), Op.getOperand(1),
- Op.getOperand(4));
+ SDValue Cond =
+ DAG.getNode(ISD::SETCC, DL, getSetCCResultType(DAG.getDataLayout(),
+ *DAG.getContext(), Ty),
+ Op.getOperand(0), Op.getOperand(1), Op.getOperand(4));
return DAG.getNode(ISD::SELECT, DL, Op.getValueType(), Cond, Op.getOperand(2),
Op.getOperand(3));
@@ -1723,7 +1725,7 @@ lowerGlobalTLSAddress(SDValue Op, SelectionDAG &DAG) const
GlobalAddressSDNode *GA = cast<GlobalAddressSDNode>(Op);
SDLoc DL(GA);
const GlobalValue *GV = GA->getGlobal();
- EVT PtrVT = getPointerTy();
+ EVT PtrVT = getPointerTy(DAG.getDataLayout());
TLSModel::Model model = getTargetMachine().getTLSModel(GV);
@@ -1831,7 +1833,7 @@ SDValue MipsTargetLowering::lowerVASTART(SDValue Op, SelectionDAG &DAG) const {
SDLoc DL(Op);
SDValue FI = DAG.getFrameIndex(FuncInfo->getVarArgsFrameIndex(),
- getPointerTy());
+ getPointerTy(MF.getDataLayout()));
// vastart just stores the address of the VarArgsFrameIndex slot into the
// memory location argument.
@@ -1850,9 +1852,9 @@ SDValue MipsTargetLowering::lowerVAARG(SDValue Op, SelectionDAG &DAG) const {
SDLoc DL(Node);
unsigned ArgSlotSizeInBytes = (ABI.IsN32() || ABI.IsN64()) ? 8 : 4;
- SDValue VAListLoad = DAG.getLoad(getPointerTy(), DL, Chain, VAListPtr,
- MachinePointerInfo(SV), false, false, false,
- 0);
+ SDValue VAListLoad =
+ DAG.getLoad(getPointerTy(DAG.getDataLayout()), DL, Chain, VAListPtr,
+ MachinePointerInfo(SV), false, false, false, 0);
SDValue VAList = VAListLoad;
// Re-align the pointer if necessary.
@@ -1874,7 +1876,9 @@ SDValue MipsTargetLowering::lowerVAARG(SDValue Op, SelectionDAG &DAG) const {
}
// Increment the pointer, VAList, to the next vaarg.
- unsigned ArgSizeInBytes = getDataLayout()->getTypeAllocSize(VT.getTypeForEVT(*DAG.getContext()));
+ auto &TD = DAG.getDataLayout();
+ unsigned ArgSizeInBytes =
+ TD.getTypeAllocSize(VT.getTypeForEVT(*DAG.getContext()));
SDValue Tmp3 = DAG.getNode(ISD::ADD, DL, VAList.getValueType(), VAList,
DAG.getConstant(RoundUpToAlignment(ArgSizeInBytes,
ArgSlotSizeInBytes),
@@ -2062,7 +2066,7 @@ SDValue MipsTargetLowering::lowerEH_RETURN(SDValue Op, SelectionDAG &DAG)
Chain = DAG.getCopyToReg(Chain, DL, AddrReg, Handler, Chain.getValue(1));
return DAG.getNode(MipsISD::EH_RETURN, DL, MVT::Other, Chain,
DAG.getRegister(OffsetReg, Ty),
- DAG.getRegister(AddrReg, getPointerTy()),
+ DAG.getRegister(AddrReg, getPointerTy(MF.getDataLayout())),
Chain.getValue(1));
}
@@ -2479,15 +2483,16 @@ MipsTargetLowering::passArgOnStack(SDValue StackPtr, unsigned Offset,
SDValue Chain, SDValue Arg, SDLoc DL,
bool IsTailCall, SelectionDAG &DAG) const {
if (!IsTailCall) {
- SDValue PtrOff = DAG.getNode(ISD::ADD, DL, getPointerTy(), StackPtr,
- DAG.getIntPtrConstant(Offset, DL));
+ SDValue PtrOff =
+ DAG.getNode(ISD::ADD, DL, getPointerTy(DAG.getDataLayout()), StackPtr,
+ DAG.getIntPtrConstant(Offset, DL));
return DAG.getStore(Chain, DL, Arg, PtrOff, MachinePointerInfo(), false,
false, 0);
}
MachineFrameInfo *MFI = DAG.getMachineFunction().getFrameInfo();
int FI = MFI->CreateFixedObject(Arg.getValueSizeInBits() / 8, Offset, false);
- SDValue FIN = DAG.getFrameIndex(FI, getPointerTy());
+ SDValue FIN = DAG.getFrameIndex(FI, getPointerTy(DAG.getDataLayout()));
return DAG.getStore(Chain, DL, Arg, FIN, MachinePointerInfo(),
/*isVolatile=*/ true, false, 0);
}
@@ -2611,8 +2616,9 @@ MipsTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
if (!IsTailCall)
Chain = DAG.getCALLSEQ_START(Chain, NextStackOffsetVal, DL);
- SDValue StackPtr = DAG.getCopyFromReg(
- Chain, DL, ABI.IsN64() ? Mips::SP_64 : Mips::SP, getPointerTy());
+ SDValue StackPtr =
+ DAG.getCopyFromReg(Chain, DL, ABI.IsN64() ? Mips::SP_64 : Mips::SP,
+ getPointerTy(DAG.getDataLayout()));
// With EABI is it possible to have 16 args on registers.
std::deque< std::pair<unsigned, SDValue> > RegsToPass;
@@ -2750,7 +2756,8 @@ MipsTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
IsCallReloc = true;
}
} else
- Callee = DAG.getTargetGlobalAddress(G->getGlobal(), DL, getPointerTy(), 0,
+ Callee = DAG.getTargetGlobalAddress(G->getGlobal(), DL,
+ getPointerTy(DAG.getDataLayout()), 0,
MipsII::MO_NO_FLAG);
GlobalOrExternal = true;
}
@@ -2758,8 +2765,8 @@ MipsTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
const char *Sym = S->getSymbol();
if (!ABI.IsN64() && !IsPIC) // !N64 && static
- Callee =
- DAG.getTargetExternalSymbol(Sym, getPointerTy(), MipsII::MO_NO_FLAG);
+ Callee = DAG.getTargetExternalSymbol(
+ Sym, getPointerTy(DAG.getDataLayout()), MipsII::MO_NO_FLAG);
else if (LargeGOT) {
Callee = getAddrGlobalLargeGOT(S, DL, Ty, DAG, MipsII::MO_CALL_HI16,
MipsII::MO_CALL_LO16, Chain,
@@ -3029,7 +3036,7 @@ MipsTargetLowering::LowerFormalArguments(SDValue Chain,
VA.getLocMemOffset(), true);
// Create load nodes to retrieve arguments from the stack
- SDValue FIN = DAG.getFrameIndex(FI, getPointerTy());
+ SDValue FIN = DAG.getFrameIndex(FI, getPointerTy(DAG.getDataLayout()));
SDValue ArgValue = DAG.getLoad(LocVT, DL, Chain, FIN,
MachinePointerInfo::getFixedStack(FI),
false, false, false, 0);
@@ -3174,12 +3181,13 @@ MipsTargetLowering::LowerReturn(SDValue Chain,
if (!Reg)
llvm_unreachable("sret virtual register not created in the entry block");
- SDValue Val = DAG.getCopyFromReg(Chain, DL, Reg, getPointerTy());
+ SDValue Val =
+ DAG.getCopyFromReg(Chain, DL, Reg, getPointerTy(DAG.getDataLayout()));
unsigned V0 = ABI.IsN64() ? Mips::V0_64 : Mips::V0;
Chain = DAG.getCopyToReg(Chain, DL, V0, Val, Flag);
Flag = Chain.getValue(1);
- RetOps.push_back(DAG.getRegister(V0, getPointerTy()));
+ RetOps.push_back(DAG.getRegister(V0, getPointerTy(DAG.getDataLayout())));
}
RetOps[0] = Chain; // Update chain.
@@ -3198,9 +3206,8 @@ MipsTargetLowering::LowerReturn(SDValue Chain,
/// getConstraintType - Given a constraint letter, return the type of
/// constraint it is for this target.
-MipsTargetLowering::ConstraintType MipsTargetLowering::
-getConstraintType(const std::string &Constraint) const
-{
+MipsTargetLowering::ConstraintType
+MipsTargetLowering::getConstraintType(StringRef Constraint) const {
// Mips specific constraints
// GCC config/mips/constraints.md
//
@@ -3290,9 +3297,8 @@ MipsTargetLowering::getSingleConstraintMatchWeight(
/// into non-numeric and numeric parts (Prefix and Reg). The first boolean flag
/// that is returned indicates whether parsing was successful. The second flag
/// is true if the numeric part exists.
-static std::pair<bool, bool>
-parsePhysicalReg(StringRef C, std::string &Prefix,
- unsigned long long &Reg) {
+static std::pair<bool, bool> parsePhysicalReg(StringRef C, StringRef &Prefix,
+ unsigned long long &Reg) {
if (C.front() != '{' || C.back() != '}')
return std::make_pair(false, false);
@@ -3300,7 +3306,7 @@ parsePhysicalReg(StringRef C, std::string &Prefix,
StringRef::const_iterator I, B = C.begin() + 1, E = C.end() - 1;
I = std::find_if(B, E, std::ptr_fun(isdigit));
- Prefix.assign(B, I - B);
+ Prefix = StringRef(B, I - B);
// The second flag is set to false if no numeric characters were found.
if (I == E)
@@ -3316,7 +3322,7 @@ parseRegForInlineAsmConstraint(StringRef C, MVT VT) const {
const TargetRegisterInfo *TRI =
Subtarget.getRegisterInfo();
const TargetRegisterClass *RC;
- std::string Prefix;
+ StringRef Prefix;
unsigned long long Reg;
std::pair<bool, bool> R = parsePhysicalReg(C, Prefix, Reg);
@@ -3332,7 +3338,7 @@ parseRegForInlineAsmConstraint(StringRef C, MVT VT) const {
RC = TRI->getRegClass(Prefix == "hi" ?
Mips::HI32RegClassID : Mips::LO32RegClassID);
return std::make_pair(*(RC->begin()), RC);
- } else if (Prefix.compare(0, 4, "$msa") == 0) {
+ } else if (Prefix.startswith("$msa")) {
// Parse $msa(ir|csr|access|save|modify|request|map|unmap)
// No numeric characters follow the name.
@@ -3390,7 +3396,7 @@ parseRegForInlineAsmConstraint(StringRef C, MVT VT) const {
/// pointer.
std::pair<unsigned, const TargetRegisterClass *>
MipsTargetLowering::getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI,
- const std::string &Constraint,
+ StringRef Constraint,
MVT VT) const {
if (Constraint.size() == 1) {
switch (Constraint[0]) {
@@ -3546,8 +3552,8 @@ void MipsTargetLowering::LowerAsmOperandForConstraint(SDValue Op,
TargetLowering::LowerAsmOperandForConstraint(Op, Constraint, Ops, DAG);
}
-bool MipsTargetLowering::isLegalAddressingMode(const AddrMode &AM,
- Type *Ty,
+bool MipsTargetLowering::isLegalAddressingMode(const DataLayout &DL,
+ const AddrMode &AM, Type *Ty,
unsigned AS) const {
// No global is ever allowed as a base.
if (AM.BaseGV)
@@ -3625,7 +3631,7 @@ void MipsTargetLowering::copyByValRegs(
FrameObjOffset = VA.getLocMemOffset();
// Create frame object.
- EVT PtrTy = getPointerTy();
+ EVT PtrTy = getPointerTy(DAG.getDataLayout());
int FI = MFI->CreateFixedObject(FrameObjSize, FrameObjOffset, true);
SDValue FIN = DAG.getFrameIndex(FI, PtrTy);
InVals.push_back(FIN);
@@ -3662,7 +3668,8 @@ void MipsTargetLowering::passByValArg(
unsigned OffsetInBytes = 0; // From beginning of struct
unsigned RegSizeInBytes = Subtarget.getGPRSizeInBytes();
unsigned Alignment = std::min(Flags.getByValAlign(), RegSizeInBytes);
- EVT PtrTy = getPointerTy(), RegTy = MVT::getIntegerVT(RegSizeInBytes * 8);
+ EVT PtrTy = getPointerTy(DAG.getDataLayout()),
+ RegTy = MVT::getIntegerVT(RegSizeInBytes * 8);
unsigned NumRegs = LastReg - FirstReg;
if (NumRegs) {
@@ -3787,7 +3794,7 @@ void MipsTargetLowering::writeVarArgRegs(std::vector<SDValue> &OutChains,
unsigned Reg = addLiveIn(MF, ArgRegs[I], RC);
SDValue ArgValue = DAG.getCopyFromReg(Chain, DL, Reg, RegTy);
FI = MFI->CreateFixedObject(RegSizeInBytes, VaArgOffset, true);
- SDValue PtrOff = DAG.getFrameIndex(FI, getPointerTy());
+ SDValue PtrOff = DAG.getFrameIndex(FI, getPointerTy(DAG.getDataLayout()));
SDValue Store = DAG.getStore(Chain, DL, ArgValue, PtrOff,
MachinePointerInfo(), false, false, 0);
cast<StoreSDNode>(Store.getNode())->getMemOperand()->setValue(
@@ -3920,8 +3927,8 @@ MipsTargetLowering::emitPseudoSELECT(MachineInstr *MI, MachineBasicBlock *BB,
// FIXME? Maybe this could be a TableGen attribute on some registers and
// this table could be generated automatically from RegInfo.
-unsigned MipsTargetLowering::getRegisterByName(const char* RegName,
- EVT VT) const {
+unsigned MipsTargetLowering::getRegisterByName(const char* RegName, EVT VT,
+ SelectionDAG &DAG) const {
// Named registers is expected to be fairly rare. For now, just support $28
// since the linux kernel uses it.
if (Subtarget.isGP64bit()) {
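// Illustrative sketch, not part of the patch: in SelectionDAG lowering the
// layout is reached through the DAG itself, so pointer-typed nodes are now
// built as below (lowerExamplePtrAdd is a hypothetical helper, not Mips code):
static SDValue lowerExamplePtrAdd(SDValue Base, SDValue Off, SDLoc DL,
                                  SelectionDAG &DAG,
                                  const TargetLowering &TLI) {
  EVT PtrVT = TLI.getPointerTy(DAG.getDataLayout()); // was TLI.getPointerTy()
  return DAG.getNode(ISD::ADD, DL, PtrVT, Base, Off);
}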
diff --git a/lib/Target/Mips/MipsISelLowering.h b/lib/Target/Mips/MipsISelLowering.h
index bc9a1ce64097..6fe8f830d35d 100644
--- a/lib/Target/Mips/MipsISelLowering.h
+++ b/lib/Target/Mips/MipsISelLowering.h
@@ -227,7 +227,9 @@ namespace llvm {
FastISel *createFastISel(FunctionLoweringInfo &funcInfo,
const TargetLibraryInfo *libInfo) const override;
- MVT getScalarShiftAmountTy(EVT LHSTy) const override { return MVT::i32; }
+ MVT getScalarShiftAmountTy(const DataLayout &, EVT) const override {
+ return MVT::i32;
+ }
void LowerOperationWrapper(SDNode *N,
SmallVectorImpl<SDValue> &Results,
@@ -247,7 +249,8 @@ namespace llvm {
const char *getTargetNodeName(unsigned Opcode) const override;
/// getSetCCResultType - get the ISD::SETCC result ValueType
- EVT getSetCCResultType(LLVMContext &Context, EVT VT) const override;
+ EVT getSetCCResultType(const DataLayout &DL, LLVMContext &Context,
+ EVT VT) const override;
SDValue PerformDAGCombine(SDNode *N, DAGCombinerInfo &DCI) const override;
@@ -263,7 +266,8 @@ namespace llvm {
void HandleByVal(CCState *, unsigned &, unsigned) const override;
- unsigned getRegisterByName(const char* RegName, EVT VT) const override;
+ unsigned getRegisterByName(const char* RegName, EVT VT,
+ SelectionDAG &DAG) const override;
protected:
SDValue getGlobalReg(SelectionDAG &DAG, EVT Ty) const;
@@ -478,8 +482,7 @@ namespace llvm {
bool shouldSignExtendTypeInLibCall(EVT Type, bool IsSigned) const override;
// Inline asm support
- ConstraintType
- getConstraintType(const std::string &Constraint) const override;
+ ConstraintType getConstraintType(StringRef Constraint) const override;
/// Examine constraint string and operand type and determine a weight value.
/// The operand object must already have been set up with the operand type.
@@ -493,8 +496,7 @@ namespace llvm {
std::pair<unsigned, const TargetRegisterClass *>
getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI,
- const std::string &Constraint,
- MVT VT) const override;
+ StringRef Constraint, MVT VT) const override;
/// LowerAsmOperandForConstraint - Lower the specified operand into the Ops
/// vector. If it is invalid, don't add anything to Ops. If hasMemory is
@@ -505,8 +507,8 @@ namespace llvm {
std::vector<SDValue> &Ops,
SelectionDAG &DAG) const override;
- unsigned getInlineAsmMemConstraint(
- const std::string &ConstraintCode) const override {
+ unsigned
+ getInlineAsmMemConstraint(StringRef ConstraintCode) const override {
if (ConstraintCode == "R")
return InlineAsm::Constraint_R;
else if (ConstraintCode == "ZC")
@@ -514,8 +516,8 @@ namespace llvm {
return TargetLowering::getInlineAsmMemConstraint(ConstraintCode);
}
- bool isLegalAddressingMode(const AddrMode &AM, Type *Ty,
- unsigned AS) const override;
+ bool isLegalAddressingMode(const DataLayout &DL, const AddrMode &AM,
+ Type *Ty, unsigned AS) const override;
bool isOffsetFoldingLegal(const GlobalAddressSDNode *GA) const override;
diff --git a/lib/Target/Mips/MipsSEFrameLowering.cpp b/lib/Target/Mips/MipsSEFrameLowering.cpp
index ec7bf314c641..096b3bee5d07 100644
--- a/lib/Target/Mips/MipsSEFrameLowering.cpp
+++ b/lib/Target/Mips/MipsSEFrameLowering.cpp
@@ -621,10 +621,17 @@ MipsSEFrameLowering::hasReservedCallFrame(const MachineFunction &MF) const {
!MFI->hasVarSizedObjects();
}
-void MipsSEFrameLowering::
-processFunctionBeforeCalleeSavedScan(MachineFunction &MF,
- RegScavenger *RS) const {
- MachineRegisterInfo &MRI = MF.getRegInfo();
+/// Mark \p Reg and all registers aliasing it in the bitset.
+void setAliasRegs(MachineFunction &MF, BitVector &SavedRegs, unsigned Reg) {
+ const TargetRegisterInfo *TRI = MF.getSubtarget().getRegisterInfo();
+ for (MCRegAliasIterator AI(Reg, TRI, true); AI.isValid(); ++AI)
+ SavedRegs.set(*AI);
+}
+
+void MipsSEFrameLowering::determineCalleeSaves(MachineFunction &MF,
+ BitVector &SavedRegs,
+ RegScavenger *RS) const {
+ TargetFrameLowering::determineCalleeSaves(MF, SavedRegs, RS);
MipsFunctionInfo *MipsFI = MF.getInfo<MipsFunctionInfo>();
MipsABIInfo ABI = STI.getABI();
unsigned FP = ABI.GetFramePtr();
@@ -632,10 +639,10 @@ processFunctionBeforeCalleeSavedScan(MachineFunction &MF,
// Mark $fp as used if function has dedicated frame pointer.
if (hasFP(MF))
- MRI.setPhysRegUsed(FP);
+ setAliasRegs(MF, SavedRegs, FP);
// Mark $s7 as used if function has dedicated base pointer.
if (hasBP(MF))
- MRI.setPhysRegUsed(BP);
+ setAliasRegs(MF, SavedRegs, BP);
// Create spill slots for eh data registers if function calls eh_return.
if (MipsFI->callsEhReturn())
diff --git a/lib/Target/Mips/MipsSEFrameLowering.h b/lib/Target/Mips/MipsSEFrameLowering.h
index 2fcd6bbb9a15..9cb32e6c7829 100644
--- a/lib/Target/Mips/MipsSEFrameLowering.h
+++ b/lib/Target/Mips/MipsSEFrameLowering.h
@@ -34,8 +34,8 @@ public:
bool hasReservedCallFrame(const MachineFunction &MF) const override;
- void processFunctionBeforeCalleeSavedScan(MachineFunction &MF,
- RegScavenger *RS) const override;
+ void determineCalleeSaves(MachineFunction &MF, BitVector &SavedRegs,
+ RegScavenger *RS) const override;
unsigned ehDataReg(unsigned I) const;
};
diff --git a/lib/Target/Mips/MipsSEISelDAGToDAG.cpp b/lib/Target/Mips/MipsSEISelDAGToDAG.cpp
index 990a2f8d8c85..cb46d731da29 100644
--- a/lib/Target/Mips/MipsSEISelDAGToDAG.cpp
+++ b/lib/Target/Mips/MipsSEISelDAGToDAG.cpp
@@ -841,7 +841,7 @@ std::pair<bool, SDNode*> MipsSEDAGToDAGISel::selectNode(SDNode *Node) {
}
case MipsISD::ThreadPointer: {
- EVT PtrVT = getTargetLowering()->getPointerTy();
+ EVT PtrVT = getTargetLowering()->getPointerTy(CurDAG->getDataLayout());
unsigned RdhwrOpc, DestReg;
if (PtrVT == MVT::i32) {
diff --git a/lib/Target/Mips/MipsSEISelLowering.cpp b/lib/Target/Mips/MipsSEISelLowering.cpp
index ae2837a8582c..b319fd07884b 100644
--- a/lib/Target/Mips/MipsSEISelLowering.cpp
+++ b/lib/Target/Mips/MipsSEISelLowering.cpp
@@ -838,8 +838,9 @@ static SDValue performMULCombine(SDNode *N, SelectionDAG &DAG,
if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(N->getOperand(1)))
if (!VT.isVector())
- return genConstMult(N->getOperand(0), C->getZExtValue(), SDLoc(N),
- VT, TL->getScalarShiftAmountTy(VT), DAG);
+ return genConstMult(N->getOperand(0), C->getZExtValue(), SDLoc(N), VT,
+ TL->getScalarShiftAmountTy(DAG.getDataLayout(), VT),
+ DAG);
return SDValue(N, 0);
}
diff --git a/lib/Target/Mips/MipsSelectionDAGInfo.cpp b/lib/Target/Mips/MipsSelectionDAGInfo.cpp
deleted file mode 100644
index edd8f670707f..000000000000
--- a/lib/Target/Mips/MipsSelectionDAGInfo.cpp
+++ /dev/null
@@ -1,23 +0,0 @@
-//===-- MipsSelectionDAGInfo.cpp - Mips SelectionDAG Info -----------------===//
-//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
-//
-//===----------------------------------------------------------------------===//
-//
-// This file implements the MipsSelectionDAGInfo class.
-//
-//===----------------------------------------------------------------------===//
-
-#include "MipsTargetMachine.h"
-using namespace llvm;
-
-#define DEBUG_TYPE "mips-selectiondag-info"
-
-MipsSelectionDAGInfo::MipsSelectionDAGInfo(const DataLayout &DL)
- : TargetSelectionDAGInfo(&DL) {}
-
-MipsSelectionDAGInfo::~MipsSelectionDAGInfo() {
-}
diff --git a/lib/Target/Mips/MipsSelectionDAGInfo.h b/lib/Target/Mips/MipsSelectionDAGInfo.h
deleted file mode 100644
index 061423fbeb86..000000000000
--- a/lib/Target/Mips/MipsSelectionDAGInfo.h
+++ /dev/null
@@ -1,31 +0,0 @@
-//===-- MipsSelectionDAGInfo.h - Mips SelectionDAG Info ---------*- C++ -*-===//
-//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
-//
-//===----------------------------------------------------------------------===//
-//
-// This file defines the Mips subclass for TargetSelectionDAGInfo.
-//
-//===----------------------------------------------------------------------===//
-
-#ifndef LLVM_LIB_TARGET_MIPS_MIPSSELECTIONDAGINFO_H
-#define LLVM_LIB_TARGET_MIPS_MIPSSELECTIONDAGINFO_H
-
-#include "llvm/Target/TargetSelectionDAGInfo.h"
-
-namespace llvm {
-
-class MipsTargetMachine;
-
-class MipsSelectionDAGInfo : public TargetSelectionDAGInfo {
-public:
- explicit MipsSelectionDAGInfo(const DataLayout &DL);
- ~MipsSelectionDAGInfo();
-};
-
-}
-
-#endif
diff --git a/lib/Target/Mips/MipsSubtarget.cpp b/lib/Target/Mips/MipsSubtarget.cpp
index c41bb16a58ea..471b6e19a8bb 100644
--- a/lib/Target/Mips/MipsSubtarget.cpp
+++ b/lib/Target/Mips/MipsSubtarget.cpp
@@ -70,7 +70,7 @@ MipsSubtarget::MipsSubtarget(const Triple &TT, const std::string &CPU,
HasMips4_32r2(false), HasMips5_32r2(false), InMips16Mode(false),
InMips16HardFloat(Mips16HardFloat), InMicroMipsMode(false), HasDSP(false),
HasDSPR2(false), AllowMixed16_32(Mixed16_32 | Mips_Os16), Os16(Mips_Os16),
- HasMSA(false), TM(TM), TargetTriple(TT), TSInfo(*TM.getDataLayout()),
+ HasMSA(false), TM(TM), TargetTriple(TT), TSInfo(),
InstrInfo(
MipsInstrInfo::create(initializeSubtargetDependencies(CPU, FS, TM))),
FrameLowering(MipsFrameLowering::create(*this)),
diff --git a/lib/Target/Mips/MipsSubtarget.h b/lib/Target/Mips/MipsSubtarget.h
index 5f9296812e1c..1db8881404c9 100644
--- a/lib/Target/Mips/MipsSubtarget.h
+++ b/lib/Target/Mips/MipsSubtarget.h
@@ -18,10 +18,10 @@
#include "MipsFrameLowering.h"
#include "MipsISelLowering.h"
#include "MipsInstrInfo.h"
-#include "MipsSelectionDAGInfo.h"
#include "llvm/IR/DataLayout.h"
#include "llvm/MC/MCInstrItineraries.h"
#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Target/TargetSelectionDAGInfo.h"
#include "llvm/Target/TargetSubtargetInfo.h"
#include <string>
@@ -140,7 +140,7 @@ class MipsSubtarget : public MipsGenSubtargetInfo {
Triple TargetTriple;
- const MipsSelectionDAGInfo TSInfo;
+ const TargetSelectionDAGInfo TSInfo;
std::unique_ptr<const MipsInstrInfo> InstrInfo;
std::unique_ptr<const MipsFrameLowering> FrameLowering;
std::unique_ptr<const MipsTargetLowering> TLInfo;
@@ -275,7 +275,7 @@ public:
void setHelperClassesMips16();
void setHelperClassesMipsSE();
- const MipsSelectionDAGInfo *getSelectionDAGInfo() const override {
+ const TargetSelectionDAGInfo *getSelectionDAGInfo() const override {
return &TSInfo;
}
const MipsInstrInfo *getInstrInfo() const override { return InstrInfo.get(); }
diff --git a/lib/Target/Mips/MipsTargetMachine.cpp b/lib/Target/Mips/MipsTargetMachine.cpp
index c820668befa0..1c77745d130b 100644
--- a/lib/Target/Mips/MipsTargetMachine.cpp
+++ b/lib/Target/Mips/MipsTargetMachine.cpp
@@ -62,7 +62,7 @@ static std::string computeDataLayout(const Triple &TT, StringRef CPU,
if (!ABI.IsN64())
Ret += "-p:32:32";
- // 8 and 16 bit integers only need no have natural alignment, but try to
+ // 8 and 16 bit integers only need to have natural alignment, but try to
// align them to 32 bits. 64 bit integers have natural alignment.
Ret += "-i8:8:32-i16:16:32-i64:64";
@@ -237,7 +237,7 @@ TargetIRAnalysis MipsTargetMachine::getTargetIRAnalysis() {
if (Subtarget->allowMixed16_32()) {
DEBUG(errs() << "No Target Transform Info Pass Added\n");
// FIXME: This is no longer necessary as the TTI returned is per-function.
- return TargetTransformInfo(getDataLayout());
+ return TargetTransformInfo(F.getParent()->getDataLayout());
}
DEBUG(errs() << "Target Transform Info Pass Added\n");
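// Illustrative sketch, not part of the patch: with the TargetMachine no longer
// the source of truth for the DataLayout, code reaches it from whatever IR or
// CodeGen object is already in hand. The routes used in this patch:
//   F.getParent()->getDataLayout();   // from a Function (via its Module)
//   DAG.getDataLayout();              // from a SelectionDAG
//   MF.getDataLayout();               // from a MachineFunction
//   I.getModule()->getDataLayout();   // from an Instruction
// Within one function's compilation these all refer to the same per-module
// DataLayout.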
diff --git a/lib/Target/NVPTX/MCTargetDesc/NVPTXMCTargetDesc.cpp b/lib/Target/NVPTX/MCTargetDesc/NVPTXMCTargetDesc.cpp
index 221d2f093aeb..ad7302037cad 100644
--- a/lib/Target/NVPTX/MCTargetDesc/NVPTXMCTargetDesc.cpp
+++ b/lib/Target/NVPTX/MCTargetDesc/NVPTXMCTargetDesc.cpp
@@ -37,7 +37,7 @@ static MCInstrInfo *createNVPTXMCInstrInfo() {
return X;
}
-static MCRegisterInfo *createNVPTXMCRegisterInfo(StringRef TT) {
+static MCRegisterInfo *createNVPTXMCRegisterInfo(const Triple &TT) {
MCRegisterInfo *X = new MCRegisterInfo();
// PTX does not have a return address register.
InitNVPTXMCRegisterInfo(X, 0);
@@ -46,13 +46,13 @@ static MCRegisterInfo *createNVPTXMCRegisterInfo(StringRef TT) {
static MCSubtargetInfo *
createNVPTXMCSubtargetInfo(const Triple &TT, StringRef CPU, StringRef FS) {
- MCSubtargetInfo *X = new MCSubtargetInfo();
- InitNVPTXMCSubtargetInfo(X, TT, CPU, FS);
- return X;
+ return createNVPTXMCSubtargetInfoImpl(TT, CPU, FS);
}
-static MCCodeGenInfo *createNVPTXMCCodeGenInfo(
- StringRef TT, Reloc::Model RM, CodeModel::Model CM, CodeGenOpt::Level OL) {
+static MCCodeGenInfo *createNVPTXMCCodeGenInfo(const Triple &TT,
+ Reloc::Model RM,
+ CodeModel::Model CM,
+ CodeGenOpt::Level OL) {
MCCodeGenInfo *X = new MCCodeGenInfo();
// The default relocation model is used regardless of what the client has
diff --git a/lib/Target/NVPTX/NVPTXAsmPrinter.cpp b/lib/Target/NVPTX/NVPTXAsmPrinter.cpp
index cadd7a46cd9d..ecb0f0a1d0a1 100644
--- a/lib/Target/NVPTX/NVPTXAsmPrinter.cpp
+++ b/lib/Target/NVPTX/NVPTXAsmPrinter.cpp
@@ -340,7 +340,7 @@ MCOperand NVPTXAsmPrinter::GetSymbolRef(const MCSymbol *Symbol) {
}
void NVPTXAsmPrinter::printReturnValStr(const Function *F, raw_ostream &O) {
- const DataLayout *TD = TM.getDataLayout();
+ const DataLayout &DL = getDataLayout();
const TargetLowering *TLI = nvptxSubtarget->getTargetLowering();
Type *Ty = F->getReturnType();
@@ -366,20 +366,20 @@ void NVPTXAsmPrinter::printReturnValStr(const Function *F, raw_ostream &O) {
O << ".param .b" << size << " func_retval0";
} else if (isa<PointerType>(Ty)) {
- O << ".param .b" << TLI->getPointerTy().getSizeInBits()
+ O << ".param .b" << TLI->getPointerTy(DL).getSizeInBits()
<< " func_retval0";
} else if ((Ty->getTypeID() == Type::StructTyID) || isa<VectorType>(Ty)) {
- unsigned totalsz = TD->getTypeAllocSize(Ty);
+ unsigned totalsz = DL.getTypeAllocSize(Ty);
unsigned retAlignment = 0;
if (!llvm::getAlign(*F, 0, retAlignment))
- retAlignment = TD->getABITypeAlignment(Ty);
+ retAlignment = DL.getABITypeAlignment(Ty);
O << ".param .align " << retAlignment << " .b8 func_retval0[" << totalsz
<< "]";
} else
llvm_unreachable("Unknown return type");
} else {
SmallVector<EVT, 16> vtparts;
- ComputeValueVTs(*TLI, Ty, vtparts);
+ ComputeValueVTs(*TLI, DL, Ty, vtparts);
unsigned idx = 0;
for (unsigned i = 0, e = vtparts.size(); i != e; ++i) {
unsigned elems = 1;
@@ -1433,7 +1433,7 @@ void NVPTXAsmPrinter::emitFunctionParamList(const Function *F, raw_ostream &O) {
bool first = true;
bool isKernelFunc = llvm::isKernelFunction(*F);
bool isABI = (nvptxSubtarget->getSmVersion() >= 20);
- MVT thePointerTy = TLI->getPointerTy();
+ MVT thePointerTy = TLI->getPointerTy(*TD);
O << "(\n";
@@ -1579,7 +1579,7 @@ void NVPTXAsmPrinter::emitFunctionParamList(const Function *F, raw_ostream &O) {
// Further, if a part is vector, print the above for
// each vector element.
SmallVector<EVT, 16> vtparts;
- ComputeValueVTs(*TLI, ETy, vtparts);
+ ComputeValueVTs(*TLI, getDataLayout(), ETy, vtparts);
for (unsigned i = 0, e = vtparts.size(); i != e; ++i) {
unsigned elems = 1;
EVT elemtype = vtparts[i];
diff --git a/lib/Target/NVPTX/NVPTXISelLowering.cpp b/lib/Target/NVPTX/NVPTXISelLowering.cpp
index 09e0bd5d3d88..b75cf4040312 100644
--- a/lib/Target/NVPTX/NVPTXISelLowering.cpp
+++ b/lib/Target/NVPTX/NVPTXISelLowering.cpp
@@ -80,14 +80,14 @@ static bool IsPTXVectorType(MVT VT) {
/// NOTE: This is a band-aid for code that expects ComputeValueVTs to return the
/// same number of types as the Ins/Outs arrays in LowerFormalArguments,
/// LowerCall, and LowerReturn.
-static void ComputePTXValueVTs(const TargetLowering &TLI, Type *Ty,
- SmallVectorImpl<EVT> &ValueVTs,
+static void ComputePTXValueVTs(const TargetLowering &TLI, const DataLayout &DL,
+ Type *Ty, SmallVectorImpl<EVT> &ValueVTs,
SmallVectorImpl<uint64_t> *Offsets = nullptr,
uint64_t StartingOffset = 0) {
SmallVector<EVT, 16> TempVTs;
SmallVector<uint64_t, 16> TempOffsets;
- ComputeValueVTs(TLI, Ty, TempVTs, &TempOffsets, StartingOffset);
+ ComputeValueVTs(TLI, DL, Ty, TempVTs, &TempOffsets, StartingOffset);
for (unsigned i = 0, e = TempVTs.size(); i != e; ++i) {
EVT VT = TempVTs[i];
uint64_t Off = TempOffsets[i];
@@ -885,15 +885,16 @@ SDValue
NVPTXTargetLowering::LowerGlobalAddress(SDValue Op, SelectionDAG &DAG) const {
SDLoc dl(Op);
const GlobalValue *GV = cast<GlobalAddressSDNode>(Op)->getGlobal();
- Op = DAG.getTargetGlobalAddress(GV, dl, getPointerTy());
- return DAG.getNode(NVPTXISD::Wrapper, dl, getPointerTy(), Op);
+ auto PtrVT = getPointerTy(DAG.getDataLayout());
+ Op = DAG.getTargetGlobalAddress(GV, dl, PtrVT);
+ return DAG.getNode(NVPTXISD::Wrapper, dl, PtrVT, Op);
}
-std::string
-NVPTXTargetLowering::getPrototype(Type *retTy, const ArgListTy &Args,
- const SmallVectorImpl<ISD::OutputArg> &Outs,
- unsigned retAlignment,
- const ImmutableCallSite *CS) const {
+std::string NVPTXTargetLowering::getPrototype(
+ const DataLayout &DL, Type *retTy, const ArgListTy &Args,
+ const SmallVectorImpl<ISD::OutputArg> &Outs, unsigned retAlignment,
+ const ImmutableCallSite *CS) const {
+ auto PtrVT = getPointerTy(DL);
bool isABI = (STI.getSmVersion() >= 20);
assert(isABI && "Non-ABI compilation is not supported");
@@ -921,13 +922,12 @@ NVPTXTargetLowering::getPrototype(Type *retTy, const ArgListTy &Args,
O << ".param .b" << size << " _";
} else if (isa<PointerType>(retTy)) {
- O << ".param .b" << getPointerTy().getSizeInBits() << " _";
+ O << ".param .b" << PtrVT.getSizeInBits() << " _";
} else if ((retTy->getTypeID() == Type::StructTyID) ||
isa<VectorType>(retTy)) {
- O << ".param .align "
- << retAlignment
- << " .b8 _["
- << getDataLayout()->getTypeAllocSize(retTy) << "]";
+ auto &DL = CS->getCalledFunction()->getParent()->getDataLayout();
+ O << ".param .align " << retAlignment << " .b8 _["
+ << DL.getTypeAllocSize(retTy) << "]";
} else {
llvm_unreachable("Unknown return type");
}
@@ -936,7 +936,6 @@ NVPTXTargetLowering::getPrototype(Type *retTy, const ArgListTy &Args,
O << "_ (";
bool first = true;
- MVT thePointerTy = getPointerTy();
unsigned OIdx = 0;
for (unsigned i = 0, e = Args.size(); i != e; ++i, ++OIdx) {
@@ -950,24 +949,23 @@ NVPTXTargetLowering::getPrototype(Type *retTy, const ArgListTy &Args,
if (Ty->isAggregateType() || Ty->isVectorTy()) {
unsigned align = 0;
const CallInst *CallI = cast<CallInst>(CS->getInstruction());
- const DataLayout *TD = getDataLayout();
// +1 because index 0 is reserved for return type alignment
if (!llvm::getAlign(*CallI, i + 1, align))
- align = TD->getABITypeAlignment(Ty);
- unsigned sz = TD->getTypeAllocSize(Ty);
+ align = DL.getABITypeAlignment(Ty);
+ unsigned sz = DL.getTypeAllocSize(Ty);
O << ".param .align " << align << " .b8 ";
O << "_";
O << "[" << sz << "]";
// update the index for Outs
SmallVector<EVT, 16> vtparts;
- ComputeValueVTs(*this, Ty, vtparts);
+ ComputeValueVTs(*this, DL, Ty, vtparts);
if (unsigned len = vtparts.size())
OIdx += len - 1;
continue;
}
// i8 types in IR will be i16 types in SDAG
- assert((getValueType(Ty) == Outs[OIdx].VT ||
- (getValueType(Ty) == MVT::i8 && Outs[OIdx].VT == MVT::i16)) &&
+ assert((getValueType(DL, Ty) == Outs[OIdx].VT ||
+ (getValueType(DL, Ty) == MVT::i8 && Outs[OIdx].VT == MVT::i16)) &&
"type mismatch between callee prototype and arguments");
// scalar type
unsigned sz = 0;
@@ -976,7 +974,7 @@ NVPTXTargetLowering::getPrototype(Type *retTy, const ArgListTy &Args,
if (sz < 32)
sz = 32;
} else if (isa<PointerType>(Ty))
- sz = thePointerTy.getSizeInBits();
+ sz = PtrVT.getSizeInBits();
else
sz = Ty->getPrimitiveSizeInBits();
O << ".param .b" << sz << " ";
@@ -988,7 +986,7 @@ NVPTXTargetLowering::getPrototype(Type *retTy, const ArgListTy &Args,
Type *ETy = PTy->getElementType();
unsigned align = Outs[OIdx].Flags.getByValAlign();
- unsigned sz = getDataLayout()->getTypeAllocSize(ETy);
+ unsigned sz = DL.getTypeAllocSize(ETy);
O << ".param .align " << align << " .b8 ";
O << "_";
O << "[" << sz << "]";
@@ -1002,7 +1000,6 @@ NVPTXTargetLowering::getArgumentAlignment(SDValue Callee,
const ImmutableCallSite *CS,
Type *Ty,
unsigned Idx) const {
- const DataLayout *TD = getDataLayout();
unsigned Align = 0;
const Value *DirectCallee = CS->getCalledFunction();
@@ -1043,7 +1040,8 @@ NVPTXTargetLowering::getArgumentAlignment(SDValue Callee,
// Call is indirect or alignment information is not available, fall back to
// the ABI type alignment
- return TD->getABITypeAlignment(Ty);
+ auto &DL = CS->getCaller()->getParent()->getDataLayout();
+ return DL.getABITypeAlignment(Ty);
}
SDValue NVPTXTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
@@ -1064,9 +1062,9 @@ SDValue NVPTXTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
assert(isABI && "Non-ABI compilation is not supported");
if (!isABI)
return Chain;
- const DataLayout *TD = getDataLayout();
MachineFunction &MF = DAG.getMachineFunction();
const Function *F = MF.getFunction();
+ auto &DL = MF.getDataLayout();
SDValue tempChain = Chain;
Chain = DAG.getCALLSEQ_START(Chain,
@@ -1096,11 +1094,12 @@ SDValue NVPTXTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
// aggregate
SmallVector<EVT, 16> vtparts;
SmallVector<uint64_t, 16> Offsets;
- ComputePTXValueVTs(*this, Ty, vtparts, &Offsets, 0);
+ ComputePTXValueVTs(*this, DAG.getDataLayout(), Ty, vtparts, &Offsets,
+ 0);
unsigned align = getArgumentAlignment(Callee, CS, Ty, paramCount + 1);
// declare .param .align <align> .b8 .param<n>[<size>];
- unsigned sz = TD->getTypeAllocSize(Ty);
+ unsigned sz = DL.getTypeAllocSize(Ty);
SDVTList DeclareParamVTs = DAG.getVTList(MVT::Other, MVT::Glue);
SDValue DeclareParamOps[] = { Chain, DAG.getConstant(align, dl,
MVT::i32),
@@ -1137,10 +1136,10 @@ SDValue NVPTXTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
continue;
}
if (Ty->isVectorTy()) {
- EVT ObjectVT = getValueType(Ty);
+ EVT ObjectVT = getValueType(DL, Ty);
unsigned align = getArgumentAlignment(Callee, CS, Ty, paramCount + 1);
// declare .param .align <align> .b8 .param<n>[<size>];
- unsigned sz = TD->getTypeAllocSize(Ty);
+ unsigned sz = DL.getTypeAllocSize(Ty);
SDVTList DeclareParamVTs = DAG.getVTList(MVT::Other, MVT::Glue);
SDValue DeclareParamOps[] = { Chain,
DAG.getConstant(align, dl, MVT::i32),
@@ -1321,7 +1320,8 @@ SDValue NVPTXTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
SmallVector<uint64_t, 16> Offsets;
const PointerType *PTy = dyn_cast<PointerType>(Args[i].Ty);
assert(PTy && "Type of a byval parameter should be pointer");
- ComputePTXValueVTs(*this, PTy->getElementType(), vtparts, &Offsets, 0);
+ ComputePTXValueVTs(*this, DAG.getDataLayout(), PTy->getElementType(),
+ vtparts, &Offsets, 0);
// declare .param .align <align> .b8 .param<n>[<size>];
unsigned sz = Outs[OIdx].Flags.getByValSize();
@@ -1342,9 +1342,9 @@ SDValue NVPTXTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
EVT elemtype = vtparts[j];
int curOffset = Offsets[j];
unsigned PartAlign = GreatestCommonDivisor64(ArgAlign, curOffset);
- SDValue srcAddr =
- DAG.getNode(ISD::ADD, dl, getPointerTy(), OutVals[OIdx],
- DAG.getConstant(curOffset, dl, getPointerTy()));
+ auto PtrVT = getPointerTy(DAG.getDataLayout());
+ SDValue srcAddr = DAG.getNode(ISD::ADD, dl, PtrVT, OutVals[OIdx],
+ DAG.getConstant(curOffset, dl, PtrVT));
SDValue theVal = DAG.getLoad(elemtype, dl, tempChain, srcAddr,
MachinePointerInfo(), false, false, false,
PartAlign);
@@ -1371,12 +1371,12 @@ SDValue NVPTXTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
// Handle Result
if (Ins.size() > 0) {
SmallVector<EVT, 16> resvtparts;
- ComputeValueVTs(*this, retTy, resvtparts);
+ ComputeValueVTs(*this, DL, retTy, resvtparts);
// Declare
// .param .align 16 .b8 retval0[<size-in-bytes>], or
// .param .b<size-in-bits> retval0
- unsigned resultsz = TD->getTypeAllocSizeInBits(retTy);
+ unsigned resultsz = DL.getTypeAllocSizeInBits(retTy);
// Emit ".param .b<size-in-bits> retval0" instead of byte arrays only for
// these three types to match the logic in
// NVPTXAsmPrinter::printReturnValStr and NVPTXTargetLowering::getPrototype.
@@ -1415,7 +1415,8 @@ SDValue NVPTXTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
// The prototype is embedded in a string and put as the operand for a
// CallPrototype SDNode which will print out to the value of the string.
SDVTList ProtoVTs = DAG.getVTList(MVT::Other, MVT::Glue);
- std::string Proto = getPrototype(retTy, Args, Outs, retAlignment, CS);
+ std::string Proto =
+ getPrototype(DAG.getDataLayout(), retTy, Args, Outs, retAlignment, CS);
const char *ProtoStr =
nvTM->getManagedStrPool()->getManagedString(Proto.c_str())->c_str();
SDValue ProtoOps[] = {
@@ -1477,7 +1478,7 @@ SDValue NVPTXTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
// Generate loads from param memory/moves from registers for result
if (Ins.size() > 0) {
if (retTy && retTy->isVectorTy()) {
- EVT ObjectVT = getValueType(retTy);
+ EVT ObjectVT = getValueType(DL, retTy);
unsigned NumElts = ObjectVT.getVectorNumElements();
EVT EltVT = ObjectVT.getVectorElementType();
assert(STI.getTargetLowering()->getNumRegisters(F->getContext(),
@@ -1590,13 +1591,13 @@ SDValue NVPTXTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
Elt = DAG.getNode(ISD::TRUNCATE, dl, EltVT, Elt);
InVals.push_back(Elt);
}
- Ofst += TD->getTypeAllocSize(VecVT.getTypeForEVT(F->getContext()));
+ Ofst += DL.getTypeAllocSize(VecVT.getTypeForEVT(F->getContext()));
}
}
} else {
SmallVector<EVT, 16> VTs;
SmallVector<uint64_t, 16> Offsets;
- ComputePTXValueVTs(*this, retTy, VTs, &Offsets, 0);
+ ComputePTXValueVTs(*this, DAG.getDataLayout(), retTy, VTs, &Offsets, 0);
assert(VTs.size() == Ins.size() && "Bad value decomposition");
unsigned RetAlign = getArgumentAlignment(Callee, CS, retTy, 0);
for (unsigned i = 0, e = Ins.size(); i != e; ++i) {
@@ -1608,8 +1609,7 @@ SDValue NVPTXTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
SmallVector<EVT, 4> LoadRetVTs;
EVT TheLoadType = VTs[i];
- if (retTy->isIntegerTy() &&
- TD->getTypeAllocSizeInBits(retTy) < 32) {
+ if (retTy->isIntegerTy() && DL.getTypeAllocSizeInBits(retTy) < 32) {
// This is for integer types only, and specifically not for
// aggregates.
LoadRetVTs.push_back(MVT::i32);
@@ -1920,11 +1920,11 @@ NVPTXTargetLowering::LowerSTOREVector(SDValue Op, SelectionDAG &DAG) const {
}
MemSDNode *MemSD = cast<MemSDNode>(N);
- const DataLayout *TD = getDataLayout();
+ const DataLayout &TD = DAG.getDataLayout();
unsigned Align = MemSD->getAlignment();
unsigned PrefAlign =
- TD->getPrefTypeAlignment(ValVT.getTypeForEVT(*DAG.getContext()));
+ TD.getPrefTypeAlignment(ValVT.getTypeForEVT(*DAG.getContext()));
if (Align < PrefAlign) {
// This store is not sufficiently aligned, so bail out and let this vector
// store be scalarized. Note that we may still be able to emit smaller
@@ -2064,7 +2064,8 @@ SDValue NVPTXTargetLowering::LowerFormalArguments(
const SmallVectorImpl<ISD::InputArg> &Ins, SDLoc dl, SelectionDAG &DAG,
SmallVectorImpl<SDValue> &InVals) const {
MachineFunction &MF = DAG.getMachineFunction();
- const DataLayout *TD = getDataLayout();
+ const DataLayout &DL = DAG.getDataLayout();
+ auto PtrVT = getPointerTy(DAG.getDataLayout());
const Function *F = MF.getFunction();
const AttributeSet &PAL = F->getAttributes();
@@ -2118,7 +2119,7 @@ SDValue NVPTXTargetLowering::LowerFormalArguments(
if (Ty->isAggregateType()) {
SmallVector<EVT, 16> vtparts;
- ComputePTXValueVTs(*this, Ty, vtparts);
+ ComputePTXValueVTs(*this, DAG.getDataLayout(), Ty, vtparts);
assert(vtparts.size() > 0 && "empty aggregate type not expected");
for (unsigned parti = 0, parte = vtparts.size(); parti != parte;
++parti) {
@@ -2130,7 +2131,7 @@ SDValue NVPTXTargetLowering::LowerFormalArguments(
continue;
}
if (Ty->isVectorTy()) {
- EVT ObjectVT = getValueType(Ty);
+ EVT ObjectVT = getValueType(DL, Ty);
unsigned NumRegs = TLI->getNumRegisters(F->getContext(), ObjectVT);
for (unsigned parti = 0; parti < NumRegs; ++parti) {
InVals.push_back(DAG.getNode(ISD::UNDEF, dl, Ins[InsIdx].VT));
@@ -2156,13 +2157,14 @@ SDValue NVPTXTargetLowering::LowerFormalArguments(
// NOTE: Here, we lose the ability to issue vector loads for vectors
// that are a part of a struct. This should be investigated in the
// future.
- ComputePTXValueVTs(*this, Ty, vtparts, &offsets, 0);
+ ComputePTXValueVTs(*this, DAG.getDataLayout(), Ty, vtparts, &offsets,
+ 0);
assert(vtparts.size() > 0 && "empty aggregate type not expected");
bool aggregateIsPacked = false;
if (StructType *STy = llvm::dyn_cast<StructType>(Ty))
aggregateIsPacked = STy->isPacked();
- SDValue Arg = getParamSymbol(DAG, idx, getPointerTy());
+ SDValue Arg = getParamSymbol(DAG, idx, PtrVT);
for (unsigned parti = 0, parte = vtparts.size(); parti != parte;
++parti) {
EVT partVT = vtparts[parti];
@@ -2170,12 +2172,12 @@ SDValue NVPTXTargetLowering::LowerFormalArguments(
PointerType::get(partVT.getTypeForEVT(F->getContext()),
llvm::ADDRESS_SPACE_PARAM));
SDValue srcAddr =
- DAG.getNode(ISD::ADD, dl, getPointerTy(), Arg,
- DAG.getConstant(offsets[parti], dl, getPointerTy()));
- unsigned partAlign =
- aggregateIsPacked ? 1
- : TD->getABITypeAlignment(
- partVT.getTypeForEVT(F->getContext()));
+ DAG.getNode(ISD::ADD, dl, PtrVT, Arg,
+ DAG.getConstant(offsets[parti], dl, PtrVT));
+ unsigned partAlign = aggregateIsPacked
+ ? 1
+ : DL.getABITypeAlignment(
+ partVT.getTypeForEVT(F->getContext()));
SDValue p;
if (Ins[InsIdx].VT.getSizeInBits() > partVT.getSizeInBits()) {
ISD::LoadExtType ExtOp = Ins[InsIdx].Flags.isSExt() ?
@@ -2198,8 +2200,8 @@ SDValue NVPTXTargetLowering::LowerFormalArguments(
continue;
}
if (Ty->isVectorTy()) {
- EVT ObjectVT = getValueType(Ty);
- SDValue Arg = getParamSymbol(DAG, idx, getPointerTy());
+ EVT ObjectVT = getValueType(DL, Ty);
+ SDValue Arg = getParamSymbol(DAG, idx, PtrVT);
unsigned NumElts = ObjectVT.getVectorNumElements();
assert(TLI->getNumRegisters(F->getContext(), ObjectVT) == NumElts &&
"Vector was not scalarized");
@@ -2212,9 +2214,9 @@ SDValue NVPTXTargetLowering::LowerFormalArguments(
Value *SrcValue = Constant::getNullValue(PointerType::get(
EltVT.getTypeForEVT(F->getContext()), llvm::ADDRESS_SPACE_PARAM));
SDValue P = DAG.getLoad(
- EltVT, dl, Root, Arg, MachinePointerInfo(SrcValue), false,
- false, true,
- TD->getABITypeAlignment(EltVT.getTypeForEVT(F->getContext())));
+ EltVT, dl, Root, Arg, MachinePointerInfo(SrcValue), false, false,
+ true,
+ DL.getABITypeAlignment(EltVT.getTypeForEVT(F->getContext())));
if (P.getNode())
P.getNode()->setIROrder(idx + 1);
@@ -2229,9 +2231,9 @@ SDValue NVPTXTargetLowering::LowerFormalArguments(
Value *SrcValue = Constant::getNullValue(PointerType::get(
VecVT.getTypeForEVT(F->getContext()), llvm::ADDRESS_SPACE_PARAM));
SDValue P = DAG.getLoad(
- VecVT, dl, Root, Arg, MachinePointerInfo(SrcValue), false,
- false, true,
- TD->getABITypeAlignment(VecVT.getTypeForEVT(F->getContext())));
+ VecVT, dl, Root, Arg, MachinePointerInfo(SrcValue), false, false,
+ true,
+ DL.getABITypeAlignment(VecVT.getTypeForEVT(F->getContext())));
if (P.getNode())
P.getNode()->setIROrder(idx + 1);
@@ -2269,13 +2271,12 @@ SDValue NVPTXTargetLowering::LowerFormalArguments(
Value *SrcValue = Constant::getNullValue(
PointerType::get(VecVT.getTypeForEVT(F->getContext()),
llvm::ADDRESS_SPACE_PARAM));
- SDValue SrcAddr =
- DAG.getNode(ISD::ADD, dl, getPointerTy(), Arg,
- DAG.getConstant(Ofst, dl, getPointerTy()));
+ SDValue SrcAddr = DAG.getNode(ISD::ADD, dl, PtrVT, Arg,
+ DAG.getConstant(Ofst, dl, PtrVT));
SDValue P = DAG.getLoad(
VecVT, dl, Root, SrcAddr, MachinePointerInfo(SrcValue), false,
false, true,
- TD->getABITypeAlignment(VecVT.getTypeForEVT(F->getContext())));
+ DL.getABITypeAlignment(VecVT.getTypeForEVT(F->getContext())));
if (P.getNode())
P.getNode()->setIROrder(idx + 1);
@@ -2288,7 +2289,7 @@ SDValue NVPTXTargetLowering::LowerFormalArguments(
Elt = DAG.getNode(ISD::ANY_EXTEND, dl, Ins[InsIdx].VT, Elt);
InVals.push_back(Elt);
}
- Ofst += TD->getTypeAllocSize(VecVT.getTypeForEVT(F->getContext()));
+ Ofst += DL.getTypeAllocSize(VecVT.getTypeForEVT(F->getContext()));
}
InsIdx += NumElts;
}
@@ -2298,23 +2299,24 @@ SDValue NVPTXTargetLowering::LowerFormalArguments(
continue;
}
// A plain scalar.
- EVT ObjectVT = getValueType(Ty);
+ EVT ObjectVT = getValueType(DL, Ty);
// If ABI, load from the param symbol
- SDValue Arg = getParamSymbol(DAG, idx, getPointerTy());
+ SDValue Arg = getParamSymbol(DAG, idx, PtrVT);
Value *srcValue = Constant::getNullValue(PointerType::get(
ObjectVT.getTypeForEVT(F->getContext()), llvm::ADDRESS_SPACE_PARAM));
SDValue p;
if (ObjectVT.getSizeInBits() < Ins[InsIdx].VT.getSizeInBits()) {
ISD::LoadExtType ExtOp = Ins[InsIdx].Flags.isSExt() ?
ISD::SEXTLOAD : ISD::ZEXTLOAD;
- p = DAG.getExtLoad(ExtOp, dl, Ins[InsIdx].VT, Root, Arg,
- MachinePointerInfo(srcValue), ObjectVT, false, false,
- false,
- TD->getABITypeAlignment(ObjectVT.getTypeForEVT(F->getContext())));
+ p = DAG.getExtLoad(
+ ExtOp, dl, Ins[InsIdx].VT, Root, Arg, MachinePointerInfo(srcValue),
+ ObjectVT, false, false, false,
+ DL.getABITypeAlignment(ObjectVT.getTypeForEVT(F->getContext())));
} else {
- p = DAG.getLoad(Ins[InsIdx].VT, dl, Root, Arg,
- MachinePointerInfo(srcValue), false, false, false,
- TD->getABITypeAlignment(ObjectVT.getTypeForEVT(F->getContext())));
+ p = DAG.getLoad(
+ Ins[InsIdx].VT, dl, Root, Arg, MachinePointerInfo(srcValue), false,
+ false, false,
+ DL.getABITypeAlignment(ObjectVT.getTypeForEVT(F->getContext())));
}
if (p.getNode())
p.getNode()->setIROrder(idx + 1);
@@ -2329,10 +2331,10 @@ SDValue NVPTXTargetLowering::LowerFormalArguments(
// machine instruction fails because TargetExternalSymbol
// (not lowered) is target dependent, and CopyToReg assumes
// the source is lowered.
- EVT ObjectVT = getValueType(Ty);
+ EVT ObjectVT = getValueType(DL, Ty);
assert(ObjectVT == Ins[InsIdx].VT &&
"Ins type did not match function type");
- SDValue Arg = getParamSymbol(DAG, idx, getPointerTy());
+ SDValue Arg = getParamSymbol(DAG, idx, PtrVT);
SDValue p = DAG.getNode(NVPTXISD::MoveParam, dl, ObjectVT, Arg);
if (p.getNode())
p.getNode()->setIROrder(idx + 1);
@@ -2370,7 +2372,7 @@ NVPTXTargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv,
MachineFunction &MF = DAG.getMachineFunction();
const Function *F = MF.getFunction();
Type *RetTy = F->getReturnType();
- const DataLayout *TD = getDataLayout();
+ const DataLayout &TD = DAG.getDataLayout();
bool isABI = (STI.getSmVersion() >= 20);
assert(isABI && "Non-ABI compilation is not supported");
@@ -2384,7 +2386,7 @@ NVPTXTargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv,
assert(NumElts == Outs.size() && "Bad scalarization of return value");
// const_cast can be removed in later LLVM versions
- EVT EltVT = getValueType(RetTy).getVectorElementType();
+ EVT EltVT = getValueType(TD, RetTy).getVectorElementType();
bool NeedExtend = false;
if (EltVT.getSizeInBits() < 16)
NeedExtend = true;
@@ -2435,7 +2437,7 @@ NVPTXTargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv,
EVT VecVT =
EVT::getVectorVT(F->getContext(), EltVT, VecSize);
unsigned PerStoreOffset =
- TD->getTypeAllocSize(VecVT.getTypeForEVT(F->getContext()));
+ TD.getTypeAllocSize(VecVT.getTypeForEVT(F->getContext()));
for (unsigned i = 0; i < NumElts; i += VecSize) {
// Get values
@@ -2493,7 +2495,7 @@ NVPTXTargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv,
} else {
SmallVector<EVT, 16> ValVTs;
SmallVector<uint64_t, 16> Offsets;
- ComputePTXValueVTs(*this, RetTy, ValVTs, &Offsets, 0);
+ ComputePTXValueVTs(*this, DAG.getDataLayout(), RetTy, ValVTs, &Offsets, 0);
assert(ValVTs.size() == OutVals.size() && "Bad return value decomposition");
for (unsigned i = 0, e = Outs.size(); i != e; ++i) {
@@ -2509,8 +2511,7 @@ NVPTXTargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv,
TheValType.getVectorElementType(), TmpVal,
DAG.getIntPtrConstant(j, dl));
EVT TheStoreType = ValVTs[i];
- if (RetTy->isIntegerTy() &&
- TD->getTypeAllocSizeInBits(RetTy) < 32) {
+ if (RetTy->isIntegerTy() && TD.getTypeAllocSizeInBits(RetTy) < 32) {
// The following zero-extension is for integer types only, and
// specifically not for aggregates.
TmpVal = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, TmpVal);
@@ -3291,14 +3292,14 @@ bool NVPTXTargetLowering::getTgtMemIntrinsic(
case Intrinsic::nvvm_ldu_global_i:
case Intrinsic::nvvm_ldu_global_f:
case Intrinsic::nvvm_ldu_global_p: {
-
+ auto &DL = I.getModule()->getDataLayout();
Info.opc = ISD::INTRINSIC_W_CHAIN;
if (Intrinsic == Intrinsic::nvvm_ldu_global_i)
- Info.memVT = getValueType(I.getType());
+ Info.memVT = getValueType(DL, I.getType());
else if(Intrinsic == Intrinsic::nvvm_ldu_global_p)
- Info.memVT = getPointerTy();
+ Info.memVT = getPointerTy(DL);
else
- Info.memVT = getValueType(I.getType());
+ Info.memVT = getValueType(DL, I.getType());
Info.ptrVal = I.getArgOperand(0);
Info.offset = 0;
Info.vol = 0;
@@ -3311,14 +3312,15 @@ bool NVPTXTargetLowering::getTgtMemIntrinsic(
case Intrinsic::nvvm_ldg_global_i:
case Intrinsic::nvvm_ldg_global_f:
case Intrinsic::nvvm_ldg_global_p: {
+ auto &DL = I.getModule()->getDataLayout();
Info.opc = ISD::INTRINSIC_W_CHAIN;
if (Intrinsic == Intrinsic::nvvm_ldg_global_i)
- Info.memVT = getValueType(I.getType());
+ Info.memVT = getValueType(DL, I.getType());
else if(Intrinsic == Intrinsic::nvvm_ldg_global_p)
- Info.memVT = getPointerTy();
+ Info.memVT = getPointerTy(DL);
else
- Info.memVT = getValueType(I.getType());
+ Info.memVT = getValueType(DL, I.getType());
Info.ptrVal = I.getArgOperand(0);
Info.offset = 0;
Info.vol = 0;
@@ -3731,8 +3733,8 @@ bool NVPTXTargetLowering::getTgtMemIntrinsic(
/// Used to guide target specific optimizations, like loop strength reduction
/// (LoopStrengthReduce.cpp) and memory optimization for address mode
/// (CodeGenPrepare.cpp)
-bool NVPTXTargetLowering::isLegalAddressingMode(const AddrMode &AM,
- Type *Ty,
+bool NVPTXTargetLowering::isLegalAddressingMode(const DataLayout &DL,
+ const AddrMode &AM, Type *Ty,
unsigned AS) const {
// AddrMode - This represents an addressing mode of:
@@ -3772,7 +3774,7 @@ bool NVPTXTargetLowering::isLegalAddressingMode(const AddrMode &AM,
/// getConstraintType - Given a constraint letter, return the type of
/// constraint it is for this target.
NVPTXTargetLowering::ConstraintType
-NVPTXTargetLowering::getConstraintType(const std::string &Constraint) const {
+NVPTXTargetLowering::getConstraintType(StringRef Constraint) const {
if (Constraint.size() == 1) {
switch (Constraint[0]) {
default:
@@ -3794,7 +3796,7 @@ NVPTXTargetLowering::getConstraintType(const std::string &Constraint) const {
std::pair<unsigned, const TargetRegisterClass *>
NVPTXTargetLowering::getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI,
- const std::string &Constraint,
+ StringRef Constraint,
MVT VT) const {
if (Constraint.size() == 1) {
switch (Constraint[0]) {
@@ -4251,7 +4253,6 @@ SDValue NVPTXTargetLowering::PerformDAGCombine(SDNode *N,
/// ReplaceVectorLoad - Convert vector loads into multi-output scalar loads.
static void ReplaceLoadVector(SDNode *N, SelectionDAG &DAG,
- const DataLayout *TD,
SmallVectorImpl<SDValue> &Results) {
EVT ResVT = N->getValueType(0);
SDLoc DL(N);
@@ -4282,8 +4283,9 @@ static void ReplaceLoadVector(SDNode *N, SelectionDAG &DAG,
LoadSDNode *LD = cast<LoadSDNode>(N);
unsigned Align = LD->getAlignment();
+ auto &TD = DAG.getDataLayout();
unsigned PrefAlign =
- TD->getPrefTypeAlignment(ResVT.getTypeForEVT(*DAG.getContext()));
+ TD.getPrefTypeAlignment(ResVT.getTypeForEVT(*DAG.getContext()));
if (Align < PrefAlign) {
// This load is not sufficiently aligned, so bail out and let this vector
// load be scalarized. Note that we may still be able to emit smaller
@@ -4495,7 +4497,7 @@ void NVPTXTargetLowering::ReplaceNodeResults(
default:
report_fatal_error("Unhandled custom legalization");
case ISD::LOAD:
- ReplaceLoadVector(N, DAG, getDataLayout(), Results);
+ ReplaceLoadVector(N, DAG, Results);
return;
case ISD::INTRINSIC_W_CHAIN:
ReplaceINTRINSIC_W_CHAIN(N, DAG, Results);
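// Illustrative sketch, not part of the patch: free helpers such as
// ComputeValueVTs (and NVPTX's ComputePTXValueVTs wrapper above) now take the
// DataLayout as a parameter instead of reading it from TargetLowering, so a
// typical call site forwards the DAG's layout:
//   SmallVector<EVT, 16> VTs;
//   SmallVector<uint64_t, 16> Offsets;
//   ComputeValueVTs(TLI, DAG.getDataLayout(), Ty, VTs, &Offsets,
//                   /*StartingOffset=*/0);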
diff --git a/lib/Target/NVPTX/NVPTXISelLowering.h b/lib/Target/NVPTX/NVPTXISelLowering.h
index ed94775b3002..e5c37321a33b 100644
--- a/lib/Target/NVPTX/NVPTXISelLowering.h
+++ b/lib/Target/NVPTX/NVPTXISelLowering.h
@@ -456,24 +456,23 @@ public:
/// Used to guide target specific optimizations, like loop strength
/// reduction (LoopStrengthReduce.cpp) and memory optimization for
/// address mode (CodeGenPrepare.cpp)
- bool isLegalAddressingMode(const AddrMode &AM, Type *Ty,
+ bool isLegalAddressingMode(const DataLayout &DL, const AddrMode &AM, Type *Ty,
unsigned AS) const override;
/// getFunctionAlignment - Return the Log2 alignment of this function.
unsigned getFunctionAlignment(const Function *F) const;
- EVT getSetCCResultType(LLVMContext &Ctx, EVT VT) const override {
+ EVT getSetCCResultType(const DataLayout &DL, LLVMContext &Ctx,
+ EVT VT) const override {
if (VT.isVector())
return EVT::getVectorVT(Ctx, MVT::i1, VT.getVectorNumElements());
return MVT::i1;
}
- ConstraintType
- getConstraintType(const std::string &Constraint) const override;
+ ConstraintType getConstraintType(StringRef Constraint) const override;
std::pair<unsigned, const TargetRegisterClass *>
getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI,
- const std::string &Constraint,
- MVT VT) const override;
+ StringRef Constraint, MVT VT) const override;
SDValue LowerFormalArguments(
SDValue Chain, CallingConv::ID CallConv, bool isVarArg,
@@ -483,7 +482,7 @@ public:
SDValue LowerCall(CallLoweringInfo &CLI,
SmallVectorImpl<SDValue> &InVals) const override;
- std::string getPrototype(Type *, const ArgListTy &,
+ std::string getPrototype(const DataLayout &DL, Type *, const ArgListTy &,
const SmallVectorImpl<ISD::OutputArg> &,
unsigned retAlignment,
const ImmutableCallSite *CS) const;
@@ -501,7 +500,9 @@ public:
const NVPTXTargetMachine *nvTM;
// PTX always uses 32-bit shift amounts
- MVT getScalarShiftAmountTy(EVT LHSTy) const override { return MVT::i32; }
+ MVT getScalarShiftAmountTy(const DataLayout &, EVT) const override {
+ return MVT::i32;
+ }
TargetLoweringBase::LegalizeTypeAction
getPreferredVectorAction(EVT VT) const override;
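// Illustrative sketch, not part of the patch: the inline-asm constraint hooks
// now take StringRef, so overrides can use StringRef's cheap queries (size,
// startswith, operator==) directly, with no temporary std::string.
// ExampleTargetLowering is a placeholder class derived from TargetLowering.
TargetLowering::ConstraintType
ExampleTargetLowering::getConstraintType(StringRef Constraint) const {
  if (Constraint.size() == 1 && Constraint[0] == 'r')
    return C_RegisterClass;
  if (Constraint == "ZC") // multi-letter memory constraint
    return C_Memory;
  return TargetLowering::getConstraintType(Constraint);
}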
diff --git a/lib/Target/NVPTX/NVPTXLowerAggrCopies.cpp b/lib/Target/NVPTX/NVPTXLowerAggrCopies.cpp
index 6ab0fadf9a35..0bf72febc4a0 100644
--- a/lib/Target/NVPTX/NVPTXLowerAggrCopies.cpp
+++ b/lib/Target/NVPTX/NVPTXLowerAggrCopies.cpp
@@ -57,7 +57,6 @@ char NVPTXLowerAggrCopies::ID = 0;
// Lower MemTransferInst or load-store pair to loop
static void convertTransferToLoop(
Instruction *splitAt, Value *srcAddr, Value *dstAddr, Value *len,
- //unsigned numLoads,
bool srcVolatile, bool dstVolatile, LLVMContext &Context, Function &F) {
Type *indType = len->getType();
@@ -84,6 +83,8 @@ static void convertTransferToLoop(
ind->addIncoming(ConstantInt::get(indType, 0), origBB);
// load from srcAddr+ind
+ // TODO: we can leverage the align parameter of llvm.memcpy for more efficient
+ // word-sized loads and stores.
Value *val = loop.CreateLoad(loop.CreateGEP(loop.getInt8Ty(), srcAddr, ind),
srcVolatile);
// store at dstAddr+ind
@@ -137,13 +138,10 @@ bool NVPTXLowerAggrCopies::runOnFunction(Function &F) {
//
// Collect all the aggrLoads, aggrMemcpys and aggrMemsets.
//
- //const BasicBlock *firstBB = &F.front(); // first BB in F
for (Function::iterator BI = F.begin(), BE = F.end(); BI != BE; ++BI) {
- //BasicBlock *bb = BI;
for (BasicBlock::iterator II = BI->begin(), IE = BI->end(); II != IE;
++II) {
if (LoadInst *load = dyn_cast<LoadInst>(II)) {
-
if (!load->hasOneUse())
continue;
@@ -152,7 +150,7 @@ bool NVPTXLowerAggrCopies::runOnFunction(Function &F) {
User *use = load->user_back();
if (StoreInst *store = dyn_cast<StoreInst>(use)) {
- if (store->getOperand(0) != load) //getValueOperand
+ if (store->getOperand(0) != load)
continue;
aggrLoads.push_back(load);
}
@@ -188,8 +186,7 @@ bool NVPTXLowerAggrCopies::runOnFunction(Function &F) {
//
// Do the transformation of an aggr load/copy/set to a loop
//
- for (unsigned i = 0, e = aggrLoads.size(); i != e; ++i) {
- LoadInst *load = aggrLoads[i];
+ for (LoadInst *load : aggrLoads) {
StoreInst *store = dyn_cast<StoreInst>(*load->user_begin());
Value *srcAddr = load->getOperand(0);
Value *dstAddr = store->getOperand(1);
@@ -203,20 +200,19 @@ bool NVPTXLowerAggrCopies::runOnFunction(Function &F) {
load->eraseFromParent();
}
- for (unsigned i = 0, e = aggrMemcpys.size(); i != e; ++i) {
- MemTransferInst *cpy = aggrMemcpys[i];
- Value *len = cpy->getLength();
- // llvm 2.7 version of memcpy does not have volatile
- // operand yet. So always making it non-volatile
- // optimistically, so that we don't see unnecessary
- // st.volatile in ptx
- convertTransferToLoop(cpy, cpy->getSource(), cpy->getDest(), len, false,
- false, Context, F);
+ for (MemTransferInst *cpy : aggrMemcpys) {
+ convertTransferToLoop(/* splitAt */ cpy,
+ /* srcAddr */ cpy->getSource(),
+ /* dstAddr */ cpy->getDest(),
+ /* len */ cpy->getLength(),
+ /* srcVolatile */ cpy->isVolatile(),
+ /* dstVolatile */ cpy->isVolatile(),
+ /* Context */ Context,
+ /* Function F */ F);
cpy->eraseFromParent();
}
- for (unsigned i = 0, e = aggrMemsets.size(); i != e; ++i) {
- MemSetInst *memsetinst = aggrMemsets[i];
+ for (MemSetInst *memsetinst : aggrMemsets) {
Value *len = memsetinst->getLength();
Value *val = memsetinst->getValue();
convertMemSetToLoop(memsetinst, memsetinst->getDest(), len, val, Context,
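What convertTransferToLoop emits is easier to see without the IRBuilder calls: a byte-granular copy loop whose trip count is the memcpy length, with the volatility flags now taken from the intrinsic instead of being forced to false. A standalone sketch of the equivalent semantics in plain C++ (not the pass itself):

    #include <cstddef>

    // One byte per iteration: load from src+ind, store to dst+ind, as in the
    // emitted loop; 'volatile' stands in for the srcVolatile/dstVolatile flags.
    void byteCopyLoop(volatile unsigned char *dst,
                      const volatile unsigned char *src, std::size_t len) {
      for (std::size_t ind = 0; ind != len; ++ind)
        dst[ind] = src[ind];
    }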
diff --git a/lib/Target/NVPTX/NVPTXReplaceImageHandles.cpp b/lib/Target/NVPTX/NVPTXReplaceImageHandles.cpp
index e83f735a551e..5a83371b07f1 100644
--- a/lib/Target/NVPTX/NVPTXReplaceImageHandles.cpp
+++ b/lib/Target/NVPTX/NVPTXReplaceImageHandles.cpp
@@ -2,7 +2,7 @@
//
// The LLVM Compiler Infrastructure
//
-// This file is distributed under the University of Illinois Open Source
+// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
@@ -115,7 +115,7 @@ bool NVPTXReplaceImageHandles::processInstr(MachineInstr &MI) {
replaceImageHandle(Handle, MF);
- return true;
+ return true;
}
return false;
diff --git a/lib/Target/NVPTX/NVPTXSubtarget.cpp b/lib/Target/NVPTX/NVPTXSubtarget.cpp
index 71645dca69c5..bd2509a3c8c9 100644
--- a/lib/Target/NVPTX/NVPTXSubtarget.cpp
+++ b/lib/Target/NVPTX/NVPTXSubtarget.cpp
@@ -48,7 +48,7 @@ NVPTXSubtarget::NVPTXSubtarget(const Triple &TT, const std::string &CPU,
const NVPTXTargetMachine &TM)
: NVPTXGenSubtargetInfo(TT, CPU, FS), PTXVersion(0), SmVersion(20), TM(TM),
InstrInfo(), TLInfo(TM, initializeSubtargetDependencies(CPU, FS)),
- TSInfo(TM.getDataLayout()), FrameLowering() {}
+ FrameLowering() {}
bool NVPTXSubtarget::hasImageHandles() const {
// Enable handles for Kepler+, where CUDA supports indirect surfaces and
diff --git a/lib/Target/NVPTX/NVPTXTargetMachine.cpp b/lib/Target/NVPTX/NVPTXTargetMachine.cpp
index 9d9072efc382..248f9e117d83 100644
--- a/lib/Target/NVPTX/NVPTXTargetMachine.cpp
+++ b/lib/Target/NVPTX/NVPTXTargetMachine.cpp
@@ -148,8 +148,9 @@ TargetPassConfig *NVPTXTargetMachine::createPassConfig(PassManagerBase &PM) {
}
TargetIRAnalysis NVPTXTargetMachine::getTargetIRAnalysis() {
- return TargetIRAnalysis(
- [this](Function &) { return TargetTransformInfo(NVPTXTTIImpl(this)); });
+ return TargetIRAnalysis([this](Function &F) {
+ return TargetTransformInfo(NVPTXTTIImpl(this, F));
+ });
}
void NVPTXPassConfig::addIRPasses() {
diff --git a/lib/Target/NVPTX/NVPTXTargetTransformInfo.cpp b/lib/Target/NVPTX/NVPTXTargetTransformInfo.cpp
index dc81802f4b5a..e7250cdba5ac 100644
--- a/lib/Target/NVPTX/NVPTXTargetTransformInfo.cpp
+++ b/lib/Target/NVPTX/NVPTXTargetTransformInfo.cpp
@@ -94,7 +94,7 @@ unsigned NVPTXTTIImpl::getArithmeticInstrCost(
TTI::OperandValueKind Opd2Info, TTI::OperandValueProperties Opd1PropInfo,
TTI::OperandValueProperties Opd2PropInfo) {
// Legalize the type.
- std::pair<unsigned, MVT> LT = TLI->getTypeLegalizationCost(Ty);
+ std::pair<unsigned, MVT> LT = TLI->getTypeLegalizationCost(DL, Ty);
int ISD = TLI->InstructionOpcodeToISD(Opcode);
@@ -117,3 +117,15 @@ unsigned NVPTXTTIImpl::getArithmeticInstrCost(
Opd1PropInfo, Opd2PropInfo);
}
}
+
+void NVPTXTTIImpl::getUnrollingPreferences(Loop *L,
+ TTI::UnrollingPreferences &UP) {
+ BaseT::getUnrollingPreferences(L, UP);
+
+ // Enable partial unrolling and runtime unrolling, but reduce the
+ // threshold. This partially unrolls small loops, which the PTX-to-SASS
+ // compiler often unrolls anyway; doing it earlier in LLVM can be
+ // beneficial.
+ UP.Partial = UP.Runtime = true;
+ UP.PartialThreshold = UP.Threshold / 4;
+}
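To put the threshold reduction in concrete terms: with the generic unroller defaults of this era (a base threshold around 150, which is an assumption here since the real value comes from BaseT::getUnrollingPreferences), the partial-unroll budget drops to roughly a quarter of that, so only fairly small loop bodies get unrolled before ptxas sees them. A trivially runnable check:

    #include <iostream>

    int main() {
      unsigned Threshold = 150;                  // assumed generic default
      unsigned PartialThreshold = Threshold / 4; // as set in the hunk above
      std::cout << "PartialThreshold = " << PartialThreshold << "\n"; // 37
      return 0;
    }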
diff --git a/lib/Target/NVPTX/NVPTXTargetTransformInfo.h b/lib/Target/NVPTX/NVPTXTargetTransformInfo.h
index 4280888988f9..5bcd1e27a558 100644
--- a/lib/Target/NVPTX/NVPTXTargetTransformInfo.h
+++ b/lib/Target/NVPTX/NVPTXTargetTransformInfo.h
@@ -37,8 +37,9 @@ class NVPTXTTIImpl : public BasicTTIImplBase<NVPTXTTIImpl> {
const NVPTXTargetLowering *getTLI() const { return TLI; };
public:
- explicit NVPTXTTIImpl(const NVPTXTargetMachine *TM)
- : BaseT(TM), ST(TM->getSubtargetImpl()), TLI(ST->getTargetLowering()) {}
+ explicit NVPTXTTIImpl(const NVPTXTargetMachine *TM, const Function &F)
+ : BaseT(TM, F.getParent()->getDataLayout()), ST(TM->getSubtargetImpl()),
+ TLI(ST->getTargetLowering()) {}
// Provide value semantics. MSVC requires that we spell all of these out.
NVPTXTTIImpl(const NVPTXTTIImpl &Arg)
@@ -46,18 +47,6 @@ public:
NVPTXTTIImpl(NVPTXTTIImpl &&Arg)
: BaseT(std::move(static_cast<BaseT &>(Arg))), ST(std::move(Arg.ST)),
TLI(std::move(Arg.TLI)) {}
- NVPTXTTIImpl &operator=(const NVPTXTTIImpl &RHS) {
- BaseT::operator=(static_cast<const BaseT &>(RHS));
- ST = RHS.ST;
- TLI = RHS.TLI;
- return *this;
- }
- NVPTXTTIImpl &operator=(NVPTXTTIImpl &&RHS) {
- BaseT::operator=(std::move(static_cast<BaseT &>(RHS)));
- ST = std::move(RHS.ST);
- TLI = std::move(RHS.TLI);
- return *this;
- }
bool hasBranchDivergence() { return true; }
@@ -69,6 +58,8 @@ public:
TTI::OperandValueKind Opd2Info = TTI::OK_AnyValue,
TTI::OperandValueProperties Opd1PropInfo = TTI::OP_None,
TTI::OperandValueProperties Opd2PropInfo = TTI::OP_None);
+
+ void getUnrollingPreferences(Loop *L, TTI::UnrollingPreferences &UP);
};
} // end namespace llvm
diff --git a/lib/Target/PowerPC/CMakeLists.txt b/lib/Target/PowerPC/CMakeLists.txt
index fe168a547597..c0c83cc258b8 100644
--- a/lib/Target/PowerPC/CMakeLists.txt
+++ b/lib/Target/PowerPC/CMakeLists.txt
@@ -33,7 +33,6 @@ add_llvm_target(PowerPCCodeGen
PPCTargetObjectFile.cpp
PPCTargetTransformInfo.cpp
PPCTOCRegDeps.cpp
- PPCSelectionDAGInfo.cpp
PPCTLSDynamicCall.cpp
PPCVSXCopy.cpp
PPCVSXFMAMutate.cpp
diff --git a/lib/Target/PowerPC/MCTargetDesc/PPCMCTargetDesc.cpp b/lib/Target/PowerPC/MCTargetDesc/PPCMCTargetDesc.cpp
index 5c38fe173d96..30f232a9a91e 100644
--- a/lib/Target/PowerPC/MCTargetDesc/PPCMCTargetDesc.cpp
+++ b/lib/Target/PowerPC/MCTargetDesc/PPCMCTargetDesc.cpp
@@ -51,10 +51,9 @@ static MCInstrInfo *createPPCMCInstrInfo() {
return X;
}
-static MCRegisterInfo *createPPCMCRegisterInfo(StringRef TT) {
- Triple TheTriple(TT);
- bool isPPC64 = (TheTriple.getArch() == Triple::ppc64 ||
- TheTriple.getArch() == Triple::ppc64le);
+static MCRegisterInfo *createPPCMCRegisterInfo(const Triple &TT) {
+ bool isPPC64 =
+ (TT.getArch() == Triple::ppc64 || TT.getArch() == Triple::ppc64le);
unsigned Flavour = isPPC64 ? 0 : 1;
unsigned RA = isPPC64 ? PPC::LR8 : PPC::LR;
@@ -65,9 +64,7 @@ static MCRegisterInfo *createPPCMCRegisterInfo(StringRef TT) {
static MCSubtargetInfo *createPPCMCSubtargetInfo(const Triple &TT,
StringRef CPU, StringRef FS) {
- MCSubtargetInfo *X = new MCSubtargetInfo();
- InitPPCMCSubtargetInfo(X, TT, CPU, FS);
- return X;
+ return createPPCMCSubtargetInfoImpl(TT, CPU, FS);
}
static MCAsmInfo *createPPCMCAsmInfo(const MCRegisterInfo &MRI,
@@ -90,22 +87,20 @@ static MCAsmInfo *createPPCMCAsmInfo(const MCRegisterInfo &MRI,
return MAI;
}
-static MCCodeGenInfo *createPPCMCCodeGenInfo(StringRef TT, Reloc::Model RM,
+static MCCodeGenInfo *createPPCMCCodeGenInfo(const Triple &TT, Reloc::Model RM,
CodeModel::Model CM,
CodeGenOpt::Level OL) {
MCCodeGenInfo *X = new MCCodeGenInfo();
if (RM == Reloc::Default) {
- Triple T(TT);
- if (T.isOSDarwin())
+ if (TT.isOSDarwin())
RM = Reloc::DynamicNoPIC;
else
RM = Reloc::Static;
}
if (CM == CodeModel::Default) {
- Triple T(TT);
- if (!T.isOSDarwin() &&
- (T.getArch() == Triple::ppc64 || T.getArch() == Triple::ppc64le))
+ if (!TT.isOSDarwin() &&
+ (TT.getArch() == Triple::ppc64 || TT.getArch() == Triple::ppc64le))
CM = CodeModel::Medium;
}
X->initMCCodeGenInfo(RM, CM, OL);
@@ -231,7 +226,7 @@ static MCTargetStreamer *createAsmTargetStreamer(MCStreamer &S,
static MCTargetStreamer *
createObjectTargetStreamer(MCStreamer &S, const MCSubtargetInfo &STI) {
const Triple &TT = STI.getTargetTriple();
- if (TT.getObjectFormat() == Triple::ELF)
+ if (TT.isOSBinFormatELF())
return new PPCTargetELFStreamer(S);
return new PPCTargetMachOStreamer(S);
}
diff --git a/lib/Target/PowerPC/PPCAsmPrinter.cpp b/lib/Target/PowerPC/PPCAsmPrinter.cpp
index 87a5236e711f..199a0debf88b 100644
--- a/lib/Target/PowerPC/PPCAsmPrinter.cpp
+++ b/lib/Target/PowerPC/PPCAsmPrinter.cpp
@@ -197,7 +197,7 @@ void PPCAsmPrinter::printOperand(const MachineInstr *MI, unsigned OpNo,
// External or weakly linked global variables need non-lazily-resolved stubs
if (TM.getRelocationModel() != Reloc::Static &&
- (GV->isDeclaration() || GV->isWeakForLinker())) {
+ !GV->isStrongDefinitionForLinker()) {
if (!GV->hasHiddenVisibility()) {
SymToPrint = getSymbolWithGlobalValueBase(GV, "$non_lazy_ptr");
MachineModuleInfoImpl::StubValueTy &StubSym =
@@ -369,28 +369,70 @@ void PPCAsmPrinter::LowerPATCHPOINT(MCStreamer &OutStreamer, StackMaps &SM,
assert((CallTarget & 0xFFFFFFFFFFFF) == CallTarget &&
"High 16 bits of call target should be zero.");
unsigned ScratchReg = MI.getOperand(Opers.getNextScratchIdx()).getReg();
- EncodedBytes = 6*4;
+ EncodedBytes = 0;
// Materialize the jump address:
EmitToStreamer(OutStreamer, MCInstBuilder(PPC::LI8)
.addReg(ScratchReg)
.addImm((CallTarget >> 32) & 0xFFFF));
+ ++EncodedBytes;
EmitToStreamer(OutStreamer, MCInstBuilder(PPC::RLDIC)
.addReg(ScratchReg)
.addReg(ScratchReg)
.addImm(32).addImm(16));
+ ++EncodedBytes;
EmitToStreamer(OutStreamer, MCInstBuilder(PPC::ORIS8)
.addReg(ScratchReg)
.addReg(ScratchReg)
.addImm((CallTarget >> 16) & 0xFFFF));
+ ++EncodedBytes;
EmitToStreamer(OutStreamer, MCInstBuilder(PPC::ORI8)
.addReg(ScratchReg)
.addReg(ScratchReg)
.addImm(CallTarget & 0xFFFF));
+ // Save the current TOC pointer before the remote call.
+ int TOCSaveOffset = Subtarget->isELFv2ABI() ? 24 : 40;
+ EmitToStreamer(OutStreamer, MCInstBuilder(PPC::STD)
+ .addReg(PPC::X2)
+ .addImm(TOCSaveOffset)
+ .addReg(PPC::X1));
+ ++EncodedBytes;
+
+
+ // If we're on ELFv1, then we need to load the actual function pointer from
+ // the function descriptor.
+ if (!Subtarget->isELFv2ABI()) {
+ // Load the new TOC pointer and the function address, but not r11
+ // (needing this is rare, and loading it here would prevent passing it
+ // via a 'nest' parameter).
+ EmitToStreamer(OutStreamer, MCInstBuilder(PPC::LD)
+ .addReg(PPC::X2)
+ .addImm(8)
+ .addReg(ScratchReg));
+ ++EncodedBytes;
+ EmitToStreamer(OutStreamer, MCInstBuilder(PPC::LD)
+ .addReg(ScratchReg)
+ .addImm(0)
+ .addReg(ScratchReg));
+ ++EncodedBytes;
+ }
+
EmitToStreamer(OutStreamer, MCInstBuilder(PPC::MTCTR8).addReg(ScratchReg));
+ ++EncodedBytes;
EmitToStreamer(OutStreamer, MCInstBuilder(PPC::BCTRL8));
+ ++EncodedBytes;
+
+ // Restore the TOC pointer after the call.
+ EmitToStreamer(OutStreamer, MCInstBuilder(PPC::LD)
+ .addReg(PPC::X2)
+ .addImm(TOCSaveOffset)
+ .addReg(PPC::X1));
+ ++EncodedBytes;
}
+ // Each instruction is 4 bytes.
+ EncodedBytes *= 4;
+
// Emit padding.
unsigned NumBytes = Opers.getMetaOper(PatchPointOpers::NBytesPos).getImm();
assert(NumBytes >= EncodedBytes &&
@@ -624,7 +666,7 @@ void PPCAsmPrinter::EmitInstruction(const MachineInstr *MI) {
IsExternal = GV->isDeclaration();
IsCommon = GV->hasCommonLinkage();
IsNonLocalFunction = GV->getType()->getElementType()->isFunctionTy() &&
- (GV->isDeclaration() || GV->isWeakForLinker());
+ !GV->isStrongDefinitionForLinker();
IsAvailExt = GV->hasAvailableExternallyLinkage();
} else if (MO.isCPI())
MOSymbol = GetCPISymbol(MO.getIndex());
@@ -706,7 +748,7 @@ void PPCAsmPrinter::EmitInstruction(const MachineInstr *MI) {
MOSymbol = getSymbol(GV);
IsExternal = GV->isDeclaration();
IsNonLocalFunction = GV->getType()->getElementType()->isFunctionTy() &&
- (GV->isDeclaration() || GV->isWeakForLinker());
+ !GV->isStrongDefinitionForLinker();
} else if (MO.isCPI())
MOSymbol = GetCPISymbol(MO.getIndex());
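A rough byte budget for the patchpoint call sequence introduced earlier in this file, counting 4 bytes per PPC instruction: materializing the 48-bit target takes LI8/RLDIC/ORIS8/ORI8, followed by a TOC save, mtctr/bctrl, and a TOC restore; ELFv1 adds two loads through the function descriptor. The totals below are an estimate derived from the hunk, not values printed by the compiler:

    #include <iostream>

    int main() {
      unsigned Materialize = 4;   // LI8, RLDIC, ORIS8, ORI8
      unsigned TocSave = 1;       // STD of X2 at the TOC save offset
      unsigned MoveAndCall = 2;   // MTCTR8 + BCTRL8
      unsigned TocRestore = 1;    // LD of X2 after the call
      unsigned DescriptorLoads = 2; // ELFv1 only: new TOC + entry point
      unsigned ELFv2 = Materialize + TocSave + MoveAndCall + TocRestore;
      unsigned ELFv1 = ELFv2 + DescriptorLoads;
      std::cout << "ELFv2: " << ELFv2 * 4 << " bytes, "
                << "ELFv1: " << ELFv1 * 4 << " bytes\n"; // 32 and 40
      return 0;
    }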
diff --git a/lib/Target/PowerPC/PPCCTRLoops.cpp b/lib/Target/PowerPC/PPCCTRLoops.cpp
index 416131745806..baadf081a64c 100644
--- a/lib/Target/PowerPC/PPCCTRLoops.cpp
+++ b/lib/Target/PowerPC/PPCCTRLoops.cpp
@@ -351,8 +351,9 @@ bool PPCCTRLoops::mightUseCTR(const Triple &TT, BasicBlock *BB) {
Opcode = ISD::FTRUNC; break;
}
- MVT VTy =
- TLI->getSimpleValueType(CI->getArgOperand(0)->getType(), true);
+ auto &DL = CI->getModule()->getDataLayout();
+ MVT VTy = TLI->getSimpleValueType(DL, CI->getArgOperand(0)->getType(),
+ true);
if (VTy == MVT::Other)
return true;
diff --git a/lib/Target/PowerPC/PPCCallingConv.td b/lib/Target/PowerPC/PPCCallingConv.td
index 874a6fce0656..5bc9124f8085 100644
--- a/lib/Target/PowerPC/PPCCallingConv.td
+++ b/lib/Target/PowerPC/PPCCallingConv.td
@@ -133,6 +133,9 @@ def CC_PPC32_SVR4_Common : CallingConv<[
// register having an odd register number.
CCIfType<[i32], CCIfSplit<CCCustom<"CC_PPC32_SVR4_Custom_AlignArgRegs">>>,
+ // The 'nest' parameter, if any, is passed in R11.
+ CCIfNest<CCAssignToReg<[R11]>>,
+
// The first 8 integer arguments are passed in integer registers.
CCIfType<[i32], CCAssignToReg<[R3, R4, R5, R6, R7, R8, R9, R10]>>,
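CCIfNest short-circuits the normal argument queue: whichever argument carries the 'nest' (static chain) flag is pinned to R11, regardless of how many integer argument registers remain. Rendered as a hand-written calling-convention hook purely for illustration (CC_PPC32_AssignNest is a hypothetical name; the real rule is generated from the TableGen entry above):

    // Hypothetical custom CC function with the standard CCCustom signature.
    static bool CC_PPC32_AssignNest(unsigned ValNo, MVT ValVT, MVT LocVT,
                                    CCValAssign::LocInfo LocInfo,
                                    ISD::ArgFlagsTy ArgFlags, CCState &State) {
      if (!ArgFlags.isNest())
        return false;             // not a 'nest' argument; try the next rule
      State.addLoc(CCValAssign::getReg(ValNo, ValVT, PPC::R11, LocVT, LocInfo));
      return true;                // handled: the static chain lives in R11
    }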
diff --git a/lib/Target/PowerPC/PPCFastISel.cpp b/lib/Target/PowerPC/PPCFastISel.cpp
index fafcd76f9d18..5f236f744fc4 100644
--- a/lib/Target/PowerPC/PPCFastISel.cpp
+++ b/lib/Target/PowerPC/PPCFastISel.cpp
@@ -262,7 +262,7 @@ static Optional<PPC::Predicate> getComparePred(CmpInst::Predicate Pred) {
// fast-isel, and return its equivalent machine type in VT.
// FIXME: Copied directly from ARM -- factor into base class?
bool PPCFastISel::isTypeLegal(Type *Ty, MVT &VT) {
- EVT Evt = TLI.getValueType(Ty, true);
+ EVT Evt = TLI.getValueType(DL, Ty, true);
// Only handle simple types.
if (Evt == MVT::Other || !Evt.isSimple()) return false;
@@ -324,12 +324,13 @@ bool PPCFastISel::PPCComputeAddress(const Value *Obj, Address &Addr) {
return PPCComputeAddress(U->getOperand(0), Addr);
case Instruction::IntToPtr:
// Look past no-op inttoptrs.
- if (TLI.getValueType(U->getOperand(0)->getType()) == TLI.getPointerTy())
+ if (TLI.getValueType(DL, U->getOperand(0)->getType()) ==
+ TLI.getPointerTy(DL))
return PPCComputeAddress(U->getOperand(0), Addr);
break;
case Instruction::PtrToInt:
// Look past no-op ptrtoints.
- if (TLI.getValueType(U->getType()) == TLI.getPointerTy())
+ if (TLI.getValueType(DL, U->getType()) == TLI.getPointerTy(DL))
return PPCComputeAddress(U->getOperand(0), Addr);
break;
case Instruction::GetElementPtr: {
@@ -799,7 +800,7 @@ bool PPCFastISel::SelectBranch(const Instruction *I) {
bool PPCFastISel::PPCEmitCmp(const Value *SrcValue1, const Value *SrcValue2,
bool IsZExt, unsigned DestReg) {
Type *Ty = SrcValue1->getType();
- EVT SrcEVT = TLI.getValueType(Ty, true);
+ EVT SrcEVT = TLI.getValueType(DL, Ty, true);
if (!SrcEVT.isSimple())
return false;
MVT SrcVT = SrcEVT.getSimpleVT();
@@ -893,8 +894,8 @@ bool PPCFastISel::PPCEmitCmp(const Value *SrcValue1, const Value *SrcValue2,
// Attempt to fast-select a floating-point extend instruction.
bool PPCFastISel::SelectFPExt(const Instruction *I) {
Value *Src = I->getOperand(0);
- EVT SrcVT = TLI.getValueType(Src->getType(), true);
- EVT DestVT = TLI.getValueType(I->getType(), true);
+ EVT SrcVT = TLI.getValueType(DL, Src->getType(), true);
+ EVT DestVT = TLI.getValueType(DL, I->getType(), true);
if (SrcVT != MVT::f32 || DestVT != MVT::f64)
return false;
@@ -911,8 +912,8 @@ bool PPCFastISel::SelectFPExt(const Instruction *I) {
// Attempt to fast-select a floating-point truncate instruction.
bool PPCFastISel::SelectFPTrunc(const Instruction *I) {
Value *Src = I->getOperand(0);
- EVT SrcVT = TLI.getValueType(Src->getType(), true);
- EVT DestVT = TLI.getValueType(I->getType(), true);
+ EVT SrcVT = TLI.getValueType(DL, Src->getType(), true);
+ EVT DestVT = TLI.getValueType(DL, I->getType(), true);
if (SrcVT != MVT::f64 || DestVT != MVT::f32)
return false;
@@ -992,7 +993,7 @@ bool PPCFastISel::SelectIToFP(const Instruction *I, bool IsSigned) {
return false;
Value *Src = I->getOperand(0);
- EVT SrcEVT = TLI.getValueType(Src->getType(), true);
+ EVT SrcEVT = TLI.getValueType(DL, Src->getType(), true);
if (!SrcEVT.isSimple())
return false;
@@ -1157,7 +1158,7 @@ bool PPCFastISel::SelectFPToI(const Instruction *I, bool IsSigned) {
// Attempt to fast-select a binary integer operation that isn't already
// handled automatically.
bool PPCFastISel::SelectBinaryIntOp(const Instruction *I, unsigned ISDOpcode) {
- EVT DestVT = TLI.getValueType(I->getType(), true);
+ EVT DestVT = TLI.getValueType(DL, I->getType(), true);
// We can get here in the case when we have a binary operation on a non-legal
// type and the target independent selector doesn't know how to handle it.
@@ -1594,7 +1595,7 @@ bool PPCFastISel::SelectRet(const Instruction *I) {
if (Ret->getNumOperands() > 0) {
SmallVector<ISD::OutputArg, 4> Outs;
- GetReturnInfo(F.getReturnType(), F.getAttributes(), Outs, TLI);
+ GetReturnInfo(F.getReturnType(), F.getAttributes(), Outs, TLI, DL);
// Analyze operands of the call, assigning locations to each operand.
SmallVector<CCValAssign, 16> ValLocs;
@@ -1641,7 +1642,7 @@ bool PPCFastISel::SelectRet(const Instruction *I) {
RetRegs.push_back(VA.getLocReg());
unsigned SrcReg = Reg + VA.getValNo();
- EVT RVEVT = TLI.getValueType(RV->getType());
+ EVT RVEVT = TLI.getValueType(DL, RV->getType());
if (!RVEVT.isSimple())
return false;
MVT RVVT = RVEVT.getSimpleVT();
@@ -1769,8 +1770,8 @@ bool PPCFastISel::SelectIndirectBr(const Instruction *I) {
// Attempt to fast-select an integer truncate instruction.
bool PPCFastISel::SelectTrunc(const Instruction *I) {
Value *Src = I->getOperand(0);
- EVT SrcVT = TLI.getValueType(Src->getType(), true);
- EVT DestVT = TLI.getValueType(I->getType(), true);
+ EVT SrcVT = TLI.getValueType(DL, Src->getType(), true);
+ EVT DestVT = TLI.getValueType(DL, I->getType(), true);
if (SrcVT != MVT::i64 && SrcVT != MVT::i32 && SrcVT != MVT::i16)
return false;
@@ -1806,8 +1807,8 @@ bool PPCFastISel::SelectIntExt(const Instruction *I) {
if (!SrcReg) return false;
EVT SrcEVT, DestEVT;
- SrcEVT = TLI.getValueType(SrcTy, true);
- DestEVT = TLI.getValueType(DestTy, true);
+ SrcEVT = TLI.getValueType(DL, SrcTy, true);
+ DestEVT = TLI.getValueType(DL, DestTy, true);
if (!SrcEVT.isSimple())
return false;
if (!DestEVT.isSimple())
@@ -1979,7 +1980,7 @@ unsigned PPCFastISel::PPCMaterializeGV(const GlobalValue *GV, MVT VT) {
// on the "if" path here.
if (CModel == CodeModel::Large ||
(GV->getType()->getElementType()->isFunctionTy() &&
- (GV->isDeclaration() || GV->isWeakForLinker())) ||
+ !GV->isStrongDefinitionForLinker()) ||
GV->isDeclaration() || GV->hasCommonLinkage() ||
GV->hasAvailableExternallyLinkage())
BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(PPC::LDtocL),
@@ -2127,7 +2128,7 @@ unsigned PPCFastISel::PPCMaterializeInt(const Constant *C, MVT VT,
// Materialize a constant into a register, and return the register
// number (or zero if we failed to handle it).
unsigned PPCFastISel::fastMaterializeConstant(const Constant *C) {
- EVT CEVT = TLI.getValueType(C->getType(), true);
+ EVT CEVT = TLI.getValueType(DL, C->getType(), true);
// Only handle simple types.
if (!CEVT.isSimple()) return 0;
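The FastISel changes are all one mechanical substitution: the class already owns a DataLayout reference (DL), so type queries pass it explicitly instead of letting TargetLowering reach back into the TargetMachine. A compressed sketch of the shape, with an invented helper name:

    // Before: TLI.getValueType(Ty, true) and TLI.getPointerTy()
    // After:  both calls take the DataLayout that FastISel already holds.
    static bool isNoOpPtrCast(const User *U, const TargetLowering &TLI,
                              const DataLayout &DL) {
      return TLI.getValueType(DL, U->getOperand(0)->getType()) ==
             TLI.getPointerTy(DL);
    }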
diff --git a/lib/Target/PowerPC/PPCFrameLowering.cpp b/lib/Target/PowerPC/PPCFrameLowering.cpp
index b4008e4a886a..87229d80d9c1 100644
--- a/lib/Target/PowerPC/PPCFrameLowering.cpp
+++ b/lib/Target/PowerPC/PPCFrameLowering.cpp
@@ -306,9 +306,10 @@ static void HandleVRSaveUpdate(MachineInstr *MI, const TargetInstrInfo &TII) {
const TargetRegisterInfo *TRI = MF->getSubtarget().getRegisterInfo();
DebugLoc dl = MI->getDebugLoc();
+ const MachineRegisterInfo &MRI = MF->getRegInfo();
unsigned UsedRegMask = 0;
for (unsigned i = 0; i != 32; ++i)
- if (MF->getRegInfo().isPhysRegUsed(VRRegNo[i]))
+ if (MRI.isPhysRegModified(VRRegNo[i]))
UsedRegMask |= 1 << (31-i);
// Live in and live out values already must be in the mask, so don't bother
@@ -1158,9 +1159,11 @@ void PPCFrameLowering::emitEpilogue(MachineFunction &MF,
}
}
-void
-PPCFrameLowering::processFunctionBeforeCalleeSavedScan(MachineFunction &MF,
- RegScavenger *) const {
+void PPCFrameLowering::determineCalleeSaves(MachineFunction &MF,
+ BitVector &SavedRegs,
+ RegScavenger *RS) const {
+ TargetFrameLowering::determineCalleeSaves(MF, SavedRegs, RS);
+
const PPCRegisterInfo *RegInfo =
static_cast<const PPCRegisterInfo *>(Subtarget.getRegisterInfo());
@@ -1168,8 +1171,7 @@ PPCFrameLowering::processFunctionBeforeCalleeSavedScan(MachineFunction &MF,
PPCFunctionInfo *FI = MF.getInfo<PPCFunctionInfo>();
unsigned LR = RegInfo->getRARegister();
FI->setMustSaveLR(MustSaveLR(MF, LR));
- MachineRegisterInfo &MRI = MF.getRegInfo();
- MRI.setPhysRegUnused(LR);
+ SavedRegs.reset(LR);
// Save R31 if necessary
int FPSI = FI->getFramePointerSaveIndex();
@@ -1214,9 +1216,9 @@ PPCFrameLowering::processFunctionBeforeCalleeSavedScan(MachineFunction &MF,
// For 32-bit SVR4, allocate the nonvolatile CR spill slot iff the
// function uses CR 2, 3, or 4.
if (!isPPC64 && !isDarwinABI &&
- (MRI.isPhysRegUsed(PPC::CR2) ||
- MRI.isPhysRegUsed(PPC::CR3) ||
- MRI.isPhysRegUsed(PPC::CR4))) {
+ (SavedRegs.test(PPC::CR2) ||
+ SavedRegs.test(PPC::CR3) ||
+ SavedRegs.test(PPC::CR4))) {
int FrameIdx = MFI->CreateFixedObject((uint64_t)4, (int64_t)-4, true);
FI->setCRSpillFrameIndex(FrameIdx);
}
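The new hook replaces processFunctionBeforeCalleeSavedScan: targets now receive the callee-saved set as a BitVector, seed it from the base implementation, and edit it directly rather than toggling used/unused flags on MachineRegisterInfo. Reduced to its essentials (the class name is hypothetical, the body mirrors the hunk above):

    void MyPPCFrameLowering::determineCalleeSaves(MachineFunction &MF,
                                                  BitVector &SavedRegs,
                                                  RegScavenger *RS) const {
      TargetFrameLowering::determineCalleeSaves(MF, SavedRegs, RS); // defaults
      unsigned LR = PPC::LR;  // really RegInfo->getRARegister(), as in the hunk
      SavedRegs.reset(LR);    // the return address is saved through its own path
      bool NeedCRSpill = SavedRegs.test(PPC::CR2) || SavedRegs.test(PPC::CR3) ||
                         SavedRegs.test(PPC::CR4);
      (void)NeedCRSpill;      // 32-bit SVR4 would create a fixed CR spill slot
    }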
diff --git a/lib/Target/PowerPC/PPCFrameLowering.h b/lib/Target/PowerPC/PPCFrameLowering.h
index 28d074ecd79d..d6a389bfbf0d 100644
--- a/lib/Target/PowerPC/PPCFrameLowering.h
+++ b/lib/Target/PowerPC/PPCFrameLowering.h
@@ -45,8 +45,8 @@ public:
bool needsFP(const MachineFunction &MF) const;
void replaceFPWithRealFP(MachineFunction &MF) const;
- void processFunctionBeforeCalleeSavedScan(MachineFunction &MF,
- RegScavenger *RS = nullptr) const override;
+ void determineCalleeSaves(MachineFunction &MF, BitVector &SavedRegs,
+ RegScavenger *RS = nullptr) const override;
void processFunctionBeforeFrameFinalized(MachineFunction &MF,
RegScavenger *RS = nullptr) const override;
void addScavengingSpillSlot(MachineFunction &MF, RegScavenger *RS) const;
diff --git a/lib/Target/PowerPC/PPCISelDAGToDAG.cpp b/lib/Target/PowerPC/PPCISelDAGToDAG.cpp
index c85c2610d2f5..01a3acb742e6 100644
--- a/lib/Target/PowerPC/PPCISelDAGToDAG.cpp
+++ b/lib/Target/PowerPC/PPCISelDAGToDAG.cpp
@@ -102,7 +102,8 @@ namespace {
/// getSmallIPtrImm - Return a target constant of pointer type.
inline SDValue getSmallIPtrImm(unsigned Imm, SDLoc dl) {
- return CurDAG->getTargetConstant(Imm, dl, PPCLowering->getPointerTy());
+ return CurDAG->getTargetConstant(
+ Imm, dl, PPCLowering->getPointerTy(CurDAG->getDataLayout()));
}
/// isRotateAndMask - Returns true if Mask and Shift can be folded into a
@@ -313,7 +314,7 @@ SDNode *PPCDAGToDAGISel::getGlobalBaseReg() {
const Module *M = MF->getFunction()->getParent();
DebugLoc dl;
- if (PPCLowering->getPointerTy() == MVT::i32) {
+ if (PPCLowering->getPointerTy(CurDAG->getDataLayout()) == MVT::i32) {
if (PPCSubTarget->isTargetELF()) {
GlobalBaseReg = PPC::R30;
if (M->getPICLevel() == PICLevel::Small) {
@@ -342,7 +343,8 @@ SDNode *PPCDAGToDAGISel::getGlobalBaseReg() {
}
}
return CurDAG->getRegister(GlobalBaseReg,
- PPCLowering->getPointerTy()).getNode();
+ PPCLowering->getPointerTy(CurDAG->getDataLayout()))
+ .getNode();
}
/// isIntS16Immediate - This method tests to see if the node is either a 32-bit
@@ -2205,7 +2207,8 @@ SDNode *PPCDAGToDAGISel::SelectSETCC(SDNode *N) {
SDLoc dl(N);
unsigned Imm;
ISD::CondCode CC = cast<CondCodeSDNode>(N->getOperand(2))->get();
- EVT PtrVT = CurDAG->getTargetLoweringInfo().getPointerTy();
+ EVT PtrVT =
+ CurDAG->getTargetLoweringInfo().getPointerTy(CurDAG->getDataLayout());
bool isPPC64 = (PtrVT == MVT::i64);
if (!PPCSubTarget->useCRBits() &&
@@ -2468,10 +2471,11 @@ SDNode *PPCDAGToDAGISel::Select(SDNode *N) {
SDValue Chain = LD->getChain();
SDValue Base = LD->getBasePtr();
SDValue Ops[] = { Offset, Base, Chain };
- return transferMemOperands(N, CurDAG->getMachineNode(Opcode, dl,
- LD->getValueType(0),
- PPCLowering->getPointerTy(),
- MVT::Other, Ops));
+ return transferMemOperands(
+ N, CurDAG->getMachineNode(
+ Opcode, dl, LD->getValueType(0),
+ PPCLowering->getPointerTy(CurDAG->getDataLayout()), MVT::Other,
+ Ops));
} else {
unsigned Opcode;
bool isSExt = LD->getExtensionType() == ISD::SEXTLOAD;
@@ -2506,10 +2510,11 @@ SDNode *PPCDAGToDAGISel::Select(SDNode *N) {
SDValue Chain = LD->getChain();
SDValue Base = LD->getBasePtr();
SDValue Ops[] = { Base, Offset, Chain };
- return transferMemOperands(N, CurDAG->getMachineNode(Opcode, dl,
- LD->getValueType(0),
- PPCLowering->getPointerTy(),
- MVT::Other, Ops));
+ return transferMemOperands(
+ N, CurDAG->getMachineNode(
+ Opcode, dl, LD->getValueType(0),
+ PPCLowering->getPointerTy(CurDAG->getDataLayout()), MVT::Other,
+ Ops));
}
}
@@ -2662,7 +2667,8 @@ SDNode *PPCDAGToDAGISel::Select(SDNode *N) {
}
case ISD::SELECT_CC: {
ISD::CondCode CC = cast<CondCodeSDNode>(N->getOperand(4))->get();
- EVT PtrVT = CurDAG->getTargetLoweringInfo().getPointerTy();
+ EVT PtrVT =
+ CurDAG->getTargetLoweringInfo().getPointerTy(CurDAG->getDataLayout());
bool isPPC64 = (PtrVT == MVT::i64);
// If this is a select of i1 operands, we'll pattern match it.
@@ -2901,7 +2907,7 @@ SDNode *PPCDAGToDAGISel::Select(SDNode *N) {
if (GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(GA)) {
const GlobalValue *GValue = G->getGlobal();
if ((GValue->getType()->getElementType()->isFunctionTy() &&
- (GValue->isDeclaration() || GValue->isWeakForLinker())) ||
+ !GValue->isStrongDefinitionForLinker()) ||
GValue->isDeclaration() || GValue->hasCommonLinkage() ||
GValue->hasAvailableExternallyLinkage())
return transferMemOperands(N, CurDAG->getMachineNode(PPC::LDtocL, dl,
@@ -2915,7 +2921,9 @@ SDNode *PPCDAGToDAGISel::Select(SDNode *N) {
// Generate a PIC-safe GOT reference.
assert(!PPCSubTarget->isPPC64() && PPCSubTarget->isSVR4ABI() &&
"PPCISD::PPC32_PICGOT is only supported for 32-bit SVR4");
- return CurDAG->SelectNodeTo(N, PPC::PPC32PICGOT, PPCLowering->getPointerTy(), MVT::i32);
+ return CurDAG->SelectNodeTo(
+ N, PPC::PPC32PICGOT, PPCLowering->getPointerTy(CurDAG->getDataLayout()),
+ MVT::i32);
}
case PPCISD::VADD_SPLAT: {
// This expands into one of three sequences, depending on whether
@@ -3398,9 +3406,8 @@ void PPCDAGToDAGISel::PeepholeCROps() {
bool IsModified;
do {
IsModified = false;
- for (SelectionDAG::allnodes_iterator I = CurDAG->allnodes_begin(),
- E = CurDAG->allnodes_end(); I != E; ++I) {
- MachineSDNode *MachineNode = dyn_cast<MachineSDNode>(I);
+ for (SDNode &Node : CurDAG->allnodes()) {
+ MachineSDNode *MachineNode = dyn_cast<MachineSDNode>(&Node);
if (!MachineNode || MachineNode->use_empty())
continue;
SDNode *ResNode = MachineNode;
diff --git a/lib/Target/PowerPC/PPCISelLowering.cpp b/lib/Target/PowerPC/PPCISelLowering.cpp
index 594472bbb47b..0ed9b051ffed 100644
--- a/lib/Target/PowerPC/PPCISelLowering.cpp
+++ b/lib/Target/PowerPC/PPCISelLowering.cpp
@@ -952,7 +952,8 @@ static void getMaxByValAlign(Type *Ty, unsigned &MaxAlign,
/// getByValTypeAlignment - Return the desired alignment for ByVal aggregate
/// function arguments in the caller parameter area.
-unsigned PPCTargetLowering::getByValTypeAlignment(Type *Ty) const {
+unsigned PPCTargetLowering::getByValTypeAlignment(Type *Ty,
+ const DataLayout &DL) const {
// Darwin passes everything on 4 byte boundary.
if (Subtarget.isDarwin())
return 4;
@@ -1055,7 +1056,8 @@ const char *PPCTargetLowering::getTargetNodeName(unsigned Opcode) const {
return nullptr;
}
-EVT PPCTargetLowering::getSetCCResultType(LLVMContext &C, EVT VT) const {
+EVT PPCTargetLowering::getSetCCResultType(const DataLayout &DL, LLVMContext &C,
+ EVT VT) const {
if (!VT.isVector())
return Subtarget.useCRBits() ? MVT::i1 : MVT::i32;
@@ -1101,7 +1103,7 @@ static bool isConstantOrUndef(int Op, int Val) {
/// For the latter, the input operands are swapped (see PPCInstrAltivec.td).
bool PPC::isVPKUHUMShuffleMask(ShuffleVectorSDNode *N, unsigned ShuffleKind,
SelectionDAG &DAG) {
- bool IsLE = DAG.getTarget().getDataLayout()->isLittleEndian();
+ bool IsLE = DAG.getDataLayout().isLittleEndian();
if (ShuffleKind == 0) {
if (IsLE)
return false;
@@ -1132,7 +1134,7 @@ bool PPC::isVPKUHUMShuffleMask(ShuffleVectorSDNode *N, unsigned ShuffleKind,
/// For the latter, the input operands are swapped (see PPCInstrAltivec.td).
bool PPC::isVPKUWUMShuffleMask(ShuffleVectorSDNode *N, unsigned ShuffleKind,
SelectionDAG &DAG) {
- bool IsLE = DAG.getTarget().getDataLayout()->isLittleEndian();
+ bool IsLE = DAG.getDataLayout().isLittleEndian();
if (ShuffleKind == 0) {
if (IsLE)
return false;
@@ -1174,7 +1176,7 @@ bool PPC::isVPKUDUMShuffleMask(ShuffleVectorSDNode *N, unsigned ShuffleKind,
if (!Subtarget.hasP8Vector())
return false;
- bool IsLE = DAG.getTarget().getDataLayout()->isLittleEndian();
+ bool IsLE = DAG.getDataLayout().isLittleEndian();
if (ShuffleKind == 0) {
if (IsLE)
return false;
@@ -1237,7 +1239,7 @@ static bool isVMerge(ShuffleVectorSDNode *N, unsigned UnitSize,
/// the input operands are swapped (see PPCInstrAltivec.td).
bool PPC::isVMRGLShuffleMask(ShuffleVectorSDNode *N, unsigned UnitSize,
unsigned ShuffleKind, SelectionDAG &DAG) {
- if (DAG.getTarget().getDataLayout()->isLittleEndian()) {
+ if (DAG.getDataLayout().isLittleEndian()) {
if (ShuffleKind == 1) // unary
return isVMerge(N, UnitSize, 0, 0);
else if (ShuffleKind == 2) // swapped
@@ -1262,7 +1264,7 @@ bool PPC::isVMRGLShuffleMask(ShuffleVectorSDNode *N, unsigned UnitSize,
/// the input operands are swapped (see PPCInstrAltivec.td).
bool PPC::isVMRGHShuffleMask(ShuffleVectorSDNode *N, unsigned UnitSize,
unsigned ShuffleKind, SelectionDAG &DAG) {
- if (DAG.getTarget().getDataLayout()->isLittleEndian()) {
+ if (DAG.getDataLayout().isLittleEndian()) {
if (ShuffleKind == 1) // unary
return isVMerge(N, UnitSize, 8, 8);
else if (ShuffleKind == 2) // swapped
@@ -1352,7 +1354,7 @@ static bool isVMerge(ShuffleVectorSDNode *N, unsigned IndexOffset,
*/
bool PPC::isVMRGEOShuffleMask(ShuffleVectorSDNode *N, bool CheckEven,
unsigned ShuffleKind, SelectionDAG &DAG) {
- if (DAG.getTarget().getDataLayout()->isLittleEndian()) {
+ if (DAG.getDataLayout().isLittleEndian()) {
unsigned indexOffset = CheckEven ? 4 : 0;
if (ShuffleKind == 1) // Unary
return isVMerge(N, indexOffset, 0);
@@ -1399,7 +1401,7 @@ int PPC::isVSLDOIShuffleMask(SDNode *N, unsigned ShuffleKind,
if (ShiftAmt < i) return -1;
ShiftAmt -= i;
- bool isLE = DAG.getTarget().getDataLayout()->isLittleEndian();
+ bool isLE = DAG.getDataLayout().isLittleEndian();
if ((ShuffleKind == 0 && !isLE) || (ShuffleKind == 2 && isLE)) {
// Check the rest of the elements to see if they are consecutive.
@@ -1456,7 +1458,7 @@ unsigned PPC::getVSPLTImmediate(SDNode *N, unsigned EltSize,
SelectionDAG &DAG) {
ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(N);
assert(isSplatShuffleMask(SVOp, EltSize));
- if (DAG.getTarget().getDataLayout()->isLittleEndian())
+ if (DAG.getDataLayout().isLittleEndian())
return (16 / EltSize) - 1 - (SVOp->getMaskElt(0) / EltSize);
else
return SVOp->getMaskElt(0) / EltSize;
@@ -1796,7 +1798,7 @@ bool PPCTargetLowering::SelectAddressRegImm(SDValue N, SDValue &Disp,
}
}
- Disp = DAG.getTargetConstant(0, dl, getPointerTy());
+ Disp = DAG.getTargetConstant(0, dl, getPointerTy(DAG.getDataLayout()));
if (FrameIndexSDNode *FI = dyn_cast<FrameIndexSDNode>(N)) {
Base = DAG.getTargetFrameIndex(FI->getIndex(), N.getValueType());
fixupFuncForFI(DAG, FI->getIndex(), N.getValueType());
@@ -2084,7 +2086,7 @@ SDValue PPCTargetLowering::LowerGlobalTLSAddress(SDValue Op,
GlobalAddressSDNode *GA = cast<GlobalAddressSDNode>(Op);
SDLoc dl(GA);
const GlobalValue *GV = GA->getGlobal();
- EVT PtrVT = getPointerTy();
+ EVT PtrVT = getPointerTy(DAG.getDataLayout());
bool is64bit = Subtarget.isPPC64();
const Module *M = DAG.getMachineFunction().getFunction()->getParent();
PICLevel::Level picLevel = M->getPICLevel();
@@ -2270,7 +2272,7 @@ SDValue PPCTargetLowering::LowerVAARG(SDValue Op, SelectionDAG &DAG,
const PPCSubtarget &Subtarget) const {
SDNode *Node = Op.getNode();
EVT VT = Node->getValueType(0);
- EVT PtrVT = DAG.getTargetLoweringInfo().getPointerTy();
+ EVT PtrVT = DAG.getTargetLoweringInfo().getPointerTy(DAG.getDataLayout());
SDValue InChain = Node->getOperand(0);
SDValue VAListPtr = Node->getOperand(1);
const Value *SV = cast<SrcValueSDNode>(Node->getOperand(2))->getValue();
@@ -2399,11 +2401,9 @@ SDValue PPCTargetLowering::LowerINIT_TRAMPOLINE(SDValue Op,
SDValue Nest = Op.getOperand(3); // 'nest' parameter value
SDLoc dl(Op);
- EVT PtrVT = DAG.getTargetLoweringInfo().getPointerTy();
+ EVT PtrVT = DAG.getTargetLoweringInfo().getPointerTy(DAG.getDataLayout());
bool isPPC64 = (PtrVT == MVT::i64);
- Type *IntPtrTy =
- DAG.getTargetLoweringInfo().getDataLayout()->getIntPtrType(
- *DAG.getContext());
+ Type *IntPtrTy = DAG.getDataLayout().getIntPtrType(*DAG.getContext());
TargetLowering::ArgListTy Args;
TargetLowering::ArgListEntry Entry;
@@ -2440,7 +2440,7 @@ SDValue PPCTargetLowering::LowerVASTART(SDValue Op, SelectionDAG &DAG,
if (Subtarget.isDarwinABI() || Subtarget.isPPC64()) {
// vastart just stores the address of the VarArgsFrameIndex slot into the
// memory location argument.
- EVT PtrVT = DAG.getTargetLoweringInfo().getPointerTy();
+ EVT PtrVT = DAG.getTargetLoweringInfo().getPointerTy(MF.getDataLayout());
SDValue FR = DAG.getFrameIndex(FuncInfo->getVarArgsFrameIndex(), PtrVT);
const Value *SV = cast<SrcValueSDNode>(Op.getOperand(2))->getValue();
return DAG.getStore(Op.getOperand(0), dl, FR, Op.getOperand(1),
@@ -2476,8 +2476,7 @@ SDValue PPCTargetLowering::LowerVASTART(SDValue Op, SelectionDAG &DAG,
SDValue ArgGPR = DAG.getConstant(FuncInfo->getVarArgsNumGPR(), dl, MVT::i32);
SDValue ArgFPR = DAG.getConstant(FuncInfo->getVarArgsNumFPR(), dl, MVT::i32);
-
- EVT PtrVT = DAG.getTargetLoweringInfo().getPointerTy();
+ EVT PtrVT = DAG.getTargetLoweringInfo().getPointerTy(MF.getDataLayout());
SDValue StackOffsetFI = DAG.getFrameIndex(FuncInfo->getVarArgsStackOffset(),
PtrVT);
@@ -2797,7 +2796,7 @@ PPCTargetLowering::LowerFormalArguments_32SVR4(
MachineFrameInfo *MFI = MF.getFrameInfo();
PPCFunctionInfo *FuncInfo = MF.getInfo<PPCFunctionInfo>();
- EVT PtrVT = DAG.getTargetLoweringInfo().getPointerTy();
+ EVT PtrVT = DAG.getTargetLoweringInfo().getPointerTy(MF.getDataLayout());
// Potential tail calls could cause overwriting of argument stack slots.
bool isImmutable = !(getTargetMachine().Options.GuaranteedTailCallOpt &&
(CallConv == CallingConv::Fast));
@@ -3023,7 +3022,7 @@ PPCTargetLowering::LowerFormalArguments_64SVR4(
assert(!(CallConv == CallingConv::Fast && isVarArg) &&
"fastcc not supported on varargs functions");
- EVT PtrVT = DAG.getTargetLoweringInfo().getPointerTy();
+ EVT PtrVT = DAG.getTargetLoweringInfo().getPointerTy(MF.getDataLayout());
// Potential tail calls could cause overwriting of argument stack slots.
bool isImmutable = !(getTargetMachine().Options.GuaranteedTailCallOpt &&
(CallConv == CallingConv::Fast));
@@ -3059,12 +3058,16 @@ PPCTargetLowering::LowerFormalArguments_64SVR4(
unsigned NumBytes = LinkageSize;
unsigned AvailableFPRs = Num_FPR_Regs;
unsigned AvailableVRs = Num_VR_Regs;
- for (unsigned i = 0, e = Ins.size(); i != e; ++i)
+ for (unsigned i = 0, e = Ins.size(); i != e; ++i) {
+ if (Ins[i].Flags.isNest())
+ continue;
+
if (CalculateStackSlotUsed(Ins[i].VT, Ins[i].ArgVT, Ins[i].Flags,
PtrByteSize, LinkageSize, ParamAreaSize,
NumBytes, AvailableFPRs, AvailableVRs,
Subtarget.hasQPX()))
HasParameterArea = true;
+ }
// Add DAG nodes to load the arguments or copy them out of registers. On
// entry to a function on PPC, the arguments start after the linkage area,
@@ -3216,6 +3219,17 @@ PPCTargetLowering::LowerFormalArguments_64SVR4(
case MVT::i1:
case MVT::i32:
case MVT::i64:
+ if (Flags.isNest()) {
+ // The 'nest' parameter, if any, is passed in R11.
+ unsigned VReg = MF.addLiveIn(PPC::X11, &PPC::G8RCRegClass);
+ ArgVal = DAG.getCopyFromReg(Chain, dl, VReg, MVT::i64);
+
+ if (ObjectVT == MVT::i32 || ObjectVT == MVT::i1)
+ ArgVal = extendArgForPPC64(Flags, ObjectVT, DAG, ArgVal, dl);
+
+ break;
+ }
+
// These can be scalar arguments or elements of an integer array type
// passed directly. Clang may use those instead of "byval" aggregate
// types to avoid forcing arguments to memory unnecessarily.
@@ -3425,7 +3439,7 @@ PPCTargetLowering::LowerFormalArguments_Darwin(
MachineFrameInfo *MFI = MF.getFrameInfo();
PPCFunctionInfo *FuncInfo = MF.getInfo<PPCFunctionInfo>();
- EVT PtrVT = DAG.getTargetLoweringInfo().getPointerTy();
+ EVT PtrVT = DAG.getTargetLoweringInfo().getPointerTy(MF.getDataLayout());
bool isPPC64 = PtrVT == MVT::i64;
// Potential tail calls could cause overwriting of argument stack slots.
bool isImmutable = !(getTargetMachine().Options.GuaranteedTailCallOpt &&
@@ -3845,7 +3859,8 @@ static SDNode *isBLACompatibleAddress(SDValue Op, SelectionDAG &DAG) {
return nullptr; // Top 6 bits have to be sext of immediate.
return DAG.getConstant((int)C->getZExtValue() >> 2, SDLoc(Op),
- DAG.getTargetLoweringInfo().getPointerTy()).getNode();
+ DAG.getTargetLoweringInfo().getPointerTy(
+ DAG.getDataLayout())).getNode();
}
namespace {
@@ -3991,7 +4006,7 @@ LowerMemOpCallTo(SelectionDAG &DAG, MachineFunction &MF, SDValue Chain,
bool isVector, SmallVectorImpl<SDValue> &MemOpChains,
SmallVectorImpl<TailCallArgumentInfo> &TailCallArguments,
SDLoc dl) {
- EVT PtrVT = DAG.getTargetLoweringInfo().getPointerTy();
+ EVT PtrVT = DAG.getTargetLoweringInfo().getPointerTy(DAG.getDataLayout());
if (!isTailCall) {
if (isVector) {
SDValue StackPtr;
@@ -4053,7 +4068,7 @@ static bool isFunctionGlobalAddress(SDValue Callee) {
static
unsigned PrepareCall(SelectionDAG &DAG, SDValue &Callee, SDValue &InFlag,
SDValue &Chain, SDValue CallSeqStart, SDLoc dl, int SPDiff,
- bool isTailCall, bool IsPatchPoint,
+ bool isTailCall, bool IsPatchPoint, bool hasNest,
SmallVectorImpl<std::pair<unsigned, SDValue> > &RegsToPass,
SmallVectorImpl<SDValue> &Ops, std::vector<EVT> &NodeTys,
ImmutableCallSite *CS, const PPCSubtarget &Subtarget) {
@@ -4062,7 +4077,7 @@ unsigned PrepareCall(SelectionDAG &DAG, SDValue &Callee, SDValue &InFlag,
bool isSVR4ABI = Subtarget.isSVR4ABI();
bool isELFv2ABI = Subtarget.isELFv2ABI();
- EVT PtrVT = DAG.getTargetLoweringInfo().getPointerTy();
+ EVT PtrVT = DAG.getTargetLoweringInfo().getPointerTy(DAG.getDataLayout());
NodeTys.push_back(MVT::Other); // Returns a chain
NodeTys.push_back(MVT::Glue); // Returns a flag for retval copy to use.
@@ -4084,8 +4099,7 @@ unsigned PrepareCall(SelectionDAG &DAG, SDValue &Callee, SDValue &InFlag,
if ((DAG.getTarget().getRelocationModel() != Reloc::Static &&
(Subtarget.getTargetTriple().isMacOSX() &&
Subtarget.getTargetTriple().isMacOSXVersionLT(10, 5)) &&
- (G->getGlobal()->isDeclaration() ||
- G->getGlobal()->isWeakForLinker())) ||
+ !G->getGlobal()->isStrongDefinitionForLinker()) ||
(Subtarget.isTargetELF() && !isPPC64 &&
!G->getGlobal()->hasLocalLinkage() &&
DAG.getTarget().getRelocationModel() == Reloc::PIC_)) {
@@ -4196,11 +4210,15 @@ unsigned PrepareCall(SelectionDAG &DAG, SDValue &Callee, SDValue &InFlag,
Chain = TOCVal.getValue(0);
InFlag = TOCVal.getValue(1);
- SDValue EnvVal = DAG.getCopyToReg(Chain, dl, PPC::X11, LoadEnvPtr,
- InFlag);
+ // If the function call has an explicit 'nest' parameter, it takes the
+ // place of the environment pointer.
+ if (!hasNest) {
+ SDValue EnvVal = DAG.getCopyToReg(Chain, dl, PPC::X11, LoadEnvPtr,
+ InFlag);
- Chain = EnvVal.getValue(0);
- InFlag = EnvVal.getValue(1);
+ Chain = EnvVal.getValue(0);
+ InFlag = EnvVal.getValue(1);
+ }
MTCTROps[0] = Chain;
MTCTROps[1] = LoadFuncPtr;
@@ -4218,7 +4236,7 @@ unsigned PrepareCall(SelectionDAG &DAG, SDValue &Callee, SDValue &InFlag,
CallOpc = PPCISD::BCTRL;
Callee.setNode(nullptr);
// Add use of X11 (holding environment pointer)
- if (isSVR4ABI && isPPC64 && !isELFv2ABI)
+ if (isSVR4ABI && isPPC64 && !isELFv2ABI && !hasNest)
Ops.push_back(DAG.getRegister(PPC::X11, PtrVT));
// Add CTR register as callee so a bctr can be emitted later.
if (isTailCall)
@@ -4254,8 +4272,7 @@ static
bool isLocalCall(const SDValue &Callee)
{
if (GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee))
- return !G->getGlobal()->isDeclaration() &&
- !G->getGlobal()->isWeakForLinker();
+ return G->getGlobal()->isStrongDefinitionForLinker();
return false;
}
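Several hunks in this commit (PPCAsmPrinter, PPCFastISel, PPCISelDAGToDAG, and isLocalCall above) replace the pair of checks isDeclaration() || isWeakForLinker() with the negation of a single GlobalValue helper. Its rough meaning, written as a free function and hedged because the in-tree definition also accounts for available_externally-style linkage:

    // Approximation of the new predicate, for reading the hunks above.
    static bool isStrongDefinitionForLinker(const GlobalValue &GV) {
      return !GV.isDeclaration() && !GV.isWeakForLinker();
    }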
@@ -4308,7 +4325,7 @@ PPCTargetLowering::LowerCallResult(SDValue Chain, SDValue InFlag,
SDValue
PPCTargetLowering::FinishCall(CallingConv::ID CallConv, SDLoc dl,
bool isTailCall, bool isVarArg, bool IsPatchPoint,
- SelectionDAG &DAG,
+ bool hasNest, SelectionDAG &DAG,
SmallVector<std::pair<unsigned, SDValue>, 8>
&RegsToPass,
SDValue InFlag, SDValue Chain,
@@ -4321,8 +4338,8 @@ PPCTargetLowering::FinishCall(CallingConv::ID CallConv, SDLoc dl,
std::vector<EVT> NodeTys;
SmallVector<SDValue, 8> Ops;
unsigned CallOpc = PrepareCall(DAG, Callee, InFlag, Chain, CallSeqStart, dl,
- SPDiff, isTailCall, IsPatchPoint, RegsToPass,
- Ops, NodeTys, CS, Subtarget);
+ SPDiff, isTailCall, IsPatchPoint, hasNest,
+ RegsToPass, Ops, NodeTys, CS, Subtarget);
// Add implicit use of CR bit 6 for 32-bit SVR4 vararg calls
if (isVarArg && Subtarget.isSVR4ABI() && !Subtarget.isPPC64())
@@ -4381,7 +4398,7 @@ PPCTargetLowering::FinishCall(CallingConv::ID CallConv, SDLoc dl,
// allocated and an unnecessary move instruction being generated.
CallOpc = PPCISD::BCTRL_LOAD_TOC;
- EVT PtrVT = DAG.getTargetLoweringInfo().getPointerTy();
+ EVT PtrVT = DAG.getTargetLoweringInfo().getPointerTy(DAG.getDataLayout());
SDValue StackPtr = DAG.getRegister(PPC::X1, PtrVT);
unsigned TOCSaveOffset = Subtarget.getFrameLowering()->getTOCSaveOffset();
SDValue TOCOff = DAG.getIntPtrConstant(TOCSaveOffset, dl);
@@ -4586,7 +4603,8 @@ PPCTargetLowering::LowerCall_32SVR4(SDValue Chain, SDValue Callee,
unsigned LocMemOffset = ByValVA.getLocMemOffset();
SDValue PtrOff = DAG.getIntPtrConstant(LocMemOffset, dl);
- PtrOff = DAG.getNode(ISD::ADD, dl, getPointerTy(), StackPtr, PtrOff);
+ PtrOff = DAG.getNode(ISD::ADD, dl, getPointerTy(MF.getDataLayout()),
+ StackPtr, PtrOff);
// Create a copy of the argument in the local area of the current
// stack frame.
@@ -4623,7 +4641,8 @@ PPCTargetLowering::LowerCall_32SVR4(SDValue Chain, SDValue Callee,
if (!isTailCall) {
SDValue PtrOff = DAG.getIntPtrConstant(LocMemOffset, dl);
- PtrOff = DAG.getNode(ISD::ADD, dl, getPointerTy(), StackPtr, PtrOff);
+ PtrOff = DAG.getNode(ISD::ADD, dl, getPointerTy(MF.getDataLayout()),
+ StackPtr, PtrOff);
MemOpChains.push_back(DAG.getStore(Chain, dl, Arg, PtrOff,
MachinePointerInfo(),
@@ -4664,7 +4683,8 @@ PPCTargetLowering::LowerCall_32SVR4(SDValue Chain, SDValue Callee,
PrepareTailCall(DAG, InFlag, Chain, dl, false, SPDiff, NumBytes, LROp, FPOp,
false, TailCallArguments);
- return FinishCall(CallConv, dl, isTailCall, isVarArg, IsPatchPoint, DAG,
+ return FinishCall(CallConv, dl, isTailCall, isVarArg, IsPatchPoint,
+ /* unused except on PPC64 ELFv1 */ false, DAG,
RegsToPass, InFlag, Chain, CallSeqStart, Callee, SPDiff,
NumBytes, Ins, InVals, CS);
}
@@ -4703,8 +4723,9 @@ PPCTargetLowering::LowerCall_64SVR4(SDValue Chain, SDValue Callee,
bool isELFv2ABI = Subtarget.isELFv2ABI();
bool isLittleEndian = Subtarget.isLittleEndian();
unsigned NumOps = Outs.size();
+ bool hasNest = false;
- EVT PtrVT = DAG.getTargetLoweringInfo().getPointerTy();
+ EVT PtrVT = DAG.getTargetLoweringInfo().getPointerTy(DAG.getDataLayout());
unsigned PtrByteSize = 8;
MachineFunction &MF = DAG.getMachineFunction();
@@ -4758,6 +4779,9 @@ PPCTargetLowering::LowerCall_64SVR4(SDValue Chain, SDValue Callee,
EVT ArgVT = Outs[i].VT;
EVT OrigVT = Outs[i].ArgVT;
+ if (Flags.isNest())
+ continue;
+
if (CallConv == CallingConv::Fast) {
if (Flags.isByVal())
NumGPRsUsed += (Flags.getByValSize()+7)/8;
@@ -5021,6 +5045,13 @@ PPCTargetLowering::LowerCall_64SVR4(SDValue Chain, SDValue Callee,
case MVT::i1:
case MVT::i32:
case MVT::i64:
+ if (Flags.isNest()) {
+ // The 'nest' parameter, if any, is passed in R11.
+ RegsToPass.push_back(std::make_pair(PPC::X11, Arg));
+ hasNest = true;
+ break;
+ }
+
// These can be scalar arguments or elements of an integer array type
// passed directly. Clang may use those instead of "byval" aggregate
// types to avoid forcing arguments to memory unnecessarily.
@@ -5302,9 +5333,9 @@ PPCTargetLowering::LowerCall_64SVR4(SDValue Chain, SDValue Callee,
PrepareTailCall(DAG, InFlag, Chain, dl, true, SPDiff, NumBytes, LROp,
FPOp, true, TailCallArguments);
- return FinishCall(CallConv, dl, isTailCall, isVarArg, IsPatchPoint, DAG,
- RegsToPass, InFlag, Chain, CallSeqStart, Callee, SPDiff,
- NumBytes, Ins, InVals, CS);
+ return FinishCall(CallConv, dl, isTailCall, isVarArg, IsPatchPoint,
+ hasNest, DAG, RegsToPass, InFlag, Chain, CallSeqStart,
+ Callee, SPDiff, NumBytes, Ins, InVals, CS);
}
SDValue
@@ -5320,7 +5351,7 @@ PPCTargetLowering::LowerCall_Darwin(SDValue Chain, SDValue Callee,
unsigned NumOps = Outs.size();
- EVT PtrVT = DAG.getTargetLoweringInfo().getPointerTy();
+ EVT PtrVT = DAG.getTargetLoweringInfo().getPointerTy(DAG.getDataLayout());
bool isPPC64 = PtrVT == MVT::i64;
unsigned PtrByteSize = isPPC64 ? 8 : 4;
@@ -5693,7 +5724,8 @@ PPCTargetLowering::LowerCall_Darwin(SDValue Chain, SDValue Callee,
PrepareTailCall(DAG, InFlag, Chain, dl, isPPC64, SPDiff, NumBytes, LROp,
FPOp, true, TailCallArguments);
- return FinishCall(CallConv, dl, isTailCall, isVarArg, IsPatchPoint, DAG,
+ return FinishCall(CallConv, dl, isTailCall, isVarArg, IsPatchPoint,
+ /* unused except on PPC64 ELFv1 */ false, DAG,
RegsToPass, InFlag, Chain, CallSeqStart, Callee, SPDiff,
NumBytes, Ins, InVals, CS);
}
@@ -5764,7 +5796,7 @@ SDValue PPCTargetLowering::LowerSTACKRESTORE(SDValue Op, SelectionDAG &DAG,
SDLoc dl(Op);
// Get the correct type for pointers.
- EVT PtrVT = DAG.getTargetLoweringInfo().getPointerTy();
+ EVT PtrVT = DAG.getTargetLoweringInfo().getPointerTy(DAG.getDataLayout());
// Construct the stack pointer operand.
bool isPPC64 = Subtarget.isPPC64();
@@ -5794,7 +5826,7 @@ SDValue
PPCTargetLowering::getReturnAddrFrameIndex(SelectionDAG & DAG) const {
MachineFunction &MF = DAG.getMachineFunction();
bool isPPC64 = Subtarget.isPPC64();
- EVT PtrVT = DAG.getTargetLoweringInfo().getPointerTy();
+ EVT PtrVT = DAG.getTargetLoweringInfo().getPointerTy(MF.getDataLayout());
// Get current frame pointer save index. The users of this index will be
// primarily DYNALLOC instructions.
@@ -5817,7 +5849,7 @@ SDValue
PPCTargetLowering::getFramePointerFrameIndex(SelectionDAG & DAG) const {
MachineFunction &MF = DAG.getMachineFunction();
bool isPPC64 = Subtarget.isPPC64();
- EVT PtrVT = DAG.getTargetLoweringInfo().getPointerTy();
+ EVT PtrVT = DAG.getTargetLoweringInfo().getPointerTy(MF.getDataLayout());
// Get current frame pointer save index. The users of this index will be
// primarily DYNALLOC instructions.
@@ -5845,7 +5877,7 @@ SDValue PPCTargetLowering::LowerDYNAMIC_STACKALLOC(SDValue Op,
SDLoc dl(Op);
// Get the correct type for pointers.
- EVT PtrVT = DAG.getTargetLoweringInfo().getPointerTy();
+ EVT PtrVT = DAG.getTargetLoweringInfo().getPointerTy(DAG.getDataLayout());
// Negate the size.
SDValue NegSize = DAG.getNode(ISD::SUB, dl, PtrVT,
DAG.getConstant(0, dl, PtrVT), Size);
@@ -5888,8 +5920,9 @@ SDValue PPCTargetLowering::LowerLOAD(SDValue Op, SelectionDAG &DAG) const {
SDValue BasePtr = LD->getBasePtr();
MachineMemOperand *MMO = LD->getMemOperand();
- SDValue NewLD = DAG.getExtLoad(ISD::EXTLOAD, dl, getPointerTy(), Chain,
- BasePtr, MVT::i8, MMO);
+ SDValue NewLD =
+ DAG.getExtLoad(ISD::EXTLOAD, dl, getPointerTy(DAG.getDataLayout()), Chain,
+ BasePtr, MVT::i8, MMO);
SDValue Result = DAG.getNode(ISD::TRUNCATE, dl, MVT::i1, NewLD);
SDValue Ops[] = { Result, SDValue(NewLD.getNode(), 1) };
@@ -5913,7 +5946,8 @@ SDValue PPCTargetLowering::LowerSTORE(SDValue Op, SelectionDAG &DAG) const {
SDValue Value = ST->getValue();
MachineMemOperand *MMO = ST->getMemOperand();
- Value = DAG.getNode(ISD::ZERO_EXTEND, dl, getPointerTy(), Value);
+ Value = DAG.getNode(ISD::ZERO_EXTEND, dl, getPointerTy(DAG.getDataLayout()),
+ Value);
return DAG.getTruncStore(Chain, dl, Value, BasePtr, MVT::i8, MMO);
}
@@ -6374,7 +6408,7 @@ SDValue PPCTargetLowering::LowerINT_TO_FP(SDValue Op,
SINT.getOpcode() == ISD::ZERO_EXTEND)) &&
SINT.getOperand(0).getValueType() == MVT::i32) {
MachineFrameInfo *FrameInfo = MF.getFrameInfo();
- EVT PtrVT = DAG.getTargetLoweringInfo().getPointerTy();
+ EVT PtrVT = DAG.getTargetLoweringInfo().getPointerTy(DAG.getDataLayout());
int FrameIdx = FrameInfo->CreateStackObject(4, 4, false);
SDValue FIdx = DAG.getFrameIndex(FrameIdx, PtrVT);
@@ -6419,7 +6453,7 @@ SDValue PPCTargetLowering::LowerINT_TO_FP(SDValue Op,
// then lfd it and fcfid it.
MachineFunction &MF = DAG.getMachineFunction();
MachineFrameInfo *FrameInfo = MF.getFrameInfo();
- EVT PtrVT = DAG.getTargetLoweringInfo().getPointerTy();
+ EVT PtrVT = DAG.getTargetLoweringInfo().getPointerTy(MF.getDataLayout());
SDValue Ld;
if (Subtarget.hasLFIWAX() || Subtarget.hasFPCVT()) {
@@ -6506,7 +6540,7 @@ SDValue PPCTargetLowering::LowerFLT_ROUNDS_(SDValue Op,
MachineFunction &MF = DAG.getMachineFunction();
EVT VT = Op.getValueType();
- EVT PtrVT = DAG.getTargetLoweringInfo().getPointerTy();
+ EVT PtrVT = DAG.getTargetLoweringInfo().getPointerTy(MF.getDataLayout());
// Save FP Control Word to register
EVT NodeTys[] = {
@@ -6727,7 +6761,7 @@ SDValue PPCTargetLowering::LowerBUILD_VECTOR(SDValue Op,
MachineFrameInfo *FrameInfo = DAG.getMachineFunction().getFrameInfo();
int FrameIdx = FrameInfo->CreateStackObject(16, 16, false);
MachinePointerInfo PtrInfo = MachinePointerInfo::getFixedStack(FrameIdx);
- EVT PtrVT = getPointerTy();
+ EVT PtrVT = getPointerTy(DAG.getDataLayout());
SDValue FIdx = DAG.getFrameIndex(FrameIdx, PtrVT);
assert(BVN->getNumOperands() == 4 &&
@@ -6760,9 +6794,9 @@ SDValue PPCTargetLowering::LowerBUILD_VECTOR(SDValue Op,
}
Constant *CP = ConstantVector::get(CV);
- SDValue CPIdx = DAG.getConstantPool(CP, getPointerTy(),
- 16 /* alignment */);
-
+ SDValue CPIdx = DAG.getConstantPool(CP, getPointerTy(DAG.getDataLayout()),
+ 16 /* alignment */);
+
SmallVector<SDValue, 2> Ops;
Ops.push_back(DAG.getEntryNode());
Ops.push_back(CPIdx);
@@ -7453,7 +7487,7 @@ SDValue PPCTargetLowering::LowerSCALAR_TO_VECTOR(SDValue Op,
// Create a stack slot that is 16-byte aligned.
MachineFrameInfo *FrameInfo = DAG.getMachineFunction().getFrameInfo();
int FrameIdx = FrameInfo->CreateStackObject(16, 16, false);
- EVT PtrVT = getPointerTy();
+ EVT PtrVT = getPointerTy(DAG.getDataLayout());
SDValue FIdx = DAG.getFrameIndex(FrameIdx, PtrVT);
// Store the input value into Value#0 of the stack slot.
@@ -7499,7 +7533,7 @@ SDValue PPCTargetLowering::LowerEXTRACT_VECTOR_ELT(SDValue Op,
MachineFrameInfo *FrameInfo = DAG.getMachineFunction().getFrameInfo();
int FrameIdx = FrameInfo->CreateStackObject(16, 16, false);
MachinePointerInfo PtrInfo = MachinePointerInfo::getFixedStack(FrameIdx);
- EVT PtrVT = getPointerTy();
+ EVT PtrVT = getPointerTy(DAG.getDataLayout());
SDValue FIdx = DAG.getFrameIndex(FrameIdx, PtrVT);
SDValue StoreChain = DAG.getEntryNode();
@@ -7651,9 +7685,9 @@ SDValue PPCTargetLowering::LowerVectorStore(SDValue Op,
SmallVector<SDValue, 8> Stores;
for (unsigned Idx = 0; Idx < 4; ++Idx) {
- SDValue Ex =
- DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, ScalarVT, Value,
- DAG.getConstant(Idx, dl, getVectorIdxTy()));
+ SDValue Ex = DAG.getNode(
+ ISD::EXTRACT_VECTOR_ELT, dl, ScalarVT, Value,
+ DAG.getConstant(Idx, dl, getVectorIdxTy(DAG.getDataLayout())));
SDValue Store;
if (ScalarVT != ScalarMemVT)
Store =
@@ -7715,7 +7749,7 @@ SDValue PPCTargetLowering::LowerVectorStore(SDValue Op,
MachineFrameInfo *FrameInfo = DAG.getMachineFunction().getFrameInfo();
int FrameIdx = FrameInfo->CreateStackObject(16, 16, false);
MachinePointerInfo PtrInfo = MachinePointerInfo::getFixedStack(FrameIdx);
- EVT PtrVT = getPointerTy();
+ EVT PtrVT = getPointerTy(DAG.getDataLayout());
SDValue FIdx = DAG.getFrameIndex(FrameIdx, PtrVT);
SmallVector<SDValue, 2> Ops;
@@ -7920,7 +7954,8 @@ void PPCTargetLowering::ReplaceNodeResults(SDNode *N,
assert(N->getValueType(0) == MVT::i1 &&
"Unexpected result type for CTR decrement intrinsic");
- EVT SVT = getSetCCResultType(*DAG.getContext(), N->getValueType(0));
+ EVT SVT = getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(),
+ N->getValueType(0));
SDVTList VTs = DAG.getVTList(SVT, MVT::Other);
SDValue NewInt = DAG.getNode(N->getOpcode(), dl, VTs, N->getOperand(0),
N->getOperand(1));
@@ -8248,7 +8283,7 @@ PPCTargetLowering::emitEHSjLjSetJmp(MachineInstr *MI,
unsigned mainDstReg = MRI.createVirtualRegister(RC);
unsigned restoreDstReg = MRI.createVirtualRegister(RC);
- MVT PVT = getPointerTy();
+ MVT PVT = getPointerTy(MF->getDataLayout());
assert((PVT == MVT::i64 || PVT == MVT::i32) &&
"Invalid Pointer Size!");
// For v = setjmp(buf), we generate
@@ -8386,7 +8421,7 @@ PPCTargetLowering::emitEHSjLjLongJmp(MachineInstr *MI,
MachineInstr::mmo_iterator MMOBegin = MI->memoperands_begin();
MachineInstr::mmo_iterator MMOEnd = MI->memoperands_end();
- MVT PVT = getPointerTy();
+ MVT PVT = getPointerTy(MF->getDataLayout());
assert((PVT == MVT::i64 || PVT == MVT::i32) &&
"Invalid Pointer Size!");
@@ -9032,6 +9067,19 @@ PPCTargetLowering::EmitInstrWithCustomInserter(MachineInstr *MI,
// Target Optimization Hooks
//===----------------------------------------------------------------------===//
+static std::string getRecipOp(const char *Base, EVT VT) {
+ std::string RecipOp(Base);
+ if (VT.getScalarType() == MVT::f64)
+ RecipOp += "d";
+ else
+ RecipOp += "f";
+
+ if (VT.isVector())
+ RecipOp = "vec-" + RecipOp;
+
+ return RecipOp;
+}
+
SDValue PPCTargetLowering::getRsqrtEstimate(SDValue Operand,
DAGCombinerInfo &DCI,
unsigned &RefinementSteps,
@@ -9043,13 +9091,12 @@ SDValue PPCTargetLowering::getRsqrtEstimate(SDValue Operand,
(VT == MVT::v2f64 && Subtarget.hasVSX()) ||
(VT == MVT::v4f32 && Subtarget.hasQPX()) ||
(VT == MVT::v4f64 && Subtarget.hasQPX())) {
- // Convergence is quadratic, so we essentially double the number of digits
- // correct after every iteration. For both FRE and FRSQRTE, the minimum
- // architected relative accuracy is 2^-5. When hasRecipPrec(), this is
- // 2^-14. IEEE float has 23 digits and double has 52 digits.
- RefinementSteps = Subtarget.hasRecipPrec() ? 1 : 3;
- if (VT.getScalarType() == MVT::f64)
- ++RefinementSteps;
+ TargetRecip Recips = DCI.DAG.getTarget().Options.Reciprocals;
+ std::string RecipOp = getRecipOp("sqrt", VT);
+ if (!Recips.isEnabled(RecipOp))
+ return SDValue();
+
+ RefinementSteps = Recips.getRefinementSteps(RecipOp);
UseOneConstNR = true;
return DCI.DAG.getNode(PPCISD::FRSQRTE, SDLoc(Operand), VT, Operand);
}
@@ -9066,13 +9113,12 @@ SDValue PPCTargetLowering::getRecipEstimate(SDValue Operand,
(VT == MVT::v2f64 && Subtarget.hasVSX()) ||
(VT == MVT::v4f32 && Subtarget.hasQPX()) ||
(VT == MVT::v4f64 && Subtarget.hasQPX())) {
- // Convergence is quadratic, so we essentially double the number of digits
- // correct after every iteration. For both FRE and FRSQRTE, the minimum
- // architected relative accuracy is 2^-5. When hasRecipPrec(), this is
- // 2^-14. IEEE float has 23 digits and double has 52 digits.
- RefinementSteps = Subtarget.hasRecipPrec() ? 1 : 3;
- if (VT.getScalarType() == MVT::f64)
- ++RefinementSteps;
+ TargetRecip Recips = DCI.DAG.getTarget().Options.Reciprocals;
+ std::string RecipOp = getRecipOp("div", VT);
+ if (!Recips.isEnabled(RecipOp))
+ return SDValue();
+
+ RefinementSteps = Recips.getRefinementSteps(RecipOp);
return DCI.DAG.getNode(PPCISD::FRE, SDLoc(Operand), VT, Operand);
}
return SDValue();
@@ -9854,7 +9900,7 @@ SDValue PPCTargetLowering::DAGCombineExtBoolTrunc(SDNode *N,
assert(N->getOpcode() == ISD::SIGN_EXTEND &&
"Invalid extension type");
- EVT ShiftAmountTy = getShiftAmountTy(N->getValueType(0));
+ EVT ShiftAmountTy = getShiftAmountTy(N->getValueType(0), DAG.getDataLayout());
SDValue ShiftCst =
DAG.getConstant(N->getValueSizeInBits(0) - PromBits, dl, ShiftAmountTy);
return DAG.getNode(ISD::SRA, dl, N->getValueType(0),
@@ -10145,9 +10191,9 @@ SDValue PPCTargetLowering::PerformDAGCombine(SDNode *N,
EVT MemVT = LD->getMemoryVT();
Type *Ty = MemVT.getTypeForEVT(*DAG.getContext());
- unsigned ABIAlignment = getDataLayout()->getABITypeAlignment(Ty);
+ unsigned ABIAlignment = DAG.getDataLayout().getABITypeAlignment(Ty);
Type *STy = MemVT.getScalarType().getTypeForEVT(*DAG.getContext());
- unsigned ScalarABIAlignment = getDataLayout()->getABITypeAlignment(STy);
+ unsigned ScalarABIAlignment = DAG.getDataLayout().getABITypeAlignment(STy);
if (LD->isUnindexed() && VT.isVector() &&
((Subtarget.hasAltivec() && ISD::isNON_EXTLoad(N) &&
// P8 and later hardware should just use LOAD.
@@ -10219,7 +10265,8 @@ SDValue PPCTargetLowering::PerformDAGCombine(SDNode *N,
2*MemVT.getStoreSize()-1);
// Create the new base load.
- SDValue LDXIntID = DAG.getTargetConstant(IntrLD, dl, getPointerTy());
+ SDValue LDXIntID =
+ DAG.getTargetConstant(IntrLD, dl, getPointerTy(MF.getDataLayout()));
SDValue BaseLoadOps[] = { Chain, LDXIntID, Ptr };
SDValue BaseLoad =
DAG.getMemIntrinsicNode(ISD::INTRINSIC_W_CHAIN, dl,
@@ -10243,7 +10290,8 @@ SDValue PPCTargetLowering::PerformDAGCombine(SDNode *N,
if (!findConsecutiveLoad(LD, DAG))
--IncValue;
- SDValue Increment = DAG.getConstant(IncValue, dl, getPointerTy());
+ SDValue Increment =
+ DAG.getConstant(IncValue, dl, getPointerTy(MF.getDataLayout()));
Ptr = DAG.getNode(ISD::ADD, dl, Ptr.getValueType(), Ptr, Increment);
MachineMemOperand *ExtraMMO =
@@ -10691,7 +10739,7 @@ unsigned PPCTargetLowering::getPrefLoopAlignment(MachineLoop *ML) const {
/// getConstraintType - Given a constraint, return the type of
/// constraint it is for this target.
PPCTargetLowering::ConstraintType
-PPCTargetLowering::getConstraintType(const std::string &Constraint) const {
+PPCTargetLowering::getConstraintType(StringRef Constraint) const {
if (Constraint.size() == 1) {
switch (Constraint[0]) {
default: break;
@@ -10776,7 +10824,7 @@ PPCTargetLowering::getSingleConstraintMatchWeight(
std::pair<unsigned, const TargetRegisterClass *>
PPCTargetLowering::getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI,
- const std::string &Constraint,
+ StringRef Constraint,
MVT VT) const {
if (Constraint.size() == 1) {
// GCC RS6000 Constraint Letters
@@ -10923,8 +10971,8 @@ void PPCTargetLowering::LowerAsmOperandForConstraint(SDValue Op,
// isLegalAddressingMode - Return true if the addressing mode represented
// by AM is legal for this target, for a load/store of the specified type.
-bool PPCTargetLowering::isLegalAddressingMode(const AddrMode &AM,
- Type *Ty,
+bool PPCTargetLowering::isLegalAddressingMode(const DataLayout &DL,
+ const AddrMode &AM, Type *Ty,
unsigned AS) const {
// PPC does not allow r+i addressing modes for vectors!
if (Ty->isVectorTy() && AM.BaseOffs != 0)
@@ -10977,22 +11025,22 @@ SDValue PPCTargetLowering::LowerRETURNADDR(SDValue Op,
PPCFunctionInfo *FuncInfo = MF.getInfo<PPCFunctionInfo>();
FuncInfo->setLRStoreRequired();
bool isPPC64 = Subtarget.isPPC64();
+ auto PtrVT = getPointerTy(MF.getDataLayout());
if (Depth > 0) {
SDValue FrameAddr = LowerFRAMEADDR(Op, DAG);
SDValue Offset =
DAG.getConstant(Subtarget.getFrameLowering()->getReturnSaveOffset(), dl,
isPPC64 ? MVT::i64 : MVT::i32);
- return DAG.getLoad(getPointerTy(), dl, DAG.getEntryNode(),
- DAG.getNode(ISD::ADD, dl, getPointerTy(),
- FrameAddr, Offset),
+ return DAG.getLoad(PtrVT, dl, DAG.getEntryNode(),
+ DAG.getNode(ISD::ADD, dl, PtrVT, FrameAddr, Offset),
MachinePointerInfo(), false, false, false, 0);
}
// Just load the return address off the stack.
SDValue RetAddrFI = getReturnAddrFrameIndex(DAG);
- return DAG.getLoad(getPointerTy(), dl, DAG.getEntryNode(),
- RetAddrFI, MachinePointerInfo(), false, false, false, 0);
+ return DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), RetAddrFI,
+ MachinePointerInfo(), false, false, false, 0);
}
SDValue PPCTargetLowering::LowerFRAMEADDR(SDValue Op,
@@ -11000,13 +11048,13 @@ SDValue PPCTargetLowering::LowerFRAMEADDR(SDValue Op,
SDLoc dl(Op);
unsigned Depth = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
- EVT PtrVT = DAG.getTargetLoweringInfo().getPointerTy();
- bool isPPC64 = PtrVT == MVT::i64;
-
MachineFunction &MF = DAG.getMachineFunction();
MachineFrameInfo *MFI = MF.getFrameInfo();
MFI->setFrameAddressIsTaken(true);
+ EVT PtrVT = DAG.getTargetLoweringInfo().getPointerTy(MF.getDataLayout());
+ bool isPPC64 = PtrVT == MVT::i64;
+
// Naked functions never have a frame pointer, and so we use r1. For all
// other functions, this decision must be delayed until during PEI.
unsigned FrameReg;
@@ -11026,8 +11074,8 @@ SDValue PPCTargetLowering::LowerFRAMEADDR(SDValue Op,
// FIXME? Maybe this could be a TableGen attribute on some registers and
// this table could be generated automatically from RegInfo.
-unsigned PPCTargetLowering::getRegisterByName(const char* RegName,
- EVT VT) const {
+unsigned PPCTargetLowering::getRegisterByName(const char* RegName, EVT VT,
+ SelectionDAG &DAG) const {
bool isPPC64 = Subtarget.isPPC64();
bool isDarwinABI = Subtarget.isDarwinABI();
diff --git a/lib/Target/PowerPC/PPCISelLowering.h b/lib/Target/PowerPC/PPCISelLowering.h
index 02242b512a4f..6e13533cfdb3 100644
--- a/lib/Target/PowerPC/PPCISelLowering.h
+++ b/lib/Target/PowerPC/PPCISelLowering.h
@@ -423,7 +423,9 @@ namespace llvm {
/// DAG node.
const char *getTargetNodeName(unsigned Opcode) const override;
- MVT getScalarShiftAmountTy(EVT LHSTy) const override { return MVT::i32; }
+ MVT getScalarShiftAmountTy(const DataLayout &, EVT) const override {
+ return MVT::i32;
+ }
bool isCheapToSpeculateCttz() const override {
return true;
@@ -434,7 +436,8 @@ namespace llvm {
}
/// getSetCCResultType - Return the ISD::SETCC ValueType
- EVT getSetCCResultType(LLVMContext &Context, EVT VT) const override;
+ EVT getSetCCResultType(const DataLayout &DL, LLVMContext &Context,
+ EVT VT) const override;
/// Return true if target always benefits from combining into FMA for a
/// given value type. This must typically return false on targets where FMA
@@ -487,7 +490,8 @@ namespace llvm {
SDValue BuildSDIVPow2(SDNode *N, const APInt &Divisor, SelectionDAG &DAG,
std::vector<SDNode *> *Created) const override;
- unsigned getRegisterByName(const char* RegName, EVT VT) const override;
+ unsigned getRegisterByName(const char* RegName, EVT VT,
+ SelectionDAG &DAG) const override;
void computeKnownBitsForTargetNode(const SDValue Op,
APInt &KnownZero,
@@ -519,8 +523,7 @@ namespace llvm {
MachineBasicBlock *emitEHSjLjLongJmp(MachineInstr *MI,
MachineBasicBlock *MBB) const;
- ConstraintType
- getConstraintType(const std::string &Constraint) const override;
+ ConstraintType getConstraintType(StringRef Constraint) const override;
/// Examine constraint string and operand type and determine a weight value.
/// The operand object must already have been set up with the operand type.
@@ -529,13 +532,13 @@ namespace llvm {
std::pair<unsigned, const TargetRegisterClass *>
getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI,
- const std::string &Constraint,
- MVT VT) const override;
+ StringRef Constraint, MVT VT) const override;
/// getByValTypeAlignment - Return the desired alignment for ByVal aggregate
/// function arguments in the caller parameter area. This is the actual
/// alignment, not its logarithm.
- unsigned getByValTypeAlignment(Type *Ty) const override;
+ unsigned getByValTypeAlignment(Type *Ty,
+ const DataLayout &DL) const override;
/// LowerAsmOperandForConstraint - Lower the specified operand into the Ops
/// vector. If it is invalid, don't add anything to Ops.
@@ -544,8 +547,8 @@ namespace llvm {
std::vector<SDValue> &Ops,
SelectionDAG &DAG) const override;
- unsigned getInlineAsmMemConstraint(
- const std::string &ConstraintCode) const override {
+ unsigned
+ getInlineAsmMemConstraint(StringRef ConstraintCode) const override {
if (ConstraintCode == "es")
return InlineAsm::Constraint_es;
else if (ConstraintCode == "o")
@@ -561,8 +564,8 @@ namespace llvm {
/// isLegalAddressingMode - Return true if the addressing mode represented
/// by AM is legal for this target, for a load/store of the specified type.
- bool isLegalAddressingMode(const AddrMode &AM, Type *Ty,
- unsigned AS) const override;
+ bool isLegalAddressingMode(const DataLayout &DL, const AddrMode &AM,
+ Type *Ty, unsigned AS) const override;
/// isLegalICmpImmediate - Return true if the specified immediate is legal
/// icmp immediate, that is the target has icmp instructions which can
@@ -745,7 +748,7 @@ namespace llvm {
SDLoc dl, SelectionDAG &DAG,
SmallVectorImpl<SDValue> &InVals) const;
SDValue FinishCall(CallingConv::ID CallConv, SDLoc dl, bool isTailCall,
- bool isVarArg, bool IsPatchPoint,
+ bool isVarArg, bool IsPatchPoint, bool hasNest,
SelectionDAG &DAG,
SmallVector<std::pair<unsigned, SDValue>, 8>
&RegsToPass,
diff --git a/lib/Target/PowerPC/PPCInstrInfo.cpp b/lib/Target/PowerPC/PPCInstrInfo.cpp
index 696a83860e53..bf6e40296405 100644
--- a/lib/Target/PowerPC/PPCInstrInfo.cpp
+++ b/lib/Target/PowerPC/PPCInstrInfo.cpp
@@ -57,6 +57,10 @@ static cl::opt<bool> VSXSelfCopyCrash("crash-on-ppc-vsx-self-copy",
cl::desc("Causes the backend to crash instead of generating a nop VSX copy"),
cl::Hidden);
+static cl::opt<bool>
+UseOldLatencyCalc("ppc-old-latency-calc", cl::Hidden,
+ cl::desc("Use the old (incorrect) instruction latency calculation"));
+
// Pin the vtable to this file.
void PPCInstrInfo::anchor() {}
@@ -103,6 +107,35 @@ PPCInstrInfo::CreateTargetPostRAHazardRecognizer(const InstrItineraryData *II,
return new ScoreboardHazardRecognizer(II, DAG);
}
+unsigned PPCInstrInfo::getInstrLatency(const InstrItineraryData *ItinData,
+ const MachineInstr *MI,
+ unsigned *PredCost) const {
+ if (!ItinData || UseOldLatencyCalc)
+ return PPCGenInstrInfo::getInstrLatency(ItinData, MI, PredCost);
+
+ // The default implementation of getInstrLatency calls getStageLatency, but
+ // getStageLatency does not do the right thing for us. While we have
+ // itinerary data, most cores are fully pipelined, and so the itineraries only
+ // express the first part of the pipeline, not every stage. Instead, we need
+ // to use the listed output operand cycle number (using operand 0 here, which
+ // is an output).
+
+ unsigned Latency = 1;
+ unsigned DefClass = MI->getDesc().getSchedClass();
+ for (unsigned i = 0, e = MI->getNumOperands(); i != e; ++i) {
+ const MachineOperand &MO = MI->getOperand(i);
+ if (!MO.isReg() || !MO.isDef() || MO.isImplicit())
+ continue;
+
+ int Cycle = ItinData->getOperandCycle(DefClass, i);
+ if (Cycle < 0)
+ continue;
+
+ Latency = std::max(Latency, (unsigned) Cycle);
+ }
+
+ return Latency;
+}
int PPCInstrInfo::getOperandLatency(const InstrItineraryData *ItinData,
const MachineInstr *DefMI, unsigned DefIdx,
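A quick worked example of the latency change above, using the IIC_FPAddSub itinerary entries added later in this patch: the operand cycle list [5, 1, 1] says the result (operand 0) is available at cycle 5, so the new getInstrLatency reports max(1, 5) = 5 for a floating-point add. The old getStageLatency-based default only counts the listed pipeline stages, which (as the comment notes) cover just the front of the pipeline on these fully pipelined cores, so it would report a latency of roughly 1 and badly underestimate the true result latency.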
diff --git a/lib/Target/PowerPC/PPCInstrInfo.h b/lib/Target/PowerPC/PPCInstrInfo.h
index e2d6346aa532..40badae644d6 100644
--- a/lib/Target/PowerPC/PPCInstrInfo.h
+++ b/lib/Target/PowerPC/PPCInstrInfo.h
@@ -95,6 +95,10 @@ public:
CreateTargetPostRAHazardRecognizer(const InstrItineraryData *II,
const ScheduleDAG *DAG) const override;
+ unsigned getInstrLatency(const InstrItineraryData *ItinData,
+ const MachineInstr *MI,
+ unsigned *PredCost = nullptr) const override;
+
int getOperandLatency(const InstrItineraryData *ItinData,
const MachineInstr *DefMI, unsigned DefIdx,
const MachineInstr *UseMI,
diff --git a/lib/Target/PowerPC/PPCInstrVSX.td b/lib/Target/PowerPC/PPCInstrVSX.td
index 43ba4994fde6..20c95fe888e0 100644
--- a/lib/Target/PowerPC/PPCInstrVSX.td
+++ b/lib/Target/PowerPC/PPCInstrVSX.td
@@ -989,6 +989,18 @@ def : Pat<(int_ppc_vsx_xvdivsp v4f32:$A, v4f32:$B),
def : Pat<(int_ppc_vsx_xvdivdp v2f64:$A, v2f64:$B),
(XVDIVDP $A, $B)>;
+// Reciprocal estimate
+def : Pat<(int_ppc_vsx_xvresp v4f32:$A),
+ (XVRESP $A)>;
+def : Pat<(int_ppc_vsx_xvredp v2f64:$A),
+ (XVREDP $A)>;
+
+// Recip. square root estimate
+def : Pat<(int_ppc_vsx_xvrsqrtesp v4f32:$A),
+ (XVRSQRTESP $A)>;
+def : Pat<(int_ppc_vsx_xvrsqrtedp v2f64:$A),
+ (XVRSQRTEDP $A)>;
+
} // AddedComplexity
} // HasVSX
@@ -1013,6 +1025,9 @@ let AddedComplexity = 400 in { // Prefer VSX patterns over non-VSX patterns.
v4i32:$XB)))]>;
} // isCommutable
+ def : Pat<(int_ppc_vsx_xxleqv v4i32:$A, v4i32:$B),
+ (XXLEQV $A, $B)>;
+
def XXLORC : XX3Form<60, 170,
(outs vsrc:$XT), (ins vsrc:$XA, vsrc:$XB),
"xxlorc $XT, $XA, $XB", IIC_VecGeneral,
diff --git a/lib/Target/PowerPC/PPCRegisterInfo.cpp b/lib/Target/PowerPC/PPCRegisterInfo.cpp
index 656376c641aa..2b09b2f625de 100644
--- a/lib/Target/PowerPC/PPCRegisterInfo.cpp
+++ b/lib/Target/PowerPC/PPCRegisterInfo.cpp
@@ -165,8 +165,7 @@ void PPCRegisterInfo::adjustStackMapLiveOutMask(uint32_t *Mask) const {
BitVector PPCRegisterInfo::getReservedRegs(const MachineFunction &MF) const {
BitVector Reserved(getNumRegs());
const PPCSubtarget &Subtarget = MF.getSubtarget<PPCSubtarget>();
- const PPCFrameLowering *PPCFI =
- static_cast<const PPCFrameLowering *>(Subtarget.getFrameLowering());
+ const PPCFrameLowering *TFI = getFrameLowering(MF);
// The ZERO register is not really a register, but the representation of r0
// when used in instructions that treat r0 as the constant 0.
@@ -209,7 +208,7 @@ BitVector PPCRegisterInfo::getReservedRegs(const MachineFunction &MF) const {
Reserved.set(PPC::X1);
Reserved.set(PPC::X13);
- if (PPCFI->needsFP(MF))
+ if (TFI->needsFP(MF))
Reserved.set(PPC::X31);
if (hasBasePointer(MF))
@@ -230,7 +229,7 @@ BitVector PPCRegisterInfo::getReservedRegs(const MachineFunction &MF) const {
}
}
- if (PPCFI->needsFP(MF))
+ if (TFI->needsFP(MF))
Reserved.set(PPC::R31);
if (hasBasePointer(MF)) {
@@ -256,8 +255,7 @@ BitVector PPCRegisterInfo::getReservedRegs(const MachineFunction &MF) const {
unsigned PPCRegisterInfo::getRegPressureLimit(const TargetRegisterClass *RC,
MachineFunction &MF) const {
- const PPCSubtarget &Subtarget = MF.getSubtarget<PPCSubtarget>();
- const TargetFrameLowering *TFI = Subtarget.getFrameLowering();
+ const PPCFrameLowering *TFI = getFrameLowering(MF);
const unsigned DefaultSafety = 1;
switch (RC->getID()) {
@@ -341,7 +339,8 @@ void PPCRegisterInfo::lowerDynamicAlloc(MachineBasicBlock::iterator II) const {
unsigned FrameSize = MFI->getStackSize();
// Get stack alignments.
- unsigned TargetAlign = Subtarget.getFrameLowering()->getStackAlignment();
+ const PPCFrameLowering *TFI = getFrameLowering(MF);
+ unsigned TargetAlign = TFI->getStackAlignment();
unsigned MaxAlign = MFI->getMaxAlignment();
assert((maxCallFrameSize & (MaxAlign-1)) == 0 &&
"Maximum call-frame size not sufficiently aligned");
@@ -864,8 +863,7 @@ PPCRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator II,
}
unsigned PPCRegisterInfo::getFrameRegister(const MachineFunction &MF) const {
- const PPCSubtarget &Subtarget = MF.getSubtarget<PPCSubtarget>();
- const TargetFrameLowering *TFI = Subtarget.getFrameLowering();
+ const PPCFrameLowering *TFI = getFrameLowering(MF);
if (!TM.isPPC64())
return TFI->hasFP(MF) ? PPC::R31 : PPC::R1;
@@ -908,10 +906,10 @@ bool PPCRegisterInfo::canRealignStack(const MachineFunction &MF) const {
}
bool PPCRegisterInfo::needsStackRealignment(const MachineFunction &MF) const {
- const PPCSubtarget &Subtarget = MF.getSubtarget<PPCSubtarget>();
+ const PPCFrameLowering *TFI = getFrameLowering(MF);
const MachineFrameInfo *MFI = MF.getFrameInfo();
const Function *F = MF.getFunction();
- unsigned StackAlign = Subtarget.getFrameLowering()->getStackAlignment();
+ unsigned StackAlign = TFI->getStackAlignment();
bool requiresRealignment = ((MFI->getMaxAlignment() > StackAlign) ||
F->hasFnAttribute(Attribute::StackAlignment));
@@ -946,11 +944,8 @@ needsFrameBaseReg(MachineInstr *MI, int64_t Offset) const {
MachineBasicBlock &MBB = *MI->getParent();
MachineFunction &MF = *MBB.getParent();
- const PPCSubtarget &Subtarget = MF.getSubtarget<PPCSubtarget>();
- const PPCFrameLowering *PPCFI =
- static_cast<const PPCFrameLowering *>(Subtarget.getFrameLowering());
- unsigned StackEst =
- PPCFI->determineFrameLayout(MF, false, true);
+ const PPCFrameLowering *TFI = getFrameLowering(MF);
+ unsigned StackEst = TFI->determineFrameLayout(MF, false, true);
// If we likely don't need a stack frame, then we probably don't need a
// virtual base register either.
@@ -1034,4 +1029,3 @@ bool PPCRegisterInfo::isFrameOffsetLegal(const MachineInstr *MI,
MI->getOpcode() == TargetOpcode::PATCHPOINT ||
(isInt<16>(Offset) && (!usesIXAddr(*MI) || (Offset & 3) == 0));
}
-
diff --git a/lib/Target/PowerPC/PPCScheduleP7.td b/lib/Target/PowerPC/PPCScheduleP7.td
index 635d154d10bf..267f56726180 100644
--- a/lib/Target/PowerPC/PPCScheduleP7.td
+++ b/lib/Target/PowerPC/PPCScheduleP7.td
@@ -315,6 +315,10 @@ def P7Itineraries : ProcessorItineraries<
P7_DU3, P7_DU4], 0>,
InstrStage<1, [P7_VS1, P7_VS2]>],
[5, 1, 1]>,
+ InstrItinData<IIC_FPAddSub , [InstrStage<1, [P7_DU1, P7_DU2,
+ P7_DU3, P7_DU4], 0>,
+ InstrStage<1, [P7_VS1, P7_VS2]>],
+ [5, 1, 1]>,
InstrItinData<IIC_FPCompare , [InstrStage<1, [P7_DU1, P7_DU2,
P7_DU3, P7_DU4], 0>,
InstrStage<1, [P7_VS1, P7_VS2]>],
diff --git a/lib/Target/PowerPC/PPCScheduleP8.td b/lib/Target/PowerPC/PPCScheduleP8.td
index 020739baec3a..69e6d05c6604 100644
--- a/lib/Target/PowerPC/PPCScheduleP8.td
+++ b/lib/Target/PowerPC/PPCScheduleP8.td
@@ -323,6 +323,10 @@ def P8Itineraries : ProcessorItineraries<
P8_DU4, P8_DU5, P8_DU6], 0>,
InstrStage<1, [P8_FPU1, P8_FPU2]>],
[5, 1, 1]>,
+ InstrItinData<IIC_FPAddSub , [InstrStage<1, [P8_DU1, P8_DU2, P8_DU3,
+ P8_DU4, P8_DU5, P8_DU6], 0>,
+ InstrStage<1, [P8_FPU1, P8_FPU2]>],
+ [5, 1, 1]>,
InstrItinData<IIC_FPCompare , [InstrStage<1, [P8_DU1, P8_DU2, P8_DU3,
P8_DU4, P8_DU5, P8_DU6], 0>,
InstrStage<1, [P8_FPU1, P8_FPU2]>],
diff --git a/lib/Target/PowerPC/PPCSelectionDAGInfo.cpp b/lib/Target/PowerPC/PPCSelectionDAGInfo.cpp
deleted file mode 100644
index dc1674214769..000000000000
--- a/lib/Target/PowerPC/PPCSelectionDAGInfo.cpp
+++ /dev/null
@@ -1,22 +0,0 @@
-//===-- PPCSelectionDAGInfo.cpp - PowerPC SelectionDAG Info ---------------===//
-//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
-//
-//===----------------------------------------------------------------------===//
-//
-// This file implements the PPCSelectionDAGInfo class.
-//
-//===----------------------------------------------------------------------===//
-
-#include "PPCTargetMachine.h"
-using namespace llvm;
-
-#define DEBUG_TYPE "powerpc-selectiondag-info"
-
-PPCSelectionDAGInfo::PPCSelectionDAGInfo(const DataLayout *DL)
- : TargetSelectionDAGInfo(DL) {}
-
-PPCSelectionDAGInfo::~PPCSelectionDAGInfo() {}
diff --git a/lib/Target/PowerPC/PPCSelectionDAGInfo.h b/lib/Target/PowerPC/PPCSelectionDAGInfo.h
deleted file mode 100644
index 2c1378d5670d..000000000000
--- a/lib/Target/PowerPC/PPCSelectionDAGInfo.h
+++ /dev/null
@@ -1,31 +0,0 @@
-//===-- PPCSelectionDAGInfo.h - PowerPC SelectionDAG Info -------*- C++ -*-===//
-//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
-//
-//===----------------------------------------------------------------------===//
-//
-// This file defines the PowerPC subclass for TargetSelectionDAGInfo.
-//
-//===----------------------------------------------------------------------===//
-
-#ifndef LLVM_LIB_TARGET_POWERPC_PPCSELECTIONDAGINFO_H
-#define LLVM_LIB_TARGET_POWERPC_PPCSELECTIONDAGINFO_H
-
-#include "llvm/Target/TargetSelectionDAGInfo.h"
-
-namespace llvm {
-
-class PPCTargetMachine;
-
-class PPCSelectionDAGInfo : public TargetSelectionDAGInfo {
-public:
- explicit PPCSelectionDAGInfo(const DataLayout *DL);
- ~PPCSelectionDAGInfo();
-};
-
-}
-
-#endif
diff --git a/lib/Target/PowerPC/PPCSubtarget.cpp b/lib/Target/PowerPC/PPCSubtarget.cpp
index cf603fe17723..58daccae90f2 100644
--- a/lib/Target/PowerPC/PPCSubtarget.cpp
+++ b/lib/Target/PowerPC/PPCSubtarget.cpp
@@ -53,7 +53,7 @@ PPCSubtarget::PPCSubtarget(const Triple &TT, const std::string &CPU,
IsPPC64(TargetTriple.getArch() == Triple::ppc64 ||
TargetTriple.getArch() == Triple::ppc64le),
TM(TM), FrameLowering(initializeSubtargetDependencies(CPU, FS)),
- InstrInfo(*this), TLInfo(TM, *this), TSInfo(TM.getDataLayout()) {}
+ InstrInfo(*this), TLInfo(TM, *this) {}
void PPCSubtarget::initializeEnvironment() {
StackAlignment = 16;
diff --git a/lib/Target/PowerPC/PPCSubtarget.h b/lib/Target/PowerPC/PPCSubtarget.h
index e9cc3d4bd5bc..0616c1f65604 100644
--- a/lib/Target/PowerPC/PPCSubtarget.h
+++ b/lib/Target/PowerPC/PPCSubtarget.h
@@ -17,10 +17,10 @@
#include "PPCFrameLowering.h"
#include "PPCISelLowering.h"
#include "PPCInstrInfo.h"
-#include "PPCSelectionDAGInfo.h"
#include "llvm/ADT/Triple.h"
#include "llvm/IR/DataLayout.h"
#include "llvm/MC/MCInstrItineraries.h"
+#include "llvm/Target/TargetSelectionDAGInfo.h"
#include "llvm/Target/TargetSubtargetInfo.h"
#include <string>
@@ -129,7 +129,7 @@ protected:
PPCFrameLowering FrameLowering;
PPCInstrInfo InstrInfo;
PPCTargetLowering TLInfo;
- PPCSelectionDAGInfo TSInfo;
+ TargetSelectionDAGInfo TSInfo;
public:
/// This constructor initializes the data members to match that
@@ -164,7 +164,7 @@ public:
const PPCTargetLowering *getTargetLowering() const override {
return &TLInfo;
}
- const PPCSelectionDAGInfo *getSelectionDAGInfo() const override {
+ const TargetSelectionDAGInfo *getSelectionDAGInfo() const override {
return &TSInfo;
}
const PPCRegisterInfo *getRegisterInfo() const override {
diff --git a/lib/Target/PowerPC/PPCTargetMachine.cpp b/lib/Target/PowerPC/PPCTargetMachine.cpp
index 074bc870751a..1daf244fed44 100644
--- a/lib/Target/PowerPC/PPCTargetMachine.cpp
+++ b/lib/Target/PowerPC/PPCTargetMachine.cpp
@@ -172,7 +172,26 @@ PPCTargetMachine::PPCTargetMachine(const Target &T, const Triple &TT,
: LLVMTargetMachine(T, getDataLayoutString(TT), TT, CPU,
computeFSAdditions(FS, OL, TT), Options, RM, CM, OL),
TLOF(createTLOF(getTargetTriple())),
- TargetABI(computeTargetABI(TT, Options)) {
+ TargetABI(computeTargetABI(TT, Options)),
+ Subtarget(TargetTriple, CPU, computeFSAdditions(FS, OL, TT), *this) {
+
+ // For the estimates, convergence is quadratic, so we essentially double the
+ // number of digits correct after every iteration. For both FRE and FRSQRTE,
+ // the minimum architected relative accuracy is 2^-5. When hasRecipPrec(),
+ // this is 2^-14. IEEE float has 23 digits and double has 52 digits.
+ unsigned RefinementSteps = Subtarget.hasRecipPrec() ? 1 : 3,
+ RefinementSteps64 = RefinementSteps + 1;
+
+ this->Options.Reciprocals.setDefaults("sqrtf", true, RefinementSteps);
+ this->Options.Reciprocals.setDefaults("vec-sqrtf", true, RefinementSteps);
+ this->Options.Reciprocals.setDefaults("divf", true, RefinementSteps);
+ this->Options.Reciprocals.setDefaults("vec-divf", true, RefinementSteps);
+
+ this->Options.Reciprocals.setDefaults("sqrtd", true, RefinementSteps64);
+ this->Options.Reciprocals.setDefaults("vec-sqrtd", true, RefinementSteps64);
+ this->Options.Reciprocals.setDefaults("divd", true, RefinementSteps64);
+ this->Options.Reciprocals.setDefaults("vec-divd", true, RefinementSteps64);
+
initAsmInfo();
}
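To spell out the arithmetic behind these defaults: each Newton-Raphson refinement step roughly squares the relative error. From the architected 2^-5 estimate, three steps give 2^-10, then 2^-20, then 2^-40, which is below the ~2^-24 needed for the float significand, and one additional step reaches 2^-80, below the ~2^-53 needed for double; that is why RefinementSteps64 = RefinementSteps + 1. With hasRecipPrec() the estimate starts at 2^-14, so one step (2^-28) suffices for float and two steps (2^-56) for double.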
diff --git a/lib/Target/PowerPC/PPCTargetMachine.h b/lib/Target/PowerPC/PPCTargetMachine.h
index 5c0f7e629a69..6496339519a1 100644
--- a/lib/Target/PowerPC/PPCTargetMachine.h
+++ b/lib/Target/PowerPC/PPCTargetMachine.h
@@ -29,6 +29,8 @@ public:
private:
std::unique_ptr<TargetLoweringObjectFile> TLOF;
PPCABI TargetABI;
+ PPCSubtarget Subtarget;
+
mutable StringMap<std::unique_ptr<PPCSubtarget>> SubtargetMap;
public:
diff --git a/lib/Target/PowerPC/PPCTargetTransformInfo.cpp b/lib/Target/PowerPC/PPCTargetTransformInfo.cpp
index 25d563a7d975..e21c2b77f4d7 100644
--- a/lib/Target/PowerPC/PPCTargetTransformInfo.cpp
+++ b/lib/Target/PowerPC/PPCTargetTransformInfo.cpp
@@ -317,7 +317,7 @@ unsigned PPCTTIImpl::getMemoryOpCost(unsigned Opcode, Type *Src,
unsigned Alignment,
unsigned AddressSpace) {
// Legalize the type.
- std::pair<unsigned, MVT> LT = TLI->getTypeLegalizationCost(Src);
+ std::pair<unsigned, MVT> LT = TLI->getTypeLegalizationCost(DL, Src);
assert((Opcode == Instruction::Load || Opcode == Instruction::Store) &&
"Invalid Opcode");
diff --git a/lib/Target/PowerPC/PPCTargetTransformInfo.h b/lib/Target/PowerPC/PPCTargetTransformInfo.h
index 35e7a1497c83..368bef93f0dd 100644
--- a/lib/Target/PowerPC/PPCTargetTransformInfo.h
+++ b/lib/Target/PowerPC/PPCTargetTransformInfo.h
@@ -38,7 +38,8 @@ class PPCTTIImpl : public BasicTTIImplBase<PPCTTIImpl> {
public:
explicit PPCTTIImpl(const PPCTargetMachine *TM, Function &F)
- : BaseT(TM), ST(TM->getSubtargetImpl(F)), TLI(ST->getTargetLowering()) {}
+ : BaseT(TM, F.getParent()->getDataLayout()), ST(TM->getSubtargetImpl(F)),
+ TLI(ST->getTargetLowering()) {}
// Provide value semantics. MSVC requires that we spell all of these out.
PPCTTIImpl(const PPCTTIImpl &Arg)
@@ -46,18 +47,6 @@ public:
PPCTTIImpl(PPCTTIImpl &&Arg)
: BaseT(std::move(static_cast<BaseT &>(Arg))), ST(std::move(Arg.ST)),
TLI(std::move(Arg.TLI)) {}
- PPCTTIImpl &operator=(const PPCTTIImpl &RHS) {
- BaseT::operator=(static_cast<const BaseT &>(RHS));
- ST = RHS.ST;
- TLI = RHS.TLI;
- return *this;
- }
- PPCTTIImpl &operator=(PPCTTIImpl &&RHS) {
- BaseT::operator=(std::move(static_cast<BaseT &>(RHS)));
- ST = std::move(RHS.ST);
- TLI = std::move(RHS.TLI);
- return *this;
- }
/// \name Scalar TTI Implementations
/// @{
diff --git a/lib/Target/PowerPC/PPCVSXFMAMutate.cpp b/lib/Target/PowerPC/PPCVSXFMAMutate.cpp
index f352fa647ace..58d3c3d3fa2e 100644
--- a/lib/Target/PowerPC/PPCVSXFMAMutate.cpp
+++ b/lib/Target/PowerPC/PPCVSXFMAMutate.cpp
@@ -136,6 +136,16 @@ protected:
// source of the copy, it must still be live here. We can't use
// interval testing for a physical register, so as long as we're
// walking the MIs we may as well test liveness here.
+ //
+ // FIXME: There is a case that occurs in practice, like this:
+ // %vreg9<def> = COPY %F1; VSSRC:%vreg9
+ // ...
+ // %vreg6<def> = COPY %vreg9; VSSRC:%vreg6,%vreg9
+ // %vreg7<def> = COPY %vreg9; VSSRC:%vreg7,%vreg9
+ // %vreg9<def,tied1> = XSMADDASP %vreg9<tied0>, %vreg1, %vreg4; VSSRC:
+ // %vreg6<def,tied1> = XSMADDASP %vreg6<tied0>, %vreg1, %vreg2; VSSRC:
+ // %vreg7<def,tied1> = XSMADDASP %vreg7<tied0>, %vreg1, %vreg3; VSSRC:
+ // which prevents an otherwise-profitable transformation.
bool OtherUsers = false, KillsAddendSrc = false;
for (auto J = std::prev(I), JE = MachineBasicBlock::iterator(AddendMI);
J != JE; --J) {
diff --git a/lib/Target/PowerPC/PPCVSXSwapRemoval.cpp b/lib/Target/PowerPC/PPCVSXSwapRemoval.cpp
index e7ab71ac2106..3fb1dcc3d4af 100644
--- a/lib/Target/PowerPC/PPCVSXSwapRemoval.cpp
+++ b/lib/Target/PowerPC/PPCVSXSwapRemoval.cpp
@@ -80,6 +80,7 @@ struct PPCVSXSwapEntry {
unsigned int IsSwap : 1;
unsigned int MentionsPhysVR : 1;
unsigned int IsSwappable : 1;
+ unsigned int MentionsPartialVR : 1;
unsigned int SpecialHandling : 3;
unsigned int WebRejected : 1;
unsigned int WillRemove : 1;
@@ -91,7 +92,9 @@ enum SHValues {
SH_INSERT,
SH_NOSWAP_LD,
SH_NOSWAP_ST,
- SH_SPLAT
+ SH_SPLAT,
+ SH_XXPERMDI,
+ SH_COPYSCALAR
};
struct PPCVSXSwapRemoval : public MachineFunctionPass {
@@ -167,6 +170,21 @@ private:
isRegInClass(Reg, &PPC::VRRCRegClass));
}
+ // Return true iff the given register is a partial vector register.
+ bool isScalarVecReg(unsigned Reg) {
+ return (isRegInClass(Reg, &PPC::VSFRCRegClass) ||
+ isRegInClass(Reg, &PPC::VSSRCRegClass));
+ }
+
+ // Return true iff the given register mentions all or part of a
+ // vector register. Also sets Partial to true if the mention
+ // is for just the floating-point register overlap of the register.
+ bool isAnyVecReg(unsigned Reg, bool &Partial) {
+ if (isScalarVecReg(Reg))
+ Partial = true;
+ return isScalarVecReg(Reg) || isVecReg(Reg);
+ }
+
public:
// Main entry point for this pass.
bool runOnMachineFunction(MachineFunction &MF) override {
@@ -223,12 +241,13 @@ bool PPCVSXSwapRemoval::gatherVectorInstructions() {
for (MachineInstr &MI : MBB) {
bool RelevantInstr = false;
+ bool Partial = false;
for (const MachineOperand &MO : MI.operands()) {
if (!MO.isReg())
continue;
unsigned Reg = MO.getReg();
- if (isVecReg(Reg)) {
+ if (isAnyVecReg(Reg, Partial)) {
RelevantInstr = true;
break;
}
@@ -250,8 +269,13 @@ bool PPCVSXSwapRemoval::gatherVectorInstructions() {
// Unless noted otherwise, an instruction is considered
// safe for the optimization. There are a large number of
// such true-SIMD instructions (all vector math, logical,
- // select, compare, etc.).
- SwapVector[VecIdx].IsSwappable = 1;
+ // select, compare, etc.). However, if the instruction
+ // mentions a partial vector register and does not have
+ // special handling defined, it is not swappable.
+ if (Partial)
+ SwapVector[VecIdx].MentionsPartialVR = 1;
+ else
+ SwapVector[VecIdx].IsSwappable = 1;
break;
case PPC::XXPERMDI: {
// This is a swap if it is of the form XXPERMDI t, s, s, 2.
@@ -269,25 +293,37 @@ bool PPCVSXSwapRemoval::gatherVectorInstructions() {
VecIdx);
if (trueReg1 == trueReg2)
SwapVector[VecIdx].IsSwap = 1;
- }
+ else {
+ // We can still handle these if the two registers are not
+ // identical, by adjusting the form of the XXPERMDI.
+ SwapVector[VecIdx].IsSwappable = 1;
+ SwapVector[VecIdx].SpecialHandling = SHValues::SH_XXPERMDI;
+ }
// This is a doubleword splat if it is of the form
// XXPERMDI t, s, s, 0 or XXPERMDI t, s, s, 3. As above we
// must look through chains of copy-likes to find the source
// register. We turn off the marking for mention of a physical
// register, because splatting it is safe; the optimization
- // will not swap the value in the physical register.
- else if (immed == 0 || immed == 3) {
+ // will not swap the value in the physical register. Whether
+ // or not the two input registers are identical, we can handle
+ // these by adjusting the form of the XXPERMDI.
+ } else if (immed == 0 || immed == 3) {
+
+ SwapVector[VecIdx].IsSwappable = 1;
+ SwapVector[VecIdx].SpecialHandling = SHValues::SH_XXPERMDI;
+
unsigned trueReg1 = lookThruCopyLike(MI.getOperand(1).getReg(),
VecIdx);
unsigned trueReg2 = lookThruCopyLike(MI.getOperand(2).getReg(),
VecIdx);
- if (trueReg1 == trueReg2) {
- SwapVector[VecIdx].IsSwappable = 1;
+ if (trueReg1 == trueReg2)
SwapVector[VecIdx].MentionsPhysVR = 0;
- }
+
+ } else {
+ // We can still handle these by adjusting the form of the XXPERMDI.
+ SwapVector[VecIdx].IsSwappable = 1;
+ SwapVector[VecIdx].SpecialHandling = SHValues::SH_XXPERMDI;
}
- // Any other form of XXPERMDI is lane-sensitive and unsafe
- // for the optimization.
break;
}
case PPC::LVX:
@@ -324,7 +360,32 @@ bool PPCVSXSwapRemoval::gatherVectorInstructions() {
if (isVecReg(MI.getOperand(0).getReg()) &&
isVecReg(MI.getOperand(1).getReg()))
SwapVector[VecIdx].IsSwappable = 1;
+ // If we have a copy from one scalar floating-point register
+ // to another, we can accept this even if it is a physical
+ // register. The only way this gets involved is if it feeds
+ // a SUBREG_TO_REG, which is handled by introducing a swap.
+ else if (isScalarVecReg(MI.getOperand(0).getReg()) &&
+ isScalarVecReg(MI.getOperand(1).getReg()))
+ SwapVector[VecIdx].IsSwappable = 1;
+ break;
+ case PPC::SUBREG_TO_REG: {
+ // These are fine provided they are moving between full vector
+ // register classes. If they are moving from a scalar
+ // floating-point class to a vector class, we can handle those
+ // as well, provided we introduce a swap. It is generally the
+ // case that we will introduce fewer swaps than we remove, but
+ // (FIXME) a cost model could be used. However, introduced
+ // swaps could potentially be CSEd, so this is not trivial.
+ if (isVecReg(MI.getOperand(0).getReg()) &&
+ isVecReg(MI.getOperand(2).getReg()))
+ SwapVector[VecIdx].IsSwappable = 1;
+ else if (isVecReg(MI.getOperand(0).getReg()) &&
+ isScalarVecReg(MI.getOperand(2).getReg())) {
+ SwapVector[VecIdx].IsSwappable = 1;
+ SwapVector[VecIdx].SpecialHandling = SHValues::SH_COPYSCALAR;
+ }
break;
+ }
case PPC::VSPLTB:
case PPC::VSPLTH:
case PPC::VSPLTW:
@@ -425,6 +486,10 @@ bool PPCVSXSwapRemoval::gatherVectorInstructions() {
case PPC::VUPKLSW:
case PPC::XXMRGHW:
case PPC::XXMRGLW:
+ // XXSLDWI could be replaced by a general permute with one of three
+ // permute control vectors (for shift values 1, 2, 3). However,
+ // VPERM has a more restrictive register class.
+ case PPC::XXSLDWI:
case PPC::XXSPLTW:
break;
}
@@ -501,18 +566,20 @@ void PPCVSXSwapRemoval::formWebs() {
DEBUG(MI->dump());
// It's sufficient to walk vector uses and join them to their unique
- // definitions. In addition, check *all* vector register operands
- // for physical regs.
+ // definitions. In addition, check full vector register operands
+ // for physical regs. We exclude partial-vector register operands
+ // because we can handle them if copied to a full vector.
for (const MachineOperand &MO : MI->operands()) {
if (!MO.isReg())
continue;
unsigned Reg = MO.getReg();
- if (!isVecReg(Reg))
+ if (!isVecReg(Reg) && !isScalarVecReg(Reg))
continue;
if (!TargetRegisterInfo::isVirtualRegister(Reg)) {
- SwapVector[EntryIdx].MentionsPhysVR = 1;
+ if (!(MI->isCopy() && isScalarVecReg(Reg)))
+ SwapVector[EntryIdx].MentionsPhysVR = 1;
continue;
}
@@ -545,15 +612,21 @@ void PPCVSXSwapRemoval::recordUnoptimizableWebs() {
for (unsigned EntryIdx = 0; EntryIdx < SwapVector.size(); ++EntryIdx) {
int Repr = EC->getLeaderValue(SwapVector[EntryIdx].VSEId);
- // Reject webs containing mentions of physical registers, or containing
- // operations that we don't know how to handle in a lane-permuted region.
+ // If the representative is already rejected, don't waste further time.
+ if (SwapVector[Repr].WebRejected)
+ continue;
+
+ // Reject webs containing mentions of physical or partial registers, or
+ // containing operations that we don't know how to handle in a lane-
+ // permuted region.
if (SwapVector[EntryIdx].MentionsPhysVR ||
+ SwapVector[EntryIdx].MentionsPartialVR ||
!(SwapVector[EntryIdx].IsSwappable || SwapVector[EntryIdx].IsSwap)) {
SwapVector[Repr].WebRejected = 1;
DEBUG(dbgs() <<
- format("Web %d rejected for physreg, subreg, or not swap[pable]\n",
+ format("Web %d rejected for physreg, partial reg, or not swap[pable]\n",
Repr));
DEBUG(dbgs() << " in " << EntryIdx << ": ");
DEBUG(SwapVector[EntryIdx].VSEMI->dump());
@@ -588,7 +661,7 @@ void PPCVSXSwapRemoval::recordUnoptimizableWebs() {
}
}
- // Reject webs than contain swapping stores that are fed by something
+ // Reject webs that contain swapping stores that are fed by something
// other than a swap instruction.
} else if (SwapVector[EntryIdx].IsStore && SwapVector[EntryIdx].IsSwap) {
MachineInstr *MI = SwapVector[EntryIdx].VSEMI;
@@ -670,7 +743,8 @@ void PPCVSXSwapRemoval::markSwapsForRemoval() {
// The identified swap entry requires special handling to allow its
// containing computation to be optimized. Perform that handling
// here.
-// FIXME: This code is to be phased in with subsequent patches.
+// FIXME: Additional opportunities will be phased in with subsequent
+// patches.
void PPCVSXSwapRemoval::handleSpecialSwappables(int EntryIdx) {
switch (SwapVector[EntryIdx].SpecialHandling) {
@@ -704,6 +778,91 @@ void PPCVSXSwapRemoval::handleSpecialSwappables(int EntryIdx) {
break;
}
+ // For an XXPERMDI that isn't handled otherwise, we need to
+ // reverse the order of the operands. If the selector operand
+ // has a value of 0 or 3, we need to change it to 3 or 0,
+ // respectively. Otherwise we should leave it alone. (This
+ // is equivalent to reversing the two bits of the selector
+ // operand and complementing the result.)
+ case SHValues::SH_XXPERMDI: {
+ MachineInstr *MI = SwapVector[EntryIdx].VSEMI;
+
+ DEBUG(dbgs() << "Changing XXPERMDI: ");
+ DEBUG(MI->dump());
+
+ unsigned Selector = MI->getOperand(3).getImm();
+ if (Selector == 0 || Selector == 3)
+ Selector = 3 - Selector;
+ MI->getOperand(3).setImm(Selector);
+
+ unsigned Reg1 = MI->getOperand(1).getReg();
+ unsigned Reg2 = MI->getOperand(2).getReg();
+ MI->getOperand(1).setReg(Reg2);
+ MI->getOperand(2).setReg(Reg1);
+
+ DEBUG(dbgs() << " Into: ");
+ DEBUG(MI->dump());
+ break;
+ }
+
+ // For a copy from a scalar floating-point register to a vector
+ // register, removing swaps will leave the copied value in the
+ // wrong lane. Insert a swap following the copy to fix this.
+ case SHValues::SH_COPYSCALAR: {
+ MachineInstr *MI = SwapVector[EntryIdx].VSEMI;
+
+ DEBUG(dbgs() << "Changing SUBREG_TO_REG: ");
+ DEBUG(MI->dump());
+
+ unsigned DstReg = MI->getOperand(0).getReg();
+ const TargetRegisterClass *DstRC = MRI->getRegClass(DstReg);
+ unsigned NewVReg = MRI->createVirtualRegister(DstRC);
+
+ MI->getOperand(0).setReg(NewVReg);
+ DEBUG(dbgs() << " Into: ");
+ DEBUG(MI->dump());
+
+ MachineBasicBlock::iterator InsertPoint = MI->getNextNode();
+
+ // Note that an XXPERMDI requires a VSRC, so if the SUBREG_TO_REG
+ // is copying to a VRRC, we need to be careful to avoid a register
+ // assignment problem. In this case we must copy from VRRC to VSRC
+ // prior to the swap, and from VSRC to VRRC following the swap.
+ // Coalescing will usually remove all this mess.
+
+ if (DstRC == &PPC::VRRCRegClass) {
+ unsigned VSRCTmp1 = MRI->createVirtualRegister(&PPC::VSRCRegClass);
+ unsigned VSRCTmp2 = MRI->createVirtualRegister(&PPC::VSRCRegClass);
+
+ BuildMI(*MI->getParent(), InsertPoint, MI->getDebugLoc(),
+ TII->get(PPC::COPY), VSRCTmp1)
+ .addReg(NewVReg);
+ DEBUG(MI->getNextNode()->dump());
+
+ BuildMI(*MI->getParent(), InsertPoint, MI->getDebugLoc(),
+ TII->get(PPC::XXPERMDI), VSRCTmp2)
+ .addReg(VSRCTmp1)
+ .addReg(VSRCTmp1)
+ .addImm(2);
+ DEBUG(MI->getNextNode()->getNextNode()->dump());
+
+ BuildMI(*MI->getParent(), InsertPoint, MI->getDebugLoc(),
+ TII->get(PPC::COPY), DstReg)
+ .addReg(VSRCTmp2);
+ DEBUG(MI->getNextNode()->getNextNode()->getNextNode()->dump());
+
+ } else {
+
+ BuildMI(*MI->getParent(), InsertPoint, MI->getDebugLoc(),
+ TII->get(PPC::XXPERMDI), DstReg)
+ .addReg(NewVReg)
+ .addReg(NewVReg)
+ .addImm(2);
+
+ DEBUG(MI->getNextNode()->dump());
+ }
+ break;
+ }
}
}
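A small check of the selector rewrite in the SH_XXPERMDI case above, treating the selector as two bits that are reversed and then complemented:

    selector 0 = 0b00 -> reversed 0b00 -> complemented 0b11 = 3
    selector 1 = 0b01 -> reversed 0b10 -> complemented 0b01 = 1
    selector 2 = 0b10 -> reversed 0b01 -> complemented 0b10 = 2
    selector 3 = 0b11 -> reversed 0b11 -> complemented 0b00 = 0

so 0 and 3 swap while 1 and 2 are left alone, exactly as the comment states. The SH_COPYSCALAR case leans on the same doubleword-swap idiom used throughout this pass: XXPERMDI t, s, s, 2 exchanges the two doublewords, which puts the copied scalar back in the lane the swap-removed code expects.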
@@ -756,6 +915,8 @@ void PPCVSXSwapRemoval::dumpSwapVector() {
DEBUG(dbgs() << "swap ");
if (SwapVector[EntryIdx].MentionsPhysVR)
DEBUG(dbgs() << "physreg ");
+ if (SwapVector[EntryIdx].MentionsPartialVR)
+ DEBUG(dbgs() << "partialreg ");
if (SwapVector[EntryIdx].IsSwappable) {
DEBUG(dbgs() << "swappable ");
@@ -780,6 +941,12 @@ void PPCVSXSwapRemoval::dumpSwapVector() {
case SH_SPLAT:
DEBUG(dbgs() << "special:splat ");
break;
+ case SH_XXPERMDI:
+ DEBUG(dbgs() << "special:xxpermdi ");
+ break;
+ case SH_COPYSCALAR:
+ DEBUG(dbgs() << "special:copyscalar ");
+ break;
}
}
diff --git a/lib/Target/Sparc/CMakeLists.txt b/lib/Target/Sparc/CMakeLists.txt
index c486411f9a1e..5b7bfdd28020 100644
--- a/lib/Target/Sparc/CMakeLists.txt
+++ b/lib/Target/Sparc/CMakeLists.txt
@@ -22,7 +22,6 @@ add_llvm_target(SparcCodeGen
SparcRegisterInfo.cpp
SparcSubtarget.cpp
SparcTargetMachine.cpp
- SparcSelectionDAGInfo.cpp
SparcMCInstLower.cpp
SparcTargetObjectFile.cpp
)
diff --git a/lib/Target/Sparc/MCTargetDesc/SparcMCTargetDesc.cpp b/lib/Target/Sparc/MCTargetDesc/SparcMCTargetDesc.cpp
index 91d2eeef0cc0..9113e4a46b96 100644
--- a/lib/Target/Sparc/MCTargetDesc/SparcMCTargetDesc.cpp
+++ b/lib/Target/Sparc/MCTargetDesc/SparcMCTargetDesc.cpp
@@ -57,7 +57,7 @@ static MCInstrInfo *createSparcMCInstrInfo() {
return X;
}
-static MCRegisterInfo *createSparcMCRegisterInfo(StringRef TT) {
+static MCRegisterInfo *createSparcMCRegisterInfo(const Triple &TT) {
MCRegisterInfo *X = new MCRegisterInfo();
InitSparcMCRegisterInfo(X, SP::O7);
return X;
@@ -65,11 +65,9 @@ static MCRegisterInfo *createSparcMCRegisterInfo(StringRef TT) {
static MCSubtargetInfo *
createSparcMCSubtargetInfo(const Triple &TT, StringRef CPU, StringRef FS) {
- MCSubtargetInfo *X = new MCSubtargetInfo();
if (CPU.empty())
CPU = (TT.getArch() == Triple::sparcv9) ? "v9" : "v8";
- InitSparcMCSubtargetInfo(X, TT, CPU, FS);
- return X;
+ return createSparcMCSubtargetInfoImpl(TT, CPU, FS);
}
// Code models. Some only make sense for 64-bit code.
@@ -83,7 +81,8 @@ createSparcMCSubtargetInfo(const Triple &TT, StringRef CPU, StringRef FS) {
//
// All code models require that the text segment is smaller than 2GB.
-static MCCodeGenInfo *createSparcMCCodeGenInfo(StringRef TT, Reloc::Model RM,
+static MCCodeGenInfo *createSparcMCCodeGenInfo(const Triple &TT,
+ Reloc::Model RM,
CodeModel::Model CM,
CodeGenOpt::Level OL) {
MCCodeGenInfo *X = new MCCodeGenInfo();
@@ -100,7 +99,8 @@ static MCCodeGenInfo *createSparcMCCodeGenInfo(StringRef TT, Reloc::Model RM,
return X;
}
-static MCCodeGenInfo *createSparcV9MCCodeGenInfo(StringRef TT, Reloc::Model RM,
+static MCCodeGenInfo *createSparcV9MCCodeGenInfo(const Triple &TT,
+ Reloc::Model RM,
CodeModel::Model CM,
CodeGenOpt::Level OL) {
MCCodeGenInfo *X = new MCCodeGenInfo();
diff --git a/lib/Target/Sparc/SparcFrameLowering.cpp b/lib/Target/Sparc/SparcFrameLowering.cpp
index bccc6bdd53eb..8fa10dcae114 100644
--- a/lib/Target/Sparc/SparcFrameLowering.cpp
+++ b/lib/Target/Sparc/SparcFrameLowering.cpp
@@ -190,11 +190,11 @@ static bool LLVM_ATTRIBUTE_UNUSED verifyLeafProcRegUse(MachineRegisterInfo *MRI)
{
for (unsigned reg = SP::I0; reg <= SP::I7; ++reg)
- if (MRI->isPhysRegUsed(reg))
+ if (!MRI->reg_nodbg_empty(reg))
return false;
for (unsigned reg = SP::L0; reg <= SP::L7; ++reg)
- if (MRI->isPhysRegUsed(reg))
+ if (!MRI->reg_nodbg_empty(reg))
return false;
return true;
@@ -206,10 +206,10 @@ bool SparcFrameLowering::isLeafProc(MachineFunction &MF) const
MachineRegisterInfo &MRI = MF.getRegInfo();
MachineFrameInfo *MFI = MF.getFrameInfo();
- return !(MFI->hasCalls() // has calls
- || MRI.isPhysRegUsed(SP::L0) // Too many registers needed
- || MRI.isPhysRegUsed(SP::O6) // %SP is used
- || hasFP(MF)); // need %FP
+ return !(MFI->hasCalls() // has calls
+ || !MRI.reg_nodbg_empty(SP::L0) // Too many registers needed
+ || !MRI.reg_nodbg_empty(SP::O6) // %SP is used
+ || hasFP(MF)); // need %FP
}
void SparcFrameLowering::remapRegsForLeafProc(MachineFunction &MF) const {
@@ -218,16 +218,13 @@ void SparcFrameLowering::remapRegsForLeafProc(MachineFunction &MF) const {
// Remap %i[0-7] to %o[0-7].
for (unsigned reg = SP::I0; reg <= SP::I7; ++reg) {
- if (!MRI.isPhysRegUsed(reg))
+ if (MRI.reg_nodbg_empty(reg))
continue;
unsigned mapped_reg = (reg - SP::I0 + SP::O0);
- assert(!MRI.isPhysRegUsed(mapped_reg));
+ assert(MRI.reg_nodbg_empty(mapped_reg));
// Replace I register with O register.
MRI.replaceRegWith(reg, mapped_reg);
-
- // Mark the reg unused.
- MRI.setPhysRegUnused(reg);
}
// Rewrite MBB's Live-ins.
@@ -247,9 +244,10 @@ void SparcFrameLowering::remapRegsForLeafProc(MachineFunction &MF) const {
#endif
}
-void SparcFrameLowering::processFunctionBeforeCalleeSavedScan
- (MachineFunction &MF, RegScavenger *RS) const {
-
+void SparcFrameLowering::determineCalleeSaves(MachineFunction &MF,
+ BitVector &SavedRegs,
+ RegScavenger *RS) const {
+ TargetFrameLowering::determineCalleeSaves(MF, SavedRegs, RS);
if (!DisableLeafProc && isLeafProc(MF)) {
SparcMachineFunctionInfo *MFI = MF.getInfo<SparcMachineFunctionInfo>();
MFI->setLeafProc(true);
diff --git a/lib/Target/Sparc/SparcFrameLowering.h b/lib/Target/Sparc/SparcFrameLowering.h
index bb3b78861cbd..29fc7b7ba036 100644
--- a/lib/Target/Sparc/SparcFrameLowering.h
+++ b/lib/Target/Sparc/SparcFrameLowering.h
@@ -36,8 +36,8 @@ public:
bool hasReservedCallFrame(const MachineFunction &MF) const override;
bool hasFP(const MachineFunction &MF) const override;
- void processFunctionBeforeCalleeSavedScan(MachineFunction &MF,
- RegScavenger *RS = nullptr) const override;
+ void determineCalleeSaves(MachineFunction &MF, BitVector &SavedRegs,
+ RegScavenger *RS = nullptr) const override;
private:
// Remap input registers to output registers for leaf procedure.
diff --git a/lib/Target/Sparc/SparcISelDAGToDAG.cpp b/lib/Target/Sparc/SparcISelDAGToDAG.cpp
index 9c594a9f0f65..340b72e7940f 100644
--- a/lib/Target/Sparc/SparcISelDAGToDAG.cpp
+++ b/lib/Target/Sparc/SparcISelDAGToDAG.cpp
@@ -67,13 +67,16 @@ private:
SDNode* SparcDAGToDAGISel::getGlobalBaseReg() {
unsigned GlobalBaseReg = Subtarget->getInstrInfo()->getGlobalBaseReg(MF);
- return CurDAG->getRegister(GlobalBaseReg, TLI->getPointerTy()).getNode();
+ return CurDAG->getRegister(GlobalBaseReg,
+ TLI->getPointerTy(CurDAG->getDataLayout()))
+ .getNode();
}
bool SparcDAGToDAGISel::SelectADDRri(SDValue Addr,
SDValue &Base, SDValue &Offset) {
if (FrameIndexSDNode *FIN = dyn_cast<FrameIndexSDNode>(Addr)) {
- Base = CurDAG->getTargetFrameIndex(FIN->getIndex(), TLI->getPointerTy());
+ Base = CurDAG->getTargetFrameIndex(
+ FIN->getIndex(), TLI->getPointerTy(CurDAG->getDataLayout()));
Offset = CurDAG->getTargetConstant(0, SDLoc(Addr), MVT::i32);
return true;
}
@@ -88,8 +91,8 @@ bool SparcDAGToDAGISel::SelectADDRri(SDValue Addr,
if (FrameIndexSDNode *FIN =
dyn_cast<FrameIndexSDNode>(Addr.getOperand(0))) {
// Constant offset from frame ref.
- Base =
- CurDAG->getTargetFrameIndex(FIN->getIndex(), TLI->getPointerTy());
+ Base = CurDAG->getTargetFrameIndex(
+ FIN->getIndex(), TLI->getPointerTy(CurDAG->getDataLayout()));
} else {
Base = Addr.getOperand(0);
}
@@ -134,7 +137,7 @@ bool SparcDAGToDAGISel::SelectADDRrr(SDValue Addr, SDValue &R1, SDValue &R2) {
}
R1 = Addr;
- R2 = CurDAG->getRegister(SP::G0, TLI->getPointerTy());
+ R2 = CurDAG->getRegister(SP::G0, TLI->getPointerTy(CurDAG->getDataLayout()));
return true;
}
@@ -168,10 +171,9 @@ SDNode *SparcDAGToDAGISel::Select(SDNode *N) {
} else {
TopPart = CurDAG->getRegister(SP::G0, MVT::i32);
}
- TopPart = SDValue(CurDAG->getMachineNode(SP::WRASRrr, dl, MVT::i32,
- TopPart,
- CurDAG->getRegister(SP::G0, MVT::i32)), 0);
- TopPart = CurDAG->getCopyToReg(TopPart, dl, SP::Y, TopPart, SDValue()).getValue(1);
+ TopPart = CurDAG->getCopyToReg(CurDAG->getEntryNode(), dl, SP::Y, TopPart,
+ SDValue())
+ .getValue(1);
// FIXME: Handle div by immediate.
unsigned Opcode = N->getOpcode() == ISD::SDIV ? SP::SDIVrr : SP::UDIVrr;
@@ -184,12 +186,11 @@ SDNode *SparcDAGToDAGISel::Select(SDNode *N) {
SDValue MulLHS = N->getOperand(0);
SDValue MulRHS = N->getOperand(1);
unsigned Opcode = N->getOpcode() == ISD::MULHU ? SP::UMULrr : SP::SMULrr;
- SDNode *Mul = CurDAG->getMachineNode(Opcode, dl, MVT::i32, MVT::Glue,
- MulLHS, MulRHS);
- // The high part is in the Y register.
- return CurDAG->SelectNodeTo(N, SP::RDASR, MVT::i32,
- CurDAG->getRegister(SP::Y, MVT::i32),
- SDValue(Mul, 1));
+ SDNode *Mul =
+ CurDAG->getMachineNode(Opcode, dl, MVT::i32, MVT::i32, MulLHS, MulRHS);
+ SDValue ResultHigh = SDValue(Mul, 1);
+ ReplaceUses(SDValue(N, 0), ResultHigh);
+ return nullptr;
}
}
diff --git a/lib/Target/Sparc/SparcISelLowering.cpp b/lib/Target/Sparc/SparcISelLowering.cpp
index 0481676dc1ac..4879d4ee79e5 100644
--- a/lib/Target/Sparc/SparcISelLowering.cpp
+++ b/lib/Target/Sparc/SparcISelLowering.cpp
@@ -221,10 +221,11 @@ SparcTargetLowering::LowerReturn_32(SDValue Chain,
unsigned Reg = SFI->getSRetReturnReg();
if (!Reg)
llvm_unreachable("sret virtual register not created in the entry block");
- SDValue Val = DAG.getCopyFromReg(Chain, DL, Reg, getPointerTy());
+ auto PtrVT = getPointerTy(DAG.getDataLayout());
+ SDValue Val = DAG.getCopyFromReg(Chain, DL, Reg, PtrVT);
Chain = DAG.getCopyToReg(Chain, DL, SP::I0, Val, Flag);
Flag = Chain.getValue(1);
- RetOps.push_back(DAG.getRegister(SP::I0, getPointerTy()));
+ RetOps.push_back(DAG.getRegister(SP::I0, PtrVT));
RetAddrOffset = 12; // CallInst + Delay Slot + Unimp
}
@@ -418,6 +419,7 @@ LowerFormalArguments_32(SDValue Chain,
assert(VA.isMemLoc());
unsigned Offset = VA.getLocMemOffset()+StackOffset;
+ auto PtrVT = getPointerTy(DAG.getDataLayout());
if (VA.needsCustom()) {
assert(VA.getValVT() == MVT::f64);
@@ -426,7 +428,7 @@ LowerFormalArguments_32(SDValue Chain,
int FI = MF.getFrameInfo()->CreateFixedObject(8,
Offset,
true);
- SDValue FIPtr = DAG.getFrameIndex(FI, getPointerTy());
+ SDValue FIPtr = DAG.getFrameIndex(FI, PtrVT);
SDValue Load = DAG.getLoad(VA.getValVT(), dl, Chain, FIPtr,
MachinePointerInfo(),
false,false, false, 0);
@@ -437,14 +439,14 @@ LowerFormalArguments_32(SDValue Chain,
int FI = MF.getFrameInfo()->CreateFixedObject(4,
Offset,
true);
- SDValue FIPtr = DAG.getFrameIndex(FI, getPointerTy());
+ SDValue FIPtr = DAG.getFrameIndex(FI, PtrVT);
SDValue HiVal = DAG.getLoad(MVT::i32, dl, Chain, FIPtr,
MachinePointerInfo(),
false, false, false, 0);
int FI2 = MF.getFrameInfo()->CreateFixedObject(4,
Offset+4,
true);
- SDValue FIPtr2 = DAG.getFrameIndex(FI2, getPointerTy());
+ SDValue FIPtr2 = DAG.getFrameIndex(FI2, PtrVT);
SDValue LoVal = DAG.getLoad(MVT::i32, dl, Chain, FIPtr2,
MachinePointerInfo(),
@@ -460,7 +462,7 @@ LowerFormalArguments_32(SDValue Chain,
int FI = MF.getFrameInfo()->CreateFixedObject(4,
Offset,
true);
- SDValue FIPtr = DAG.getFrameIndex(FI, getPointerTy());
+ SDValue FIPtr = DAG.getFrameIndex(FI, PtrVT);
SDValue Load ;
if (VA.getValVT() == MVT::i32 || VA.getValVT() == MVT::f32) {
Load = DAG.getLoad(VA.getValVT(), dl, Chain, FIPtr,
@@ -607,10 +609,10 @@ LowerFormalArguments_64(SDValue Chain,
if (VA.isExtInLoc())
Offset += 8 - ValSize;
int FI = MF.getFrameInfo()->CreateFixedObject(ValSize, Offset, true);
- InVals.push_back(DAG.getLoad(VA.getValVT(), DL, Chain,
- DAG.getFrameIndex(FI, getPointerTy()),
- MachinePointerInfo::getFixedStack(FI),
- false, false, false, 0));
+ InVals.push_back(DAG.getLoad(
+ VA.getValVT(), DL, Chain,
+ DAG.getFrameIndex(FI, getPointerTy(MF.getDataLayout())),
+ MachinePointerInfo::getFixedStack(FI), false, false, false, 0));
}
if (!IsVarArg)
@@ -637,10 +639,10 @@ LowerFormalArguments_64(SDValue Chain,
unsigned VReg = MF.addLiveIn(SP::I0 + ArgOffset/8, &SP::I64RegsRegClass);
SDValue VArg = DAG.getCopyFromReg(Chain, DL, VReg, MVT::i64);
int FI = MF.getFrameInfo()->CreateFixedObject(8, ArgOffset + ArgArea, true);
- OutChains.push_back(DAG.getStore(Chain, DL, VArg,
- DAG.getFrameIndex(FI, getPointerTy()),
- MachinePointerInfo::getFixedStack(FI),
- false, false, 0));
+ auto PtrVT = getPointerTy(MF.getDataLayout());
+ OutChains.push_back(
+ DAG.getStore(Chain, DL, VArg, DAG.getFrameIndex(FI, PtrVT),
+ MachinePointerInfo::getFixedStack(FI), false, false, 0));
}
if (!OutChains.empty())
@@ -722,7 +724,7 @@ SparcTargetLowering::LowerCall_32(TargetLowering::CallLoweringInfo &CLI,
unsigned Align = Flags.getByValAlign();
int FI = MFI->CreateStackObject(Size, Align, false);
- SDValue FIPtr = DAG.getFrameIndex(FI, getPointerTy());
+ SDValue FIPtr = DAG.getFrameIndex(FI, getPointerTy(DAG.getDataLayout()));
SDValue SizeNode = DAG.getConstant(Size, dl, MVT::i32);
Chain = DAG.getMemcpy(Chain, dl, FIPtr, Arg, SizeNode, Align,
@@ -993,7 +995,7 @@ SparcTargetLowering::getSRetArgSize(SelectionDAG &DAG, SDValue Callee) const
PointerType *Ty = cast<PointerType>(CalleeFn->arg_begin()->getType());
Type *ElementTy = Ty->getElementType();
- return getDataLayout()->getTypeAllocSize(ElementTy);
+ return DAG.getDataLayout().getTypeAllocSize(ElementTy);
}
@@ -1057,6 +1059,7 @@ SparcTargetLowering::LowerCall_64(TargetLowering::CallLoweringInfo &CLI,
SelectionDAG &DAG = CLI.DAG;
SDLoc DL = CLI.DL;
SDValue Chain = CLI.Chain;
+ auto PtrVT = getPointerTy(DAG.getDataLayout());
// Sparc target does not yet support tail call optimization.
CLI.IsTailCall = false;
@@ -1130,13 +1133,11 @@ SparcTargetLowering::LowerCall_64(TargetLowering::CallLoweringInfo &CLI,
// Store and reload into the integer registers reg and reg+1.
unsigned Offset = 8 * (VA.getLocReg() - SP::I0);
unsigned StackOffset = Offset + Subtarget->getStackPointerBias() + 128;
- SDValue StackPtr = DAG.getRegister(SP::O6, getPointerTy());
+ SDValue StackPtr = DAG.getRegister(SP::O6, PtrVT);
SDValue HiPtrOff = DAG.getIntPtrConstant(StackOffset, DL);
- HiPtrOff = DAG.getNode(ISD::ADD, DL, getPointerTy(), StackPtr,
- HiPtrOff);
+ HiPtrOff = DAG.getNode(ISD::ADD, DL, PtrVT, StackPtr, HiPtrOff);
SDValue LoPtrOff = DAG.getIntPtrConstant(StackOffset + 8, DL);
- LoPtrOff = DAG.getNode(ISD::ADD, DL, getPointerTy(), StackPtr,
- LoPtrOff);
+ LoPtrOff = DAG.getNode(ISD::ADD, DL, PtrVT, StackPtr, LoPtrOff);
// Store to %sp+BIAS+128+Offset
SDValue Store = DAG.getStore(Chain, DL, Arg, HiPtrOff,
@@ -1180,13 +1181,13 @@ SparcTargetLowering::LowerCall_64(TargetLowering::CallLoweringInfo &CLI,
assert(VA.isMemLoc());
// Create a store off the stack pointer for this argument.
- SDValue StackPtr = DAG.getRegister(SP::O6, getPointerTy());
+ SDValue StackPtr = DAG.getRegister(SP::O6, PtrVT);
// The argument area starts at %fp+BIAS+128 in the callee frame,
// %sp+BIAS+128 in ours.
SDValue PtrOff = DAG.getIntPtrConstant(VA.getLocMemOffset() +
Subtarget->getStackPointerBias() +
128, DL);
- PtrOff = DAG.getNode(ISD::ADD, DL, getPointerTy(), StackPtr, PtrOff);
+ PtrOff = DAG.getNode(ISD::ADD, DL, PtrVT, StackPtr, PtrOff);
MemOpChains.push_back(DAG.getStore(Chain, DL, Arg, PtrOff,
MachinePointerInfo(),
false, false, 0));
@@ -1215,10 +1216,9 @@ SparcTargetLowering::LowerCall_64(TargetLowering::CallLoweringInfo &CLI,
unsigned TF = ((getTargetMachine().getRelocationModel() == Reloc::PIC_)
? SparcMCExpr::VK_Sparc_WPLT30 : 0);
if (GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee))
- Callee = DAG.getTargetGlobalAddress(G->getGlobal(), DL, getPointerTy(), 0,
- TF);
+ Callee = DAG.getTargetGlobalAddress(G->getGlobal(), DL, PtrVT, 0, TF);
else if (ExternalSymbolSDNode *E = dyn_cast<ExternalSymbolSDNode>(Callee))
- Callee = DAG.getTargetExternalSymbol(E->getSymbol(), getPointerTy(), TF);
+ Callee = DAG.getTargetExternalSymbol(E->getSymbol(), PtrVT, TF);
// Build the operands for the call instruction itself.
SmallVector<SDValue, 8> Ops;
@@ -1370,6 +1370,8 @@ static SPCC::CondCodes FPCondCCodeToFCC(ISD::CondCode CC) {
SparcTargetLowering::SparcTargetLowering(TargetMachine &TM,
const SparcSubtarget &STI)
: TargetLowering(TM), Subtarget(&STI) {
+ auto &DL = *TM.getDataLayout();
+
// Set up the register classes.
addRegisterClass(MVT::i32, &SP::IntRegsRegClass);
addRegisterClass(MVT::f32, &SP::FPRegsRegClass);
@@ -1394,10 +1396,10 @@ SparcTargetLowering::SparcTargetLowering(TargetMachine &TM,
setTruncStoreAction(MVT::f128, MVT::f64, Expand);
// Custom legalize GlobalAddress nodes into LO/HI parts.
- setOperationAction(ISD::GlobalAddress, getPointerTy(), Custom);
- setOperationAction(ISD::GlobalTLSAddress, getPointerTy(), Custom);
- setOperationAction(ISD::ConstantPool, getPointerTy(), Custom);
- setOperationAction(ISD::BlockAddress, getPointerTy(), Custom);
+ setOperationAction(ISD::GlobalAddress, getPointerTy(DL), Custom);
+ setOperationAction(ISD::GlobalTLSAddress, getPointerTy(DL), Custom);
+ setOperationAction(ISD::ConstantPool, getPointerTy(DL), Custom);
+ setOperationAction(ISD::BlockAddress, getPointerTy(DL), Custom);
// Sparc doesn't have sext_inreg, replace them with shl/sra
setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i16, Expand);
@@ -1704,7 +1706,8 @@ const char *SparcTargetLowering::getTargetNodeName(unsigned Opcode) const {
return nullptr;
}
-EVT SparcTargetLowering::getSetCCResultType(LLVMContext &, EVT VT) const {
+EVT SparcTargetLowering::getSetCCResultType(const DataLayout &, LLVMContext &,
+ EVT VT) const {
if (!VT.isVector())
return MVT::i32;
return VT.changeVectorElementTypeToInteger();
@@ -1804,7 +1807,7 @@ SDValue SparcTargetLowering::makeHiLoPair(SDValue Op,
// or ExternalSymbol SDNode.
SDValue SparcTargetLowering::makeAddress(SDValue Op, SelectionDAG &DAG) const {
SDLoc DL(Op);
- EVT VT = getPointerTy();
+ EVT VT = getPointerTy(DAG.getDataLayout());
// Handle PIC mode first.
if (getTargetMachine().getRelocationModel() == Reloc::PIC_) {
@@ -1871,7 +1874,7 @@ SDValue SparcTargetLowering::LowerGlobalTLSAddress(SDValue Op,
GlobalAddressSDNode *GA = cast<GlobalAddressSDNode>(Op);
SDLoc DL(GA);
const GlobalValue *GV = GA->getGlobal();
- EVT PtrVT = getPointerTy();
+ EVT PtrVT = getPointerTy(DAG.getDataLayout());
TLSModel::Model model = getTargetMachine().getTLSModel(GV);
@@ -1983,7 +1986,7 @@ SparcTargetLowering::LowerF128_LibCallArg(SDValue Chain, ArgListTy &Args,
if (ArgTy->isFP128Ty()) {
// Create a stack object and pass the pointer to the library function.
int FI = MFI->CreateStackObject(16, 8, false);
- SDValue FIPtr = DAG.getFrameIndex(FI, getPointerTy());
+ SDValue FIPtr = DAG.getFrameIndex(FI, getPointerTy(DAG.getDataLayout()));
Chain = DAG.getStore(Chain,
DL,
Entry.Node,
@@ -2008,8 +2011,9 @@ SparcTargetLowering::LowerF128Op(SDValue Op, SelectionDAG &DAG,
ArgListTy Args;
MachineFrameInfo *MFI = DAG.getMachineFunction().getFrameInfo();
+ auto PtrVT = getPointerTy(DAG.getDataLayout());
- SDValue Callee = DAG.getExternalSymbol(LibFuncName, getPointerTy());
+ SDValue Callee = DAG.getExternalSymbol(LibFuncName, PtrVT);
Type *RetTy = Op.getValueType().getTypeForEVT(*DAG.getContext());
Type *RetTyABI = RetTy;
SDValue Chain = DAG.getEntryNode();
@@ -2019,7 +2023,7 @@ SparcTargetLowering::LowerF128Op(SDValue Op, SelectionDAG &DAG,
// Create a Stack Object to receive the return value of type f128.
ArgListEntry Entry;
int RetFI = MFI->CreateStackObject(16, 8, false);
- RetPtr = DAG.getFrameIndex(RetFI, getPointerTy());
+ RetPtr = DAG.getFrameIndex(RetFI, PtrVT);
Entry.Node = RetPtr;
Entry.Ty = PointerType::getUnqual(RetTy);
if (!Subtarget->is64Bit())
@@ -2082,7 +2086,8 @@ SparcTargetLowering::LowerF128Compare(SDValue LHS, SDValue RHS,
case SPCC::FCC_UE : LibCall = is64Bit? "_Qp_cmp" : "_Q_cmp"; break;
}
- SDValue Callee = DAG.getExternalSymbol(LibCall, getPointerTy());
+ auto PtrVT = getPointerTy(DAG.getDataLayout());
+ SDValue Callee = DAG.getExternalSymbol(LibCall, PtrVT);
Type *RetTy = Type::getInt32Ty(*DAG.getContext());
ArgListTy Args;
SDValue Chain = DAG.getEntryNode();
@@ -2362,6 +2367,7 @@ static SDValue LowerVASTART(SDValue Op, SelectionDAG &DAG,
const SparcTargetLowering &TLI) {
MachineFunction &MF = DAG.getMachineFunction();
SparcMachineFunctionInfo *FuncInfo = MF.getInfo<SparcMachineFunctionInfo>();
+ auto PtrVT = TLI.getPointerTy(DAG.getDataLayout());
// Need frame address to find the address of VarArgsFrameIndex.
MF.getFrameInfo()->setFrameAddressIsTaken(true);
@@ -2370,9 +2376,8 @@ static SDValue LowerVASTART(SDValue Op, SelectionDAG &DAG,
// memory location argument.
SDLoc DL(Op);
SDValue Offset =
- DAG.getNode(ISD::ADD, DL, TLI.getPointerTy(),
- DAG.getRegister(SP::I6, TLI.getPointerTy()),
- DAG.getIntPtrConstant(FuncInfo->getVarArgsFrameOffset(), DL));
+ DAG.getNode(ISD::ADD, DL, PtrVT, DAG.getRegister(SP::I6, PtrVT),
+ DAG.getIntPtrConstant(FuncInfo->getVarArgsFrameOffset(), DL));
const Value *SV = cast<SrcValueSDNode>(Op.getOperand(2))->getValue();
return DAG.getStore(Op.getOperand(0), DL, Offset, Op.getOperand(1),
MachinePointerInfo(SV), false, false, 0);
@@ -2497,8 +2502,8 @@ static SDValue LowerRETURNADDR(SDValue Op, SelectionDAG &DAG,
SDValue RetAddr;
if (depth == 0) {
- unsigned RetReg = MF.addLiveIn(SP::I7,
- TLI.getRegClassFor(TLI.getPointerTy()));
+ auto PtrVT = TLI.getPointerTy(DAG.getDataLayout());
+ unsigned RetReg = MF.addLiveIn(SP::I7, TLI.getRegClassFor(PtrVT));
RetAddr = DAG.getCopyFromReg(DAG.getEntryNode(), dl, RetReg, VT);
return RetAddr;
}
@@ -3065,7 +3070,7 @@ SparcTargetLowering::expandAtomicRMW(MachineInstr *MI,
/// getConstraintType - Given a constraint letter, return the type of
/// constraint it is for this target.
SparcTargetLowering::ConstraintType
-SparcTargetLowering::getConstraintType(const std::string &Constraint) const {
+SparcTargetLowering::getConstraintType(StringRef Constraint) const {
if (Constraint.size() == 1) {
switch (Constraint[0]) {
default: break;
@@ -3139,7 +3144,7 @@ LowerAsmOperandForConstraint(SDValue Op,
std::pair<unsigned, const TargetRegisterClass *>
SparcTargetLowering::getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI,
- const std::string &Constraint,
+ StringRef Constraint,
MVT VT) const {
if (Constraint.size() == 1) {
switch (Constraint[0]) {
diff --git a/lib/Target/Sparc/SparcISelLowering.h b/lib/Target/Sparc/SparcISelLowering.h
index b6bc3d255713..bbc91a493c9d 100644
--- a/lib/Target/Sparc/SparcISelLowering.h
+++ b/lib/Target/Sparc/SparcISelLowering.h
@@ -72,7 +72,7 @@ namespace llvm {
const char *getTargetNodeName(unsigned Opcode) const override;
- ConstraintType getConstraintType(const std::string &Constraint) const override;
+ ConstraintType getConstraintType(StringRef Constraint) const override;
ConstraintWeight
getSingleConstraintMatchWeight(AsmOperandInfo &info,
const char *constraint) const override;
@@ -82,14 +82,16 @@ namespace llvm {
SelectionDAG &DAG) const override;
std::pair<unsigned, const TargetRegisterClass *>
getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI,
- const std::string &Constraint,
- MVT VT) const override;
+ StringRef Constraint, MVT VT) const override;
bool isOffsetFoldingLegal(const GlobalAddressSDNode *GA) const override;
- MVT getScalarShiftAmountTy(EVT LHSTy) const override { return MVT::i32; }
+ MVT getScalarShiftAmountTy(const DataLayout &, EVT) const override {
+ return MVT::i32;
+ }
/// getSetCCResultType - Return the ISD::SETCC ValueType
- EVT getSetCCResultType(LLVMContext &Context, EVT VT) const override;
+ EVT getSetCCResultType(const DataLayout &DL, LLVMContext &Context,
+ EVT VT) const override;
SDValue
LowerFormalArguments(SDValue Chain,
diff --git a/lib/Target/Sparc/SparcInstrAliases.td b/lib/Target/Sparc/SparcInstrAliases.td
index 670e9e989c81..25cc652dbd9e 100644
--- a/lib/Target/Sparc/SparcInstrAliases.td
+++ b/lib/Target/Sparc/SparcInstrAliases.td
@@ -245,6 +245,7 @@ multiclass fp_cond_alias<string cond, int condVal> {
}
defm : int_cond_alias<"a", 0b1000>;
+defm : int_cond_alias<"", 0b1000>; // same as a; gnu asm, not in manual
defm : int_cond_alias<"n", 0b0000>;
defm : int_cond_alias<"ne", 0b1001>;
defm : int_cond_alias<"nz", 0b1001>; // same as ne
@@ -266,6 +267,7 @@ defm : int_cond_alias<"vc", 0b1111>;
defm : int_cond_alias<"vs", 0b0111>;
defm : fp_cond_alias<"a", 0b0000>;
+defm : fp_cond_alias<"", 0b0000>; // same as a; gnu asm, not in manual
defm : fp_cond_alias<"n", 0b1000>;
defm : fp_cond_alias<"u", 0b0111>;
defm : fp_cond_alias<"g", 0b0110>;
@@ -284,7 +286,16 @@ defm : fp_cond_alias<"le", 0b1101>;
defm : fp_cond_alias<"ule", 0b1110>;
defm : fp_cond_alias<"o", 0b1111>;
-// Instruction aliases for JMPL.
+// Section A.3 Synthetic Instructions
+
+// Most are marked as Emit=0, so that they are not used for disassembly. This is
+// an aesthetic choice; the policy is typically to prefer the non-alias form,
+// except for the most obvious and clarifying aliases: cmp, jmp, call, tst,
+// ret, retl.
+
+// Note: cmp is handled in SparcInstrInfo.
+// jmp/call/ret/retl have special case handling for output in
+// SparcInstPrinter.cpp
// jmp addr -> jmpl addr, %g0
def : InstAlias<"jmp $addr", (JMPLrr G0, MEMrr:$addr), 0>;
@@ -294,25 +305,129 @@ def : InstAlias<"jmp $addr", (JMPLri G0, MEMri:$addr), 0>;
def : InstAlias<"call $addr", (JMPLrr O7, MEMrr:$addr), 0>;
def : InstAlias<"call $addr", (JMPLri O7, MEMri:$addr), 0>;
-// retl -> RETL 8
-def : InstAlias<"retl", (RETL 8)>;
+// tst reg -> orcc %g0, reg, %g0
+def : InstAlias<"tst $rs2", (ORCCrr G0, IntRegs:$rs2, G0)>;
-// ret -> RET 8
+// ret -> jmpl %i7+8, %g0 (aka RET 8)
def : InstAlias<"ret", (RET 8)>;
-// mov reg, rd -> or %g0, reg, rd
-def : InstAlias<"mov $rs2, $rd", (ORrr IntRegs:$rd, G0, IntRegs:$rs2)>;
+// retl -> jmpl %o7+8, %g0 (aka RETL 8)
+def : InstAlias<"retl", (RETL 8)>;
-// mov simm13, rd -> or %g0, simm13, rd
-def : InstAlias<"mov $simm13, $rd", (ORri IntRegs:$rd, G0, i32imm:$simm13)>;
+// restore -> restore %g0, %g0, %g0
+def : InstAlias<"restore", (RESTORErr G0, G0, G0)>;
+
+// save -> save %g0, %g0, %g0
+def : InstAlias<"save", (SAVErr G0, G0, G0)>;
// set value, rd
// (turns into a sequence of sethi+or, depending on the value)
// def : InstAlias<"set $val, $rd", (ORri IntRegs:$rd, (SETHIi (HI22 imm:$val)), (LO10 imm:$val))>;
def SET : AsmPseudoInst<(outs IntRegs:$rd), (ins i32imm:$val), "set $val, $rd">;
-// restore -> restore %g0, %g0, %g0
-def : InstAlias<"restore", (RESTORErr G0, G0, G0)>;
+// not rd -> xnor rd, %g0, rd
+def : InstAlias<"not $rd", (XNORrr IntRegs:$rd, IntRegs:$rd, G0), 0>;
+
+// not reg, rd -> xnor reg, %g0, rd
+def : InstAlias<"not $rs1, $rd", (XNORrr IntRegs:$rd, IntRegs:$rs1, G0), 0>;
+
+// neg rd -> sub %g0, rd, rd
+def : InstAlias<"neg $rd", (SUBrr IntRegs:$rd, G0, IntRegs:$rd), 0>;
+
+// neg reg, rd -> sub %g0, reg, rd
+def : InstAlias<"neg $rs2, $rd", (SUBrr IntRegs:$rd, G0, IntRegs:$rs2), 0>;
+
+// inc rd -> add rd, 1, rd
+def : InstAlias<"inc $rd", (ADDri IntRegs:$rd, IntRegs:$rd, 1), 0>;
+
+// inc simm13, rd -> add rd, simm13, rd
+def : InstAlias<"inc $simm13, $rd", (ADDri IntRegs:$rd, IntRegs:$rd, i32imm:$simm13), 0>;
+
+// inccc rd -> addcc rd, 1, rd
+def : InstAlias<"inccc $rd", (ADDCCri IntRegs:$rd, IntRegs:$rd, 1), 0>;
+
+// inccc simm13, rd -> addcc rd, simm13, rd
+def : InstAlias<"inccc $simm13, $rd", (ADDCCri IntRegs:$rd, IntRegs:$rd, i32imm:$simm13), 0>;
+
+// dec rd -> sub rd, 1, rd
+def : InstAlias<"dec $rd", (SUBri IntRegs:$rd, IntRegs:$rd, 1), 0>;
+
+// dec simm13, rd -> sub rd, simm13, rd
+def : InstAlias<"dec $simm13, $rd", (SUBri IntRegs:$rd, IntRegs:$rd, i32imm:$simm13), 0>;
+
+// deccc rd -> subcc rd, 1, rd
+def : InstAlias<"deccc $rd", (SUBCCri IntRegs:$rd, IntRegs:$rd, 1), 0>;
+
+// deccc simm13, rd -> subcc rd, simm13, rd
+def : InstAlias<"deccc $simm13, $rd", (SUBCCri IntRegs:$rd, IntRegs:$rd, i32imm:$simm13), 0>;
+
+// btst reg_or_imm, reg -> andcc reg,reg_or_imm,%g0
+def : InstAlias<"btst $rs2, $rs1", (ANDCCrr G0, IntRegs:$rs1, IntRegs:$rs2), 0>;
+def : InstAlias<"btst $simm13, $rs1", (ANDCCri G0, IntRegs:$rs1, i32imm:$simm13), 0>;
+
+// bset reg_or_imm, rd -> or rd,reg_or_imm,rd
+def : InstAlias<"bset $rs2, $rd", (ORrr IntRegs:$rd, IntRegs:$rd, IntRegs:$rs2), 0>;
+def : InstAlias<"bset $simm13, $rd", (ORri IntRegs:$rd, IntRegs:$rd, i32imm:$simm13), 0>;
+
+// bclr reg_or_imm, rd -> andn rd,reg_or_imm,rd
+def : InstAlias<"bclr $rs2, $rd", (ANDNrr IntRegs:$rd, IntRegs:$rd, IntRegs:$rs2), 0>;
+def : InstAlias<"bclr $simm13, $rd", (ANDNri IntRegs:$rd, IntRegs:$rd, i32imm:$simm13), 0>;
+
+// btog reg_or_imm, rd -> xor rd,reg_or_imm,rd
+def : InstAlias<"btog $rs2, $rd", (XORrr IntRegs:$rd, IntRegs:$rd, IntRegs:$rs2), 0>;
+def : InstAlias<"btog $simm13, $rd", (XORri IntRegs:$rd, IntRegs:$rd, i32imm:$simm13), 0>;
+
+
+// clr rd -> or %g0, %g0, rd
+def : InstAlias<"clr $rd", (ORrr IntRegs:$rd, G0, G0), 0>;
+
+// clr{b,h,} [addr] -> st{b,h,} %g0, [addr]
+def : InstAlias<"clrb [$addr]", (STBrr MEMrr:$addr, G0), 0>;
+def : InstAlias<"clrb [$addr]", (STBri MEMri:$addr, G0), 0>;
+def : InstAlias<"clrh [$addr]", (STHrr MEMrr:$addr, G0), 0>;
+def : InstAlias<"clrh [$addr]", (STHri MEMri:$addr, G0), 0>;
+def : InstAlias<"clr [$addr]", (STrr MEMrr:$addr, G0), 0>;
+def : InstAlias<"clr [$addr]", (STri MEMri:$addr, G0), 0>;
+
+
+// mov reg_or_imm, rd -> or %g0, reg_or_imm, rd
+def : InstAlias<"mov $rs2, $rd", (ORrr IntRegs:$rd, G0, IntRegs:$rs2)>;
+def : InstAlias<"mov $simm13, $rd", (ORri IntRegs:$rd, G0, i32imm:$simm13)>;
+
+// mov specialreg, rd -> rd specialreg, rd
+def : InstAlias<"mov $asr, $rd", (RDASR IntRegs:$rd, ASRRegs:$asr), 0>;
+def : InstAlias<"mov %psr, $rd", (RDPSR IntRegs:$rd), 0>;
+def : InstAlias<"mov %wim, $rd", (RDWIM IntRegs:$rd), 0>;
+def : InstAlias<"mov %tbr, $rd", (RDTBR IntRegs:$rd), 0>;
+
+// mov reg_or_imm, specialreg -> wr %g0, reg_or_imm, specialreg
+def : InstAlias<"mov $rs2, $asr", (WRASRrr ASRRegs:$asr, G0, IntRegs:$rs2), 0>;
+def : InstAlias<"mov $simm13, $asr", (WRASRri ASRRegs:$asr, G0, i32imm:$simm13), 0>;
+def : InstAlias<"mov $rs2, %psr", (WRPSRrr G0, IntRegs:$rs2), 0>;
+def : InstAlias<"mov $simm13, %psr", (WRPSRri G0, i32imm:$simm13), 0>;
+def : InstAlias<"mov $rs2, %wim", (WRWIMrr G0, IntRegs:$rs2), 0>;
+def : InstAlias<"mov $simm13, %wim", (WRWIMri G0, i32imm:$simm13), 0>;
+def : InstAlias<"mov $rs2, %tbr", (WRTBRrr G0, IntRegs:$rs2), 0>;
+def : InstAlias<"mov $simm13, %tbr", (WRTBRri G0, i32imm:$simm13), 0>;
+
+// End of Section A.3
+
+// wr reg_or_imm, specialreg -> wr %g0, reg_or_imm, specialreg
+// (i.e., omit the first arg when it's %g0. This is not in the manual, but is
+// supported by GNU and Solaris as.)
+def : InstAlias<"wr $rs2, $asr", (WRASRrr ASRRegs:$asr, G0, IntRegs:$rs2), 0>;
+def : InstAlias<"wr $simm13, $asr", (WRASRri ASRRegs:$asr, G0, i32imm:$simm13), 0>;
+def : InstAlias<"wr $rs2, %psr", (WRPSRrr G0, IntRegs:$rs2), 0>;
+def : InstAlias<"wr $simm13, %psr", (WRPSRri G0, i32imm:$simm13), 0>;
+def : InstAlias<"wr $rs2, %wim", (WRWIMrr G0, IntRegs:$rs2), 0>;
+def : InstAlias<"wr $simm13, %wim", (WRWIMri G0, i32imm:$simm13), 0>;
+def : InstAlias<"wr $rs2, %tbr", (WRTBRrr G0, IntRegs:$rs2), 0>;
+def : InstAlias<"wr $simm13, %tbr", (WRTBRri G0, i32imm:$simm13), 0>;
+
+
+// flush -> flush %g0
+def : InstAlias<"flush", (FLUSH), 0>;
+
def : MnemonicAlias<"return", "rett">, Requires<[HasV9]>;
diff --git a/lib/Target/Sparc/SparcInstrInfo.cpp b/lib/Target/Sparc/SparcInstrInfo.cpp
index f87cee43e319..6167c532db80 100644
--- a/lib/Target/Sparc/SparcInstrInfo.cpp
+++ b/lib/Target/Sparc/SparcInstrInfo.cpp
@@ -324,6 +324,15 @@ void SparcInstrInfo::copyPhysReg(MachineBasicBlock &MBB,
numSubRegs = 4;
movOpc = SP::FMOVS;
}
+ } else if (SP::ASRRegsRegClass.contains(DestReg) &&
+ SP::IntRegsRegClass.contains(SrcReg)) {
+ BuildMI(MBB, I, DL, get(SP::WRASRrr), DestReg)
+ .addReg(SP::G0)
+ .addReg(SrcReg, getKillRegState(KillSrc));
+ } else if (SP::IntRegsRegClass.contains(DestReg) &&
+ SP::ASRRegsRegClass.contains(SrcReg)) {
+ BuildMI(MBB, I, DL, get(SP::RDASR), DestReg)
+ .addReg(SrcReg, getKillRegState(KillSrc));
} else
llvm_unreachable("Impossible reg-to-reg copy");
diff --git a/lib/Target/Sparc/SparcInstrInfo.td b/lib/Target/Sparc/SparcInstrInfo.td
index a02bae07a336..3b9e048ea8b3 100644
--- a/lib/Target/Sparc/SparcInstrInfo.td
+++ b/lib/Target/Sparc/SparcInstrInfo.td
@@ -536,6 +536,7 @@ let Defs = [ICC] in
let Uses = [ICC] in
defm SUBC : F3_12np <"subx", 0b001100>;
+// cmp (from Section A.3) is a specialized alias for subcc
let Defs = [ICC], rd = 0 in {
def CMPrr : F3_1<2, 0b010100,
(outs), (ins IntRegs:$rs1, IntRegs:$rs2),
@@ -559,12 +560,12 @@ let Defs = [Y, ICC] in {
}
// Section B.19 - Divide Instructions, p. 115
-let Defs = [Y] in {
+let Uses = [Y], Defs = [Y] in {
defm UDIV : F3_12np<"udiv", 0b001110>;
defm SDIV : F3_12np<"sdiv", 0b001111>;
}
-let Defs = [Y, ICC] in {
+let Uses = [Y], Defs = [Y, ICC] in {
defm UDIVCC : F3_12np<"udivcc", 0b011110>;
defm SDIVCC : F3_12np<"sdivcc", 0b011111>;
}
@@ -828,6 +829,20 @@ let rd = 0 in
def UNIMP : F2_1<0b000, (outs), (ins i32imm:$imm22),
"unimp $imm22", []>;
+// Section B.32 - Flush Instruction Memory
+let rd = 0 in {
+ def FLUSHrr : F3_1<2, 0b111011, (outs), (ins MEMrr:$addr),
+ "flush $addr", []>;
+ def FLUSHri : F3_2<2, 0b111011, (outs), (ins MEMri:$addr),
+ "flush $addr", []>;
+
+ // The no-arg FLUSH is only here for the benefit of the InstAlias
+ // "flush", which apparently cannot use FLUSHrr because a MEMrr
+ // operand with fixed G0 registers cannot be constructed.
+ let rs1 = 0, rs2 = 0 in
+ def FLUSH : F3_1<2, 0b111011, (outs), (ins), "flush %g0", []>;
+}
+
// Section B.33 - Floating-point Operate (FPop) Instructions
// Convert Integer to Floating-point Instructions, p. 141
diff --git a/lib/Target/Sparc/SparcRegisterInfo.td b/lib/Target/Sparc/SparcRegisterInfo.td
index e504da4d3b21..db8a7e86962d 100644
--- a/lib/Target/Sparc/SparcRegisterInfo.td
+++ b/lib/Target/Sparc/SparcRegisterInfo.td
@@ -249,4 +249,6 @@ def FCCRegs : RegisterClass<"SP", [i1], 1, (sequence "FCC%u", 0, 3)>;
// Ancillary state registers
def ASRRegs : RegisterClass<"SP", [i32], 32,
- (add Y, (sequence "ASR%u", 1, 31))>;
+ (add Y, (sequence "ASR%u", 1, 31))> {
+ let isAllocatable = 0;
+}
diff --git a/lib/Target/Sparc/SparcSelectionDAGInfo.cpp b/lib/Target/Sparc/SparcSelectionDAGInfo.cpp
deleted file mode 100644
index a308fc5e739e..000000000000
--- a/lib/Target/Sparc/SparcSelectionDAGInfo.cpp
+++ /dev/null
@@ -1,24 +0,0 @@
-//===-- SparcSelectionDAGInfo.cpp - Sparc SelectionDAG Info ---------------===//
-//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
-//
-//===----------------------------------------------------------------------===//
-//
-// This file implements the SparcSelectionDAGInfo class.
-//
-//===----------------------------------------------------------------------===//
-
-#include "SparcSelectionDAGInfo.h"
-using namespace llvm;
-
-#define DEBUG_TYPE "sparc-selectiondag-info"
-
-SparcSelectionDAGInfo::SparcSelectionDAGInfo(const DataLayout &DL)
- : TargetSelectionDAGInfo(&DL) {
-}
-
-SparcSelectionDAGInfo::~SparcSelectionDAGInfo() {
-}
diff --git a/lib/Target/Sparc/SparcSelectionDAGInfo.h b/lib/Target/Sparc/SparcSelectionDAGInfo.h
deleted file mode 100644
index 6818291b30b4..000000000000
--- a/lib/Target/Sparc/SparcSelectionDAGInfo.h
+++ /dev/null
@@ -1,31 +0,0 @@
-//===-- SparcSelectionDAGInfo.h - Sparc SelectionDAG Info -------*- C++ -*-===//
-//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
-//
-//===----------------------------------------------------------------------===//
-//
-// This file defines the Sparc subclass for TargetSelectionDAGInfo.
-//
-//===----------------------------------------------------------------------===//
-
-#ifndef LLVM_LIB_TARGET_SPARC_SPARCSELECTIONDAGINFO_H
-#define LLVM_LIB_TARGET_SPARC_SPARCSELECTIONDAGINFO_H
-
-#include "llvm/Target/TargetSelectionDAGInfo.h"
-
-namespace llvm {
-
-class SparcTargetMachine;
-
-class SparcSelectionDAGInfo : public TargetSelectionDAGInfo {
-public:
- explicit SparcSelectionDAGInfo(const DataLayout &DL);
- ~SparcSelectionDAGInfo() override;
-};
-
-}
-
-#endif
diff --git a/lib/Target/Sparc/SparcSubtarget.cpp b/lib/Target/Sparc/SparcSubtarget.cpp
index 479b25d2723f..d69da409e428 100644
--- a/lib/Target/Sparc/SparcSubtarget.cpp
+++ b/lib/Target/Sparc/SparcSubtarget.cpp
@@ -54,7 +54,7 @@ SparcSubtarget::SparcSubtarget(const Triple &TT, const std::string &CPU,
bool is64Bit)
: SparcGenSubtargetInfo(TT, CPU, FS), Is64Bit(is64Bit),
InstrInfo(initializeSubtargetDependencies(CPU, FS)), TLInfo(TM, *this),
- TSInfo(*TM.getDataLayout()), FrameLowering(*this) {}
+ FrameLowering(*this) {}
int SparcSubtarget::getAdjustedFrameSize(int frameSize) const {
diff --git a/lib/Target/Sparc/SparcSubtarget.h b/lib/Target/Sparc/SparcSubtarget.h
index 983b1193975d..9d21911d88f0 100644
--- a/lib/Target/Sparc/SparcSubtarget.h
+++ b/lib/Target/Sparc/SparcSubtarget.h
@@ -17,9 +17,9 @@
#include "SparcFrameLowering.h"
#include "SparcInstrInfo.h"
#include "SparcISelLowering.h"
-#include "SparcSelectionDAGInfo.h"
#include "llvm/IR/DataLayout.h"
#include "llvm/Target/TargetFrameLowering.h"
+#include "llvm/Target/TargetSelectionDAGInfo.h"
#include "llvm/Target/TargetSubtargetInfo.h"
#include <string>
@@ -39,7 +39,7 @@ class SparcSubtarget : public SparcGenSubtargetInfo {
bool UsePopc;
SparcInstrInfo InstrInfo;
SparcTargetLowering TLInfo;
- SparcSelectionDAGInfo TSInfo;
+ TargetSelectionDAGInfo TSInfo;
SparcFrameLowering FrameLowering;
public:
@@ -56,7 +56,7 @@ public:
const SparcTargetLowering *getTargetLowering() const override {
return &TLInfo;
}
- const SparcSelectionDAGInfo *getSelectionDAGInfo() const override {
+ const TargetSelectionDAGInfo *getSelectionDAGInfo() const override {
return &TSInfo;
}
diff --git a/lib/Target/SystemZ/MCTargetDesc/SystemZMCTargetDesc.cpp b/lib/Target/SystemZ/MCTargetDesc/SystemZMCTargetDesc.cpp
index 81882106fc46..5fefa315a4cf 100644
--- a/lib/Target/SystemZ/MCTargetDesc/SystemZMCTargetDesc.cpp
+++ b/lib/Target/SystemZ/MCTargetDesc/SystemZMCTargetDesc.cpp
@@ -148,7 +148,7 @@ static MCInstrInfo *createSystemZMCInstrInfo() {
return X;
}
-static MCRegisterInfo *createSystemZMCRegisterInfo(StringRef TT) {
+static MCRegisterInfo *createSystemZMCRegisterInfo(const Triple &TT) {
MCRegisterInfo *X = new MCRegisterInfo();
InitSystemZMCRegisterInfo(X, SystemZ::R14D);
return X;
@@ -156,12 +156,11 @@ static MCRegisterInfo *createSystemZMCRegisterInfo(StringRef TT) {
static MCSubtargetInfo *
createSystemZMCSubtargetInfo(const Triple &TT, StringRef CPU, StringRef FS) {
- MCSubtargetInfo *X = new MCSubtargetInfo();
- InitSystemZMCSubtargetInfo(X, TT, CPU, FS);
- return X;
+ return createSystemZMCSubtargetInfoImpl(TT, CPU, FS);
}
-static MCCodeGenInfo *createSystemZMCCodeGenInfo(StringRef TT, Reloc::Model RM,
+static MCCodeGenInfo *createSystemZMCCodeGenInfo(const Triple &TT,
+ Reloc::Model RM,
CodeModel::Model CM,
CodeGenOpt::Level OL) {
MCCodeGenInfo *X = new MCCodeGenInfo();
diff --git a/lib/Target/SystemZ/SystemZFrameLowering.cpp b/lib/Target/SystemZ/SystemZFrameLowering.cpp
index a636b35635ce..397de472a6ee 100644
--- a/lib/Target/SystemZ/SystemZFrameLowering.cpp
+++ b/lib/Target/SystemZ/SystemZFrameLowering.cpp
@@ -61,11 +61,12 @@ SystemZFrameLowering::getCalleeSavedSpillSlots(unsigned &NumEntries) const {
return SpillOffsetTable;
}
-void SystemZFrameLowering::
-processFunctionBeforeCalleeSavedScan(MachineFunction &MF,
- RegScavenger *RS) const {
+void SystemZFrameLowering::determineCalleeSaves(MachineFunction &MF,
+ BitVector &SavedRegs,
+ RegScavenger *RS) const {
+ TargetFrameLowering::determineCalleeSaves(MF, SavedRegs, RS);
+
MachineFrameInfo *MFFrame = MF.getFrameInfo();
- MachineRegisterInfo &MRI = MF.getRegInfo();
const TargetRegisterInfo *TRI = MF.getSubtarget().getRegisterInfo();
bool HasFP = hasFP(MF);
SystemZMachineFunctionInfo *MFI = MF.getInfo<SystemZMachineFunctionInfo>();
@@ -77,17 +78,17 @@ processFunctionBeforeCalleeSavedScan(MachineFunction &MF,
// argument register R6D.
if (IsVarArg)
for (unsigned I = MFI->getVarArgsFirstGPR(); I < SystemZ::NumArgGPRs; ++I)
- MRI.setPhysRegUsed(SystemZ::ArgGPRs[I]);
+ SavedRegs.set(SystemZ::ArgGPRs[I]);
// If the function requires a frame pointer, record that the hard
// frame pointer will be clobbered.
if (HasFP)
- MRI.setPhysRegUsed(SystemZ::R11D);
+ SavedRegs.set(SystemZ::R11D);
// If the function calls other functions, record that the return
// address register will be clobbered.
if (MFFrame->hasCalls())
- MRI.setPhysRegUsed(SystemZ::R14D);
+ SavedRegs.set(SystemZ::R14D);
// If we are saving GPRs other than the stack pointer, we might as well
// save and restore the stack pointer at the same time, via STMG and LMG.
@@ -96,8 +97,8 @@ processFunctionBeforeCalleeSavedScan(MachineFunction &MF,
const MCPhysReg *CSRegs = TRI->getCalleeSavedRegs(&MF);
for (unsigned I = 0; CSRegs[I]; ++I) {
unsigned Reg = CSRegs[I];
- if (SystemZ::GR64BitRegClass.contains(Reg) && MRI.isPhysRegUsed(Reg)) {
- MRI.setPhysRegUsed(SystemZ::R15D);
+ if (SystemZ::GR64BitRegClass.contains(Reg) && SavedRegs.test(Reg)) {
+ SavedRegs.set(SystemZ::R15D);
break;
}
}
diff --git a/lib/Target/SystemZ/SystemZFrameLowering.h b/lib/Target/SystemZ/SystemZFrameLowering.h
index 60bad894ee44..5ade757f17f7 100644
--- a/lib/Target/SystemZ/SystemZFrameLowering.h
+++ b/lib/Target/SystemZ/SystemZFrameLowering.h
@@ -27,8 +27,8 @@ public:
bool isFPCloseToIncomingSP() const override { return false; }
const SpillSlot *getCalleeSavedSpillSlots(unsigned &NumEntries) const
override;
- void processFunctionBeforeCalleeSavedScan(MachineFunction &MF,
- RegScavenger *RS) const override;
+ void determineCalleeSaves(MachineFunction &MF, BitVector &SavedRegs,
+ RegScavenger *RS) const override;
bool spillCalleeSavedRegisters(MachineBasicBlock &MBB,
MachineBasicBlock::iterator MBBI,
const std::vector<CalleeSavedInfo> &CSI,
diff --git a/lib/Target/SystemZ/SystemZISelLowering.cpp b/lib/Target/SystemZ/SystemZISelLowering.cpp
index 372f6fb3ea50..056ee02dcc21 100644
--- a/lib/Target/SystemZ/SystemZISelLowering.cpp
+++ b/lib/Target/SystemZ/SystemZISelLowering.cpp
@@ -81,10 +81,11 @@ static MachineOperand earlyUseOperand(MachineOperand Op) {
return Op;
}
-SystemZTargetLowering::SystemZTargetLowering(const TargetMachine &tm,
+SystemZTargetLowering::SystemZTargetLowering(const TargetMachine &TM,
const SystemZSubtarget &STI)
- : TargetLowering(tm), Subtarget(STI) {
- MVT PtrVT = getPointerTy();
+ : TargetLowering(TM), Subtarget(STI) {
+ auto &DL = *TM.getDataLayout();
+ MVT PtrVT = getPointerTy(DL);
// Set up the register classes.
if (Subtarget.hasHighWord())
@@ -455,7 +456,8 @@ SystemZTargetLowering::SystemZTargetLowering(const TargetMachine &tm,
MaxStoresPerMemsetOptSize = 0;
}
-EVT SystemZTargetLowering::getSetCCResultType(LLVMContext &, EVT VT) const {
+EVT SystemZTargetLowering::getSetCCResultType(const DataLayout &DL,
+ LLVMContext &, EVT VT) const {
if (!VT.isVector())
return MVT::i32;
return VT.changeVectorElementTypeToInteger();
@@ -507,8 +509,8 @@ bool SystemZTargetLowering::allowsMisalignedMemoryAccesses(EVT VT,
return true;
}
-bool SystemZTargetLowering::isLegalAddressingMode(const AddrMode &AM,
- Type *Ty,
+bool SystemZTargetLowering::isLegalAddressingMode(const DataLayout &DL,
+ const AddrMode &AM, Type *Ty,
unsigned AS) const {
// Punt on globals for now, although they can be used in limited
// RELATIVE LONG cases.
@@ -544,7 +546,7 @@ bool SystemZTargetLowering::isTruncateFree(EVT FromVT, EVT ToVT) const {
//===----------------------------------------------------------------------===//
TargetLowering::ConstraintType
-SystemZTargetLowering::getConstraintType(const std::string &Constraint) const {
+SystemZTargetLowering::getConstraintType(StringRef Constraint) const {
if (Constraint.size() == 1) {
switch (Constraint[0]) {
case 'a': // Address register
@@ -641,13 +643,14 @@ getSingleConstraintMatchWeight(AsmOperandInfo &info,
// has already been verified. MC is the class associated with "t" and
// Map maps 0-based register numbers to LLVM register numbers.
static std::pair<unsigned, const TargetRegisterClass *>
-parseRegisterNumber(const std::string &Constraint,
- const TargetRegisterClass *RC, const unsigned *Map) {
+parseRegisterNumber(StringRef Constraint, const TargetRegisterClass *RC,
+ const unsigned *Map) {
assert(*(Constraint.end()-1) == '}' && "Missing '}'");
if (isdigit(Constraint[2])) {
- std::string Suffix(Constraint.data() + 2, Constraint.size() - 2);
- unsigned Index = atoi(Suffix.c_str());
- if (Index < 16 && Map[Index])
+ unsigned Index;
+ bool Failed =
+ Constraint.slice(2, Constraint.size() - 1).getAsInteger(10, Index);
+ if (!Failed && Index < 16 && Map[Index])
return std::make_pair(Map[Index], RC);
}
return std::make_pair(0U, nullptr);
@@ -655,8 +658,7 @@ parseRegisterNumber(const std::string &Constraint,
std::pair<unsigned, const TargetRegisterClass *>
SystemZTargetLowering::getRegForInlineAsmConstraint(
- const TargetRegisterInfo *TRI, const std::string &Constraint,
- MVT VT) const {
+ const TargetRegisterInfo *TRI, StringRef Constraint, MVT VT) const {
if (Constraint.size() == 1) {
// GCC Constraint Letters
switch (Constraint[0]) {
@@ -687,7 +689,7 @@ SystemZTargetLowering::getRegForInlineAsmConstraint(
return std::make_pair(0U, &SystemZ::FP32BitRegClass);
}
}
- if (Constraint[0] == '{') {
+ if (Constraint.size() > 0 && Constraint[0] == '{') {
// We need to override the default register parsing for GPRs and FPRs
// because the interpretation depends on VT. The internal names of
// the registers are also different from the external names
@@ -931,7 +933,7 @@ LowerFormalArguments(SDValue Chain, CallingConv::ID CallConv, bool IsVarArg,
// Create the SelectionDAG nodes corresponding to a load
// from this parameter. Unpromoted ints and floats are
// passed as right-justified 8-byte values.
- EVT PtrVT = getPointerTy();
+ EVT PtrVT = getPointerTy(DAG.getDataLayout());
SDValue FIN = DAG.getFrameIndex(FI, PtrVT);
if (VA.getLocVT() == MVT::i32 || VA.getLocVT() == MVT::f32)
FIN = DAG.getNode(ISD::ADD, DL, PtrVT, FIN,
@@ -969,7 +971,7 @@ LowerFormalArguments(SDValue Chain, CallingConv::ID CallConv, bool IsVarArg,
for (unsigned I = NumFixedFPRs; I < SystemZ::NumArgFPRs; ++I) {
unsigned Offset = TFL->getRegSpillOffset(SystemZ::ArgFPRs[I]);
int FI = MFI->CreateFixedObject(8, RegSaveOffset + Offset, true);
- SDValue FIN = DAG.getFrameIndex(FI, getPointerTy());
+ SDValue FIN = DAG.getFrameIndex(FI, getPointerTy(DAG.getDataLayout()));
unsigned VReg = MF.addLiveIn(SystemZ::ArgFPRs[I],
&SystemZ::FP64BitRegClass);
SDValue ArgValue = DAG.getCopyFromReg(Chain, DL, VReg, MVT::f64);
@@ -1019,7 +1021,7 @@ SystemZTargetLowering::LowerCall(CallLoweringInfo &CLI,
CallingConv::ID CallConv = CLI.CallConv;
bool IsVarArg = CLI.IsVarArg;
MachineFunction &MF = DAG.getMachineFunction();
- EVT PtrVT = getPointerTy();
+ EVT PtrVT = getPointerTy(MF.getDataLayout());
// Detect unsupported vector argument and return types.
if (Subtarget.hasVector()) {
@@ -2401,7 +2403,7 @@ SDValue SystemZTargetLowering::lowerGlobalAddress(GlobalAddressSDNode *Node,
SDLoc DL(Node);
const GlobalValue *GV = Node->getGlobal();
int64_t Offset = Node->getOffset();
- EVT PtrVT = getPointerTy();
+ EVT PtrVT = getPointerTy(DAG.getDataLayout());
Reloc::Model RM = DAG.getTarget().getRelocationModel();
CodeModel::Model CM = DAG.getTarget().getCodeModel();
@@ -2440,7 +2442,7 @@ SDValue SystemZTargetLowering::lowerTLSGetOffset(GlobalAddressSDNode *Node,
unsigned Opcode,
SDValue GOTOffset) const {
SDLoc DL(Node);
- EVT PtrVT = getPointerTy();
+ EVT PtrVT = getPointerTy(DAG.getDataLayout());
SDValue Chain = DAG.getEntryNode();
SDValue Glue;
@@ -2486,7 +2488,7 @@ SDValue SystemZTargetLowering::lowerGlobalTLSAddress(GlobalAddressSDNode *Node,
SelectionDAG &DAG) const {
SDLoc DL(Node);
const GlobalValue *GV = Node->getGlobal();
- EVT PtrVT = getPointerTy();
+ EVT PtrVT = getPointerTy(DAG.getDataLayout());
TLSModel::Model model = DAG.getTarget().getTLSModel(GV);
// The high part of the thread pointer is in access register 0.
@@ -2587,7 +2589,7 @@ SDValue SystemZTargetLowering::lowerBlockAddress(BlockAddressSDNode *Node,
SDLoc DL(Node);
const BlockAddress *BA = Node->getBlockAddress();
int64_t Offset = Node->getOffset();
- EVT PtrVT = getPointerTy();
+ EVT PtrVT = getPointerTy(DAG.getDataLayout());
SDValue Result = DAG.getTargetBlockAddress(BA, PtrVT, Offset);
Result = DAG.getNode(SystemZISD::PCREL_WRAPPER, DL, PtrVT, Result);
@@ -2597,7 +2599,7 @@ SDValue SystemZTargetLowering::lowerBlockAddress(BlockAddressSDNode *Node,
SDValue SystemZTargetLowering::lowerJumpTable(JumpTableSDNode *JT,
SelectionDAG &DAG) const {
SDLoc DL(JT);
- EVT PtrVT = getPointerTy();
+ EVT PtrVT = getPointerTy(DAG.getDataLayout());
SDValue Result = DAG.getTargetJumpTable(JT->getIndex(), PtrVT);
// Use LARL to load the address of the table.
@@ -2607,7 +2609,7 @@ SDValue SystemZTargetLowering::lowerJumpTable(JumpTableSDNode *JT,
SDValue SystemZTargetLowering::lowerConstantPool(ConstantPoolSDNode *CP,
SelectionDAG &DAG) const {
SDLoc DL(CP);
- EVT PtrVT = getPointerTy();
+ EVT PtrVT = getPointerTy(DAG.getDataLayout());
SDValue Result;
if (CP->isMachineConstantPoolEntry())
@@ -2671,7 +2673,7 @@ SDValue SystemZTargetLowering::lowerVASTART(SDValue Op,
MachineFunction &MF = DAG.getMachineFunction();
SystemZMachineFunctionInfo *FuncInfo =
MF.getInfo<SystemZMachineFunctionInfo>();
- EVT PtrVT = getPointerTy();
+ EVT PtrVT = getPointerTy(DAG.getDataLayout());
SDValue Chain = Op.getOperand(0);
SDValue Addr = Op.getOperand(1);
diff --git a/lib/Target/SystemZ/SystemZISelLowering.h b/lib/Target/SystemZ/SystemZISelLowering.h
index 2f7617bbdac3..949b67f114ea 100644
--- a/lib/Target/SystemZ/SystemZISelLowering.h
+++ b/lib/Target/SystemZ/SystemZISelLowering.h
@@ -339,10 +339,10 @@ public:
const SystemZSubtarget &STI);
// Override TargetLowering.
- MVT getScalarShiftAmountTy(EVT LHSTy) const override {
+ MVT getScalarShiftAmountTy(const DataLayout &, EVT) const override {
return MVT::i32;
}
- MVT getVectorIdxTy() const override {
+ MVT getVectorIdxTy(const DataLayout &DL) const override {
// Only the lower 12 bits of an element index are used, so we don't
// want to clobber the upper 32 bits of a GPR unnecessarily.
return MVT::i32;
@@ -364,12 +364,13 @@ public:
return TypeWidenVector;
return TargetLoweringBase::getPreferredVectorAction(VT);
}
- EVT getSetCCResultType(LLVMContext &, EVT) const override;
+ EVT getSetCCResultType(const DataLayout &DL, LLVMContext &,
+ EVT) const override;
bool isFMAFasterThanFMulAndFAdd(EVT VT) const override;
bool isFPImmLegal(const APFloat &Imm, EVT VT) const override;
bool isLegalICmpImmediate(int64_t Imm) const override;
bool isLegalAddImmediate(int64_t Imm) const override;
- bool isLegalAddressingMode(const AddrMode &AM, Type *Ty,
+ bool isLegalAddressingMode(const DataLayout &DL, const AddrMode &AM, Type *Ty,
unsigned AS) const override;
bool allowsMisalignedMemoryAccesses(EVT VT, unsigned AS,
unsigned Align,
@@ -379,10 +380,9 @@ public:
const char *getTargetNodeName(unsigned Opcode) const override;
std::pair<unsigned, const TargetRegisterClass *>
getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI,
- const std::string &Constraint,
- MVT VT) const override;
+ StringRef Constraint, MVT VT) const override;
TargetLowering::ConstraintType
- getConstraintType(const std::string &Constraint) const override;
+ getConstraintType(StringRef Constraint) const override;
TargetLowering::ConstraintWeight
getSingleConstraintMatchWeight(AsmOperandInfo &info,
const char *constraint) const override;
@@ -391,8 +391,7 @@ public:
std::vector<SDValue> &Ops,
SelectionDAG &DAG) const override;
- unsigned getInlineAsmMemConstraint(
- const std::string &ConstraintCode) const override {
+ unsigned getInlineAsmMemConstraint(StringRef ConstraintCode) const override {
if (ConstraintCode.size() == 1) {
switch(ConstraintCode[0]) {
default:
diff --git a/lib/Target/SystemZ/SystemZRegisterInfo.cpp b/lib/Target/SystemZ/SystemZRegisterInfo.cpp
index 7cabea962e91..dc7bd25d7ed5 100644
--- a/lib/Target/SystemZ/SystemZRegisterInfo.cpp
+++ b/lib/Target/SystemZ/SystemZRegisterInfo.cpp
@@ -36,7 +36,7 @@ SystemZRegisterInfo::getCallPreservedMask(const MachineFunction &MF,
BitVector
SystemZRegisterInfo::getReservedRegs(const MachineFunction &MF) const {
BitVector Reserved(getNumRegs());
- const TargetFrameLowering *TFI = MF.getSubtarget().getFrameLowering();
+ const SystemZFrameLowering *TFI = getFrameLowering(MF);
if (TFI->hasFP(MF)) {
// R11D is the frame pointer. Reserve all aliases.
@@ -64,7 +64,7 @@ SystemZRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator MI,
MachineFunction &MF = *MBB.getParent();
auto *TII =
static_cast<const SystemZInstrInfo *>(MF.getSubtarget().getInstrInfo());
- const TargetFrameLowering *TFI = MF.getSubtarget().getFrameLowering();
+ const SystemZFrameLowering *TFI = getFrameLowering(MF);
DebugLoc DL = MI->getDebugLoc();
// Decompose the frame index into a base and offset.
@@ -135,6 +135,6 @@ SystemZRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator MI,
unsigned
SystemZRegisterInfo::getFrameRegister(const MachineFunction &MF) const {
- const TargetFrameLowering *TFI = MF.getSubtarget().getFrameLowering();
+ const SystemZFrameLowering *TFI = getFrameLowering(MF);
return TFI->hasFP(MF) ? SystemZ::R11D : SystemZ::R15D;
}
diff --git a/lib/Target/SystemZ/SystemZSelectionDAGInfo.cpp b/lib/Target/SystemZ/SystemZSelectionDAGInfo.cpp
index e7e0268dbb8a..178aa3817311 100644
--- a/lib/Target/SystemZ/SystemZSelectionDAGInfo.cpp
+++ b/lib/Target/SystemZ/SystemZSelectionDAGInfo.cpp
@@ -18,12 +18,6 @@ using namespace llvm;
#define DEBUG_TYPE "systemz-selectiondag-info"
-SystemZSelectionDAGInfo::SystemZSelectionDAGInfo(const DataLayout &DL)
- : TargetSelectionDAGInfo(&DL) {}
-
-SystemZSelectionDAGInfo::~SystemZSelectionDAGInfo() {
-}
-
// Decide whether it is best to use a loop or straight-line code for
// a block operation of Size bytes with source address Src and destination
// address Dest. Sequence is the opcode to use for straight-line code
diff --git a/lib/Target/SystemZ/SystemZSelectionDAGInfo.h b/lib/Target/SystemZ/SystemZSelectionDAGInfo.h
index a257d6b55494..246fa3e5e656 100644
--- a/lib/Target/SystemZ/SystemZSelectionDAGInfo.h
+++ b/lib/Target/SystemZ/SystemZSelectionDAGInfo.h
@@ -22,8 +22,7 @@ class SystemZTargetMachine;
class SystemZSelectionDAGInfo : public TargetSelectionDAGInfo {
public:
- explicit SystemZSelectionDAGInfo(const DataLayout &DL);
- ~SystemZSelectionDAGInfo();
+ explicit SystemZSelectionDAGInfo() = default;
SDValue EmitTargetCodeForMemcpy(SelectionDAG &DAG, SDLoc DL, SDValue Chain,
SDValue Dst, SDValue Src,
diff --git a/lib/Target/SystemZ/SystemZSubtarget.cpp b/lib/Target/SystemZ/SystemZSubtarget.cpp
index eb5e5c0b9ff8..0b49fcdd8f78 100644
--- a/lib/Target/SystemZ/SystemZSubtarget.cpp
+++ b/lib/Target/SystemZ/SystemZSubtarget.cpp
@@ -42,7 +42,7 @@ SystemZSubtarget::SystemZSubtarget(const Triple &TT, const std::string &CPU,
HasTransactionalExecution(false), HasProcessorAssist(false),
HasVector(false), TargetTriple(TT),
InstrInfo(initializeSubtargetDependencies(CPU, FS)), TLInfo(TM, *this),
- TSInfo(*TM.getDataLayout()), FrameLowering() {}
+ TSInfo(), FrameLowering() {}
// Return true if GV binds locally under reloc model RM.
static bool bindsLocally(const GlobalValue *GV, Reloc::Model RM) {
diff --git a/lib/Target/SystemZ/SystemZTargetTransformInfo.h b/lib/Target/SystemZ/SystemZTargetTransformInfo.h
index e9cabe968eea..4b80973ed879 100644
--- a/lib/Target/SystemZ/SystemZTargetTransformInfo.h
+++ b/lib/Target/SystemZ/SystemZTargetTransformInfo.h
@@ -29,7 +29,8 @@ class SystemZTTIImpl : public BasicTTIImplBase<SystemZTTIImpl> {
public:
explicit SystemZTTIImpl(const SystemZTargetMachine *TM, Function &F)
- : BaseT(TM), ST(TM->getSubtargetImpl(F)), TLI(ST->getTargetLowering()) {}
+ : BaseT(TM, F.getParent()->getDataLayout()), ST(TM->getSubtargetImpl(F)),
+ TLI(ST->getTargetLowering()) {}
// Provide value semantics. MSVC requires that we spell all of these out.
SystemZTTIImpl(const SystemZTTIImpl &Arg)
@@ -37,18 +38,6 @@ public:
SystemZTTIImpl(SystemZTTIImpl &&Arg)
: BaseT(std::move(static_cast<BaseT &>(Arg))), ST(std::move(Arg.ST)),
TLI(std::move(Arg.TLI)) {}
- SystemZTTIImpl &operator=(const SystemZTTIImpl &RHS) {
- BaseT::operator=(static_cast<const BaseT &>(RHS));
- ST = RHS.ST;
- TLI = RHS.TLI;
- return *this;
- }
- SystemZTTIImpl &operator=(SystemZTTIImpl &&RHS) {
- BaseT::operator=(std::move(static_cast<BaseT &>(RHS)));
- ST = std::move(RHS.ST);
- TLI = std::move(RHS.TLI);
- return *this;
- }
/// \name Scalar TTI Implementations
/// @{
diff --git a/lib/Target/TargetMachine.cpp b/lib/Target/TargetMachine.cpp
index 0b05303f71bf..83174c20c8e9 100644
--- a/lib/Target/TargetMachine.cpp
+++ b/lib/Target/TargetMachine.cpp
@@ -150,8 +150,9 @@ void TargetMachine::setOptLevel(CodeGenOpt::Level Level) const {
}
TargetIRAnalysis TargetMachine::getTargetIRAnalysis() {
- return TargetIRAnalysis(
- [this](Function &) { return TargetTransformInfo(getDataLayout()); });
+ return TargetIRAnalysis([this](Function &F) {
+ return TargetTransformInfo(F.getParent()->getDataLayout());
+ });
}
static bool canUsePrivateLabel(const MCAsmInfo &AsmInfo,
diff --git a/lib/Target/TargetSubtargetInfo.cpp b/lib/Target/TargetSubtargetInfo.cpp
index 87df7af84525..6a61fcdf0f86 100644
--- a/lib/Target/TargetSubtargetInfo.cpp
+++ b/lib/Target/TargetSubtargetInfo.cpp
@@ -19,7 +19,14 @@ using namespace llvm;
//---------------------------------------------------------------------------
// TargetSubtargetInfo Class
//
-TargetSubtargetInfo::TargetSubtargetInfo() {}
+TargetSubtargetInfo::TargetSubtargetInfo(
+ const Triple &TT, StringRef CPU, StringRef FS,
+ ArrayRef<SubtargetFeatureKV> PF, ArrayRef<SubtargetFeatureKV> PD,
+ const SubtargetInfoKV *ProcSched, const MCWriteProcResEntry *WPR,
+ const MCWriteLatencyEntry *WL, const MCReadAdvanceEntry *RA,
+ const InstrStage *IS, const unsigned *OC, const unsigned *FP)
+ : MCSubtargetInfo(TT, CPU, FS, PF, PD, ProcSched, WPR, WL, RA, IS, OC, FP) {
+}
TargetSubtargetInfo::~TargetSubtargetInfo() {}
diff --git a/lib/Target/WebAssembly/CMakeLists.txt b/lib/Target/WebAssembly/CMakeLists.txt
index df04c2a3460b..25de9eee0831 100644
--- a/lib/Target/WebAssembly/CMakeLists.txt
+++ b/lib/Target/WebAssembly/CMakeLists.txt
@@ -1,6 +1,7 @@
set(LLVM_TARGET_DEFINITIONS WebAssembly.td)
tablegen(LLVM WebAssemblyGenMCCodeEmitter.inc -gen-emitter)
+tablegen(LLVM WebAssemblyGenRegisterInfo.inc -gen-register-info)
tablegen(LLVM WebAssemblyGenSubtargetInfo.inc -gen-subtarget)
add_public_tablegen_target(WebAssemblyCommonTableGen)
diff --git a/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyMCTargetDesc.cpp b/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyMCTargetDesc.cpp
index d248556c62d7..224aa773a80e 100644
--- a/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyMCTargetDesc.cpp
+++ b/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyMCTargetDesc.cpp
@@ -29,6 +29,9 @@ using namespace llvm;
#define GET_SUBTARGETINFO_MC_DESC
#include "WebAssemblyGenSubtargetInfo.inc"
+#define GET_REGINFO_MC_DESC
+#include "WebAssemblyGenRegisterInfo.inc"
+
static MCAsmInfo *createWebAssemblyMCAsmInfo(const MCRegisterInfo &MRI,
const Triple &TT) {
MCAsmInfo *MAI = new WebAssemblyMCAsmInfo(TT);
diff --git a/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyMCTargetDesc.h b/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyMCTargetDesc.h
index 24893daec7ea..eebf5b72f62b 100644
--- a/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyMCTargetDesc.h
+++ b/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyMCTargetDesc.h
@@ -47,6 +47,9 @@ MCAsmBackend *createWebAssemblyAsmBackend(const Target &T,
// Defines symbolic names for WebAssembly registers. This defines a mapping from
// register name to register number.
//
+#define GET_REGINFO_ENUM
+#include "WebAssemblyGenRegisterInfo.inc"
+
#define GET_SUBTARGETINFO_ENUM
#include "WebAssemblyGenSubtargetInfo.inc"
diff --git a/lib/Target/WebAssembly/Makefile b/lib/Target/WebAssembly/Makefile
index 35d835c6506c..f102d73f6e86 100644
--- a/lib/Target/WebAssembly/Makefile
+++ b/lib/Target/WebAssembly/Makefile
@@ -12,7 +12,8 @@ LIBRARYNAME = LLVMWebAssemblyCodeGen
TARGET = WebAssembly
# Make sure that tblgen is run, first thing.
-BUILT_SOURCES = WebAssemblyGenSubtargetInfo.inc WebAssemblyGenMCCodeEmitter.inc
+BUILT_SOURCES = WebAssemblyGenRegisterInfo.inc WebAssemblyGenSubtargetInfo.inc \
+ WebAssemblyGenMCCodeEmitter.inc
DIRS = InstPrinter TargetInfo MCTargetDesc
diff --git a/lib/Target/WebAssembly/README.txt b/lib/Target/WebAssembly/README.txt
index 7a71060a638f..63e02c455895 100644
--- a/lib/Target/WebAssembly/README.txt
+++ b/lib/Target/WebAssembly/README.txt
@@ -12,4 +12,15 @@ binary encoding of WebAssembly itself:
* https://github.com/WebAssembly/design/blob/master/AstSemantics.md
* https://github.com/WebAssembly/design/blob/master/BinaryEncoding.md
+Interesting work that remains to be done:
+* Write a pass to restructure irreducible control flow. This needs to be done
+ before register allocation to be efficient, because it may duplicate basic
+ blocks and WebAssembly performs register allocation at a whole-function
+ level. Note that LLVM's GPU code has such a pass, but it linearizes control
+ flow (e.g., both sides of branches execute and are masked), which is
+ undesirable for WebAssembly.
+* Basic relooper to expose control flow as an AST.
+* Figure out how to properly use MC for virtual ISAs. This may require some
+ refactoring of MC.
+
//===---------------------------------------------------------------------===//
diff --git a/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp b/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp
index 4eec02efbd94..4184eb6dc5a6 100644
--- a/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp
+++ b/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp
@@ -38,6 +38,8 @@ WebAssemblyTargetLowering::WebAssemblyTargetLowering(
// WebAssembly does not produce floating-point exceptions on normal floating
// point operations.
setHasFloatingPointExceptions(false);
+ // We don't know the microarchitecture here, so just reduce register pressure.
+ setSchedulingPreference(Sched::RegPressure);
}
//===----------------------------------------------------------------------===//
diff --git a/lib/Target/WebAssembly/WebAssemblyInstrAtomics.td b/lib/Target/WebAssembly/WebAssemblyInstrAtomics.td
index 35e88eec8573..64415658ed81 100644
--- a/lib/Target/WebAssembly/WebAssemblyInstrAtomics.td
+++ b/lib/Target/WebAssembly/WebAssemblyInstrAtomics.td
@@ -6,9 +6,10 @@
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
-//
-// WebAssembly Atomic operand code-gen constructs.
-//
+///
+/// \file
+/// \brief WebAssembly Atomic operand code-gen constructs.
+///
//===----------------------------------------------------------------------===//
// TODO: Implement atomic instructions.
diff --git a/lib/Target/WebAssembly/WebAssemblyInstrCall.td b/lib/Target/WebAssembly/WebAssemblyInstrCall.td
new file mode 100644
index 000000000000..6b5b6cd54173
--- /dev/null
+++ b/lib/Target/WebAssembly/WebAssemblyInstrCall.td
@@ -0,0 +1,21 @@
+//===- WebAssemblyInstrCall.td-WebAssembly Call codegen support -*- tablegen -*-
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// \brief WebAssembly Call operand code-gen constructs.
+///
+//===----------------------------------------------------------------------===//
+
+/*
+ * TODO(jfb): Add the following.
+ *
+ * call_direct: call function directly
+ * call_indirect: call function indirectly
+ * addressof: obtain a function pointer value for a given function
+ */
diff --git a/lib/Target/WebAssembly/WebAssemblyInstrConv.td b/lib/Target/WebAssembly/WebAssemblyInstrConv.td
new file mode 100644
index 000000000000..3fa29061b1de
--- /dev/null
+++ b/lib/Target/WebAssembly/WebAssemblyInstrConv.td
@@ -0,0 +1,44 @@
+//===-- WebAssemblyInstrConv.td-WebAssembly Conversion support -*- tablegen -*-=
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// \brief WebAssembly datatype conversions, truncations, reinterpretations,
+/// promotions, and demotions operand code-gen constructs.
+///
+//===----------------------------------------------------------------------===//
+
+/*
+ * TODO(jfb): Add the following.
+ *
+ * int32.wrap[int64]: wrap a 64-bit integer to a 32-bit integer
+ * int32.trunc_signed[float32]: truncate a 32-bit float to a signed 32-bit integer
+ * int32.trunc_signed[float64]: truncate a 64-bit float to a signed 32-bit integer
+ * int32.trunc_unsigned[float32]: truncate a 32-bit float to an unsigned 32-bit integer
+ * int32.trunc_unsigned[float64]: truncate a 64-bit float to an unsigned 32-bit integer
+ * int32.reinterpret[float32]: reinterpret the bits of a 32-bit float as a 32-bit integer
+ * int64.extend_signed[int32]: extend a signed 32-bit integer to a 64-bit integer
+ * int64.extend_unsigned[int32]: extend an unsigned 32-bit integer to a 64-bit integer
+ * int64.trunc_signed[float32]: truncate a 32-bit float to a signed 64-bit integer
+ * int64.trunc_signed[float64]: truncate a 64-bit float to a signed 64-bit integer
+ * int64.trunc_unsigned[float32]: truncate a 32-bit float to an unsigned 64-bit integer
+ * int64.trunc_unsigned[float64]: truncate a 64-bit float to an unsigned 64-bit integer
+ * int64.reinterpret[float64]: reinterpret the bits of a 64-bit float as a 64-bit integer
+ * float32.demote[float64]: demote a 64-bit float to a 32-bit float
+ * float32.cvt_signed[int32]: convert a signed 32-bit integer to a 32-bit float
+ * float32.cvt_signed[int64]: convert a signed 64-bit integer to a 32-bit float
+ * float32.cvt_unsigned[int32]: convert an unsigned 32-bit integer to a 32-bit float
+ * float32.cvt_unsigned[int64]: convert an unsigned 64-bit integer to a 32-bit float
+ * float32.reinterpret[int32]: reinterpret the bits of a 32-bit integer as a 32-bit float
+ * float64.promote[float32]: promote a 32-bit float to a 64-bit float
+ * float64.cvt_signed[int32]: convert a signed 32-bit integer to a 64-bit float
+ * float64.cvt_signed[int64]: convert a signed 64-bit integer to a 64-bit float
+ * float64.cvt_unsigned[int32]: convert an unsigned 32-bit integer to a 64-bit float
+ * float64.cvt_unsigned[int64]: convert an unsigned 64-bit integer to a 64-bit float
+ * float64.reinterpret[int64]: reinterpret the bits of a 64-bit integer as a 64-bit float
+ */
diff --git a/lib/Target/WebAssembly/WebAssemblyInstrFloat.td b/lib/Target/WebAssembly/WebAssemblyInstrFloat.td
new file mode 100644
index 000000000000..30ef6339d65a
--- /dev/null
+++ b/lib/Target/WebAssembly/WebAssemblyInstrFloat.td
@@ -0,0 +1,44 @@
+// WebAssemblyInstrFloat.td-WebAssembly Float codegen support ---*- tablegen -*-
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// \brief WebAssembly Floating-point operand code-gen constructs.
+///
+//===----------------------------------------------------------------------===//
+
+defm FADD : BinaryFP<fadd>;
+defm FSUB : BinaryFP<fsub>;
+defm FMUL : BinaryFP<fmul>;
+defm FDIV : BinaryFP<fdiv>;
+defm FABS : UnaryFP<fabs>;
+defm FNEG : UnaryFP<fneg>;
+defm COPYSIGN : BinaryFP<fcopysign>;
+defm CEIL : UnaryFP<fceil>;
+defm FLOOR : UnaryFP<ffloor>;
+defm TRUNC : UnaryFP<ftrunc>;
+defm NEARESTINT : UnaryFP<fnearbyint>;
+
+/*
+ * TODO(jfb): Add the following for 32-bit and 64-bit.
+ *
+ * float32.eq: compare equal
+ * float32.lt: less than
+ * float32.le: less than or equal
+ * float32.gt: greater than
+ * float32.ge: greater than or equal
+ */
+
+defm SQRT : UnaryFP<fsqrt>;
+
+/*
+ * TODO(jfb): Add the following for 32-bit and 64-bit.
+ *
+ * float32.min: minimum (binary operator); if either operand is NaN, returns NaN
+ * float32.max: maximum (binary operator); if either operand is NaN, returns NaN
+ */
diff --git a/lib/Target/WebAssembly/WebAssemblyInstrFormats.td b/lib/Target/WebAssembly/WebAssemblyInstrFormats.td
index 8bbf3e9ec87b..513c36fa2ec2 100644
--- a/lib/Target/WebAssembly/WebAssemblyInstrFormats.td
+++ b/lib/Target/WebAssembly/WebAssemblyInstrFormats.td
@@ -6,9 +6,10 @@
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
-//
-// WebAssembly instruction format definitions.
-//
+///
+/// \file
+/// \brief WebAssembly instruction format definitions.
+///
//===----------------------------------------------------------------------===//
// WebAssembly Instruction Format
@@ -26,3 +27,29 @@ class I<dag oops, dag iops, list<dag> pattern, string cstr = "">
dag InOperandList = iops;
let Pattern = pattern;
}
+
+// Unary and binary instructions for the local types that WebAssembly supports.
+multiclass UnaryInt<SDNode node> {
+ def _I32 : I<(outs Int32:$dst), (ins Int32:$src),
+ [(set Int32:$dst, (node Int32:$src))]>;
+ def _I64 : I<(outs Int64:$dst), (ins Int64:$src),
+ [(set Int64:$dst, (node Int64:$src))]>;
+}
+multiclass BinaryInt<SDNode node> {
+ def _I32 : I<(outs Int32:$dst), (ins Int32:$lhs, Int32:$rhs),
+ [(set Int32:$dst, (node Int32:$lhs, Int32:$rhs))]>;
+ def _I64 : I<(outs Int64:$dst), (ins Int64:$lhs, Int64:$rhs),
+ [(set Int64:$dst, (node Int64:$lhs, Int64:$rhs))]>;
+}
+multiclass UnaryFP<SDNode node> {
+ def _F32 : I<(outs Float32:$dst), (ins Float32:$src),
+ [(set Float32:$dst, (node Float32:$src))]>;
+ def _F64 : I<(outs Float64:$dst), (ins Float64:$src),
+ [(set Float64:$dst, (node Float64:$src))]>;
+}
+multiclass BinaryFP<SDNode node> {
+ def _F32 : I<(outs Float32:$dst), (ins Float32:$lhs, Float32:$rhs),
+ [(set Float32:$dst, (node Float32:$lhs, Float32:$rhs))]>;
+ def _F64 : I<(outs Float64:$dst), (ins Float64:$lhs, Float64:$rhs),
+ [(set Float64:$dst, (node Float64:$lhs, Float64:$rhs))]>;
+}
diff --git a/lib/Target/WebAssembly/WebAssemblyInstrInfo.td b/lib/Target/WebAssembly/WebAssemblyInstrInfo.td
index 142eccfbcaa5..fe3ca76dc08a 100644
--- a/lib/Target/WebAssembly/WebAssemblyInstrInfo.td
+++ b/lib/Target/WebAssembly/WebAssemblyInstrInfo.td
@@ -6,9 +6,10 @@
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
-//
-// WebAssembly Instruction definitions.
-//
+///
+/// \file
+/// \brief WebAssembly Instruction definitions.
+///
//===----------------------------------------------------------------------===//
//===----------------------------------------------------------------------===//
@@ -32,6 +33,13 @@ def HasSIMD128 : Predicate<"Subtarget->hasSIMD128()">,
// WebAssembly-specific Operands.
//===----------------------------------------------------------------------===//
+/*
+ * TODO(jfb): Add the following.
+ *
+ * get_local: read the current value of a local variable
+ * set_local: set the current value of a local variable
+ */
+
//===----------------------------------------------------------------------===//
// WebAssembly Instruction Format Definitions.
//===----------------------------------------------------------------------===//
@@ -42,5 +50,10 @@ include "WebAssemblyInstrFormats.td"
// Additional sets of instructions.
//===----------------------------------------------------------------------===//
+include "WebAssemblyInstrMemory.td"
+include "WebAssemblyInstrCall.td"
+include "WebAssemblyInstrInteger.td"
+include "WebAssemblyInstrFloat.td"
+include "WebAssemblyInstrConv.td"
include "WebAssemblyInstrAtomics.td"
include "WebAssemblyInstrSIMD.td"
diff --git a/lib/Target/WebAssembly/WebAssemblyInstrInteger.td b/lib/Target/WebAssembly/WebAssemblyInstrInteger.td
new file mode 100644
index 000000000000..5f60fe81b1a2
--- /dev/null
+++ b/lib/Target/WebAssembly/WebAssemblyInstrInteger.td
@@ -0,0 +1,45 @@
+// WebAssemblyInstrInteger.td-WebAssembly Integer codegen -------*- tablegen -*-
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// \brief WebAssembly Integer operand code-gen constructs.
+///
+//===----------------------------------------------------------------------===//
+
+defm ADD : BinaryInt<add>;
+defm SUB : BinaryInt<sub>;
+defm MUL : BinaryInt<mul>;
+defm SDIV : BinaryInt<sdiv>;
+defm UDIV : BinaryInt<udiv>;
+defm SREM : BinaryInt<srem>;
+defm UREM : BinaryInt<urem>;
+defm AND : BinaryInt<and>;
+defm IOR : BinaryInt<or>;
+defm XOR : BinaryInt<xor>;
+defm SHL : BinaryInt<shl>;
+defm SHR : BinaryInt<srl>;
+defm SAR : BinaryInt<sra>;
+
+/*
+ * TODO(jfb): Add the following for 32-bit and 64-bit.
+ *
+ * int32.eq: sign-agnostic compare equal
+ * int32.slt: signed less than
+ * int32.sle: signed less than or equal
+ * int32.ult: unsigned less than
+ * int32.ule: unsigned less than or equal
+ * int32.sgt: signed greater than
+ * int32.sge: signed greater than or equal
+ * int32.ugt: unsigned greater than
+ * int32.uge: unsigned greater than or equal
+ */
+
+defm CLZ : UnaryInt<ctlz>;
+defm CTZ : UnaryInt<cttz>;
+defm POPCNT : UnaryInt<ctpop>;
diff --git a/lib/Target/WebAssembly/WebAssemblyInstrMemory.td b/lib/Target/WebAssembly/WebAssemblyInstrMemory.td
new file mode 100644
index 000000000000..5ab40e826caa
--- /dev/null
+++ b/lib/Target/WebAssembly/WebAssemblyInstrMemory.td
@@ -0,0 +1,46 @@
+// WebAssemblyInstrMemory.td-WebAssembly Memory codegen support -*- tablegen -*-
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// \brief WebAssembly Memory operand code-gen constructs.
+///
+//===----------------------------------------------------------------------===//
+
+/*
+ * TODO(jfb): Add the following.
+ * Each has optional alignment and immediate byte offset.
+ *
+ * int32.load_sx[int8]: sign-extend to int32
+ * int32.load_sx[int16]: sign-extend to int32
+ * int32.load_zx[int8]: zero-extend to int32
+ * int32.load_zx[int16]: zero-extend to int32
+ * int32.load[int32]: (no conversion)
+ * int64.load_sx[int8]: sign-extend to int64
+ * int64.load_sx[int16]: sign-extend to int64
+ * int64.load_sx[int32]: sign-extend to int64
+ * int64.load_zx[int8]: zero-extend to int64
+ * int64.load_zx[int16]: zero-extend to int64
+ * int64.load_zx[int32]: zero-extend to int64
+ * int64.load[int64]: (no conversion)
+ * float32.load[float32]: (no conversion)
+ * float64.load[float64]: (no conversion)
+ *
+ * int32.store[int8]: wrap int32 to int8
+ * int32.store[int16]: wrap int32 to int16
+ * int32.store[int32]: (no conversion)
+ * int64.store[int8]: wrap int64 to int8
+ * int64.store[int16]: wrap int64 to int16
+ * int64.store[int32]: wrap int64 to int32
+ * int64.store[int64]: (no conversion)
+ * float32.store[float32]: (no conversion)
+ * float64.store[float64]: (no conversion)
+ *
+ * load_global: load the value of a given global variable
+ * store_global: store a given value to a given global variable
+ */
diff --git a/lib/Target/WebAssembly/WebAssemblyInstrSIMD.td b/lib/Target/WebAssembly/WebAssemblyInstrSIMD.td
index e25483ad3f7a..3e29906219d2 100644
--- a/lib/Target/WebAssembly/WebAssemblyInstrSIMD.td
+++ b/lib/Target/WebAssembly/WebAssemblyInstrSIMD.td
@@ -6,9 +6,10 @@
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
-//
-// WebAssembly SIMD operand code-gen constructs.
-//
+///
+/// \file
+/// \brief WebAssembly SIMD operand code-gen constructs.
+///
//===----------------------------------------------------------------------===//
// TODO: Implement SIMD instructions.
diff --git a/lib/Target/WebAssembly/WebAssemblyRegisterInfo.cpp b/lib/Target/WebAssembly/WebAssemblyRegisterInfo.cpp
index ad24c90af6a2..385c40bf6693 100644
--- a/lib/Target/WebAssembly/WebAssemblyRegisterInfo.cpp
+++ b/lib/Target/WebAssembly/WebAssemblyRegisterInfo.cpp
@@ -30,4 +30,58 @@ using namespace llvm;
#define DEBUG_TYPE "wasm-reg-info"
-WebAssemblyRegisterInfo::WebAssemblyRegisterInfo(const Triple &TT) : TT(TT) {}
+#define GET_REGINFO_TARGET_DESC
+#include "WebAssemblyGenRegisterInfo.inc"
+
+WebAssemblyRegisterInfo::WebAssemblyRegisterInfo(const Triple &TT)
+ : WebAssemblyGenRegisterInfo(0), TT(TT) {}
+
+const MCPhysReg *
+WebAssemblyRegisterInfo::getCalleeSavedRegs(const MachineFunction *) const {
+ static const MCPhysReg CalleeSavedRegs[] = {0};
+ return CalleeSavedRegs;
+}
+
+BitVector
+WebAssemblyRegisterInfo::getReservedRegs(const MachineFunction &MF) const {
+ BitVector Reserved(getNumRegs());
+ for (auto Reg : {WebAssembly::SP32, WebAssembly::SP64, WebAssembly::FP32,
+ WebAssembly::FP64})
+ Reserved.set(Reg);
+ return Reserved;
+}
+
+void WebAssemblyRegisterInfo::eliminateFrameIndex(
+ MachineBasicBlock::iterator II, int SPAdj, unsigned FIOperandNum,
+ RegScavenger *RS) const {
+ llvm_unreachable("WebAssemblyRegisterInfo::eliminateFrameIndex"); // FIXME
+}
+
+unsigned
+WebAssemblyRegisterInfo::getFrameRegister(const MachineFunction &MF) const {
+ static const unsigned Regs[2][2] = {
+ /* !isArch64Bit isArch64Bit */
+ /* !hasFP */ {WebAssembly::SP32, WebAssembly::SP64},
+ /* hasFP */ {WebAssembly::FP32, WebAssembly::FP64}};
+ const WebAssemblyFrameLowering *TFI = getFrameLowering(MF);
+ return Regs[TFI->hasFP(MF)][TT.isArch64Bit()];
+}
+
+bool WebAssemblyRegisterInfo::canRealignStack(const MachineFunction &MF) const {
+ return !MF.getFunction()->hasFnAttribute("no-realign-stack");
+}
+
+// FIXME: share this with other backends with identical implementation?
+bool WebAssemblyRegisterInfo::needsStackRealignment(
+ const MachineFunction &MF) const {
+ const MachineFrameInfo *MFI = MF.getFrameInfo();
+ const WebAssemblyFrameLowering *TFI = getFrameLowering(MF);
+ const Function *F = MF.getFunction();
+ unsigned StackAlign = TFI->getStackAlignment();
+ bool requiresRealignment =
+ ((MFI->getMaxAlignment() > StackAlign) ||
+ F->getAttributes().hasAttribute(AttributeSet::FunctionIndex,
+ Attribute::StackAlignment));
+
+ return requiresRealignment && canRealignStack(MF);
+}
diff --git a/lib/Target/WebAssembly/WebAssemblyRegisterInfo.h b/lib/Target/WebAssembly/WebAssemblyRegisterInfo.h
index 55300287a51e..dbdb9d0457af 100644
--- a/lib/Target/WebAssembly/WebAssemblyRegisterInfo.h
+++ b/lib/Target/WebAssembly/WebAssemblyRegisterInfo.h
@@ -16,6 +16,9 @@
#ifndef LLVM_LIB_TARGET_WEBASSEMBLY_WEBASSEMBLYREGISTERINFO_H
#define LLVM_LIB_TARGET_WEBASSEMBLY_WEBASSEMBLYREGISTERINFO_H
+#define GET_REGINFO_HEADER
+#include "WebAssemblyGenRegisterInfo.inc"
+
namespace llvm {
class MachineFunction;
@@ -23,11 +26,25 @@ class RegScavenger;
class TargetRegisterClass;
class Triple;
-class WebAssemblyRegisterInfo final {
+class WebAssemblyRegisterInfo final : public WebAssemblyGenRegisterInfo {
const Triple &TT;
public:
explicit WebAssemblyRegisterInfo(const Triple &TT);
+
+ // Code Generation virtual methods.
+ const MCPhysReg *getCalleeSavedRegs(const MachineFunction *MF) const override;
+ BitVector getReservedRegs(const MachineFunction &MF) const override;
+ void eliminateFrameIndex(MachineBasicBlock::iterator MI, int SPAdj,
+ unsigned FIOperandNum,
+ RegScavenger *RS = nullptr) const override;
+
+ // Debug information queries.
+ unsigned getFrameRegister(const MachineFunction &MF) const override;
+
+ // Base pointer (stack realignment) support.
+ bool canRealignStack(const MachineFunction &MF) const;
+ bool needsStackRealignment(const MachineFunction &MF) const override;
};
} // end namespace llvm
diff --git a/lib/Target/WebAssembly/WebAssemblyRegisterInfo.td b/lib/Target/WebAssembly/WebAssemblyRegisterInfo.td
index 7b3d636a2605..2ba42eb94a40 100644
--- a/lib/Target/WebAssembly/WebAssemblyRegisterInfo.td
+++ b/lib/Target/WebAssembly/WebAssemblyRegisterInfo.td
@@ -6,10 +6,11 @@
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
-//
-// This file describes the WebAssembly register classes and some nominal
-// physical registers.
-//
+///
+/// \file
+/// \brief This file describes the WebAssembly register classes and some nominal
+/// physical registers.
+///
//===----------------------------------------------------------------------===//
class WebAssemblyReg<string n> : Register<n> {
@@ -23,6 +24,31 @@ class WebAssemblyRegClass<list<ValueType> regTypes, int alignment, dag regList>
// Registers
//===----------------------------------------------------------------------===//
+// Special registers used as the frame and stack pointer.
+//
+// WebAssembly may someday support mixed 32-bit and 64-bit heaps in the same
+// application, which would require separate-width FP and SP.
+def FP32 : WebAssemblyReg<"%FP32">;
+def FP64 : WebAssemblyReg<"%FP64">;
+def SP32 : WebAssemblyReg<"%SP32">;
+def SP64 : WebAssemblyReg<"%SP64">;
+
+// TODO(jfb) The following comes from NVPTX. Is it really needed, or can we do
+// away with it? Try deleting once the backend works.
+// WebAssembly uses virtual registers, but the backend defines a few physical
+// registers here to keep SDAG and the MachineInstr layers happy.
+foreach i = 0-4 in {
+ def I#i : WebAssemblyReg<"%i."#i>; // i32
+ def L#i : WebAssemblyReg<"%l."#i>; // i64
+ def F#i : WebAssemblyReg<"%f."#i>; // f32
+ def D#i : WebAssemblyReg<"%d."#i>; // f64
+}
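+// The foreach above expands to the physical registers I0-I4, L0-L4, F0-F4,
+// and D0-D4.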
+
//===----------------------------------------------------------------------===//
// Register classes
//===----------------------------------------------------------------------===//
+
+def Int32 : WebAssemblyRegClass<[i32], 32, (add (sequence "I%u", 0, 4), SP32)>;
+def Int64 : WebAssemblyRegClass<[i64], 64, (add (sequence "L%u", 0, 4), SP64)>;
+def Float32 : WebAssemblyRegClass<[f32], 32, (add (sequence "F%u", 0, 4))>;
+def Float64 : WebAssemblyRegClass<[f64], 64, (add (sequence "D%u", 0, 4))>;
diff --git a/lib/Target/WebAssembly/WebAssemblySelectionDAGInfo.cpp b/lib/Target/WebAssembly/WebAssemblySelectionDAGInfo.cpp
index cfd1bafff236..fae9c6100510 100644
--- a/lib/Target/WebAssembly/WebAssemblySelectionDAGInfo.cpp
+++ b/lib/Target/WebAssembly/WebAssemblySelectionDAGInfo.cpp
@@ -17,7 +17,4 @@ using namespace llvm;
#define DEBUG_TYPE "wasm-selectiondag-info"
-WebAssemblySelectionDAGInfo::WebAssemblySelectionDAGInfo(const DataLayout *DL)
- : TargetSelectionDAGInfo(DL) {}
-
WebAssemblySelectionDAGInfo::~WebAssemblySelectionDAGInfo() {}
diff --git a/lib/Target/WebAssembly/WebAssemblySelectionDAGInfo.h b/lib/Target/WebAssembly/WebAssemblySelectionDAGInfo.h
index 03e8d393558d..13d96671276d 100644
--- a/lib/Target/WebAssembly/WebAssemblySelectionDAGInfo.h
+++ b/lib/Target/WebAssembly/WebAssemblySelectionDAGInfo.h
@@ -22,7 +22,6 @@ namespace llvm {
class WebAssemblySelectionDAGInfo final : public TargetSelectionDAGInfo {
public:
- explicit WebAssemblySelectionDAGInfo(const DataLayout *DL);
~WebAssemblySelectionDAGInfo() override;
};
diff --git a/lib/Target/WebAssembly/WebAssemblySubtarget.cpp b/lib/Target/WebAssembly/WebAssemblySubtarget.cpp
index addea8e3cc36..3d9e7aacbfbf 100644
--- a/lib/Target/WebAssembly/WebAssemblySubtarget.cpp
+++ b/lib/Target/WebAssembly/WebAssemblySubtarget.cpp
@@ -42,7 +42,7 @@ WebAssemblySubtarget::WebAssemblySubtarget(const Triple &TT,
const TargetMachine &TM)
: WebAssemblyGenSubtargetInfo(TT, CPU, FS), HasSIMD128(false),
CPUString(CPU), TargetTriple(TT), FrameLowering(),
- InstrInfo(initializeSubtargetDependencies(FS)),
- TSInfo(TM.getDataLayout()), TLInfo(TM, *this) {}
+ InstrInfo(initializeSubtargetDependencies(FS)), TSInfo(),
+ TLInfo(TM, *this) {}
bool WebAssemblySubtarget::enableMachineScheduler() const { return true; }
diff --git a/lib/Target/WebAssembly/WebAssemblyTargetTransformInfo.h b/lib/Target/WebAssembly/WebAssemblyTargetTransformInfo.h
index 08bd88c06985..7ffb6047b963 100644
--- a/lib/Target/WebAssembly/WebAssemblyTargetTransformInfo.h
+++ b/lib/Target/WebAssembly/WebAssemblyTargetTransformInfo.h
@@ -31,7 +31,6 @@ class WebAssemblyTTIImpl final : public BasicTTIImplBase<WebAssemblyTTIImpl> {
typedef TargetTransformInfo TTI;
friend BaseT;
- const WebAssemblyTargetMachine *TM;
const WebAssemblySubtarget *ST;
const WebAssemblyTargetLowering *TLI;
@@ -40,30 +39,15 @@ class WebAssemblyTTIImpl final : public BasicTTIImplBase<WebAssemblyTTIImpl> {
public:
WebAssemblyTTIImpl(const WebAssemblyTargetMachine *TM, Function &F)
- : BaseT(TM), TM(TM), ST(TM->getSubtargetImpl(F)),
+ : BaseT(TM, F.getParent()->getDataLayout()), ST(TM->getSubtargetImpl(F)),
TLI(ST->getTargetLowering()) {}
// Provide value semantics. MSVC requires that we spell all of these out.
WebAssemblyTTIImpl(const WebAssemblyTTIImpl &Arg)
- : BaseT(static_cast<const BaseT &>(Arg)), TM(Arg.TM), ST(Arg.ST),
- TLI(Arg.TLI) {}
+ : BaseT(static_cast<const BaseT &>(Arg)), ST(Arg.ST), TLI(Arg.TLI) {}
WebAssemblyTTIImpl(WebAssemblyTTIImpl &&Arg)
- : BaseT(std::move(static_cast<BaseT &>(Arg))), TM(std::move(Arg.TM)),
- ST(std::move(Arg.ST)), TLI(std::move(Arg.TLI)) {}
- WebAssemblyTTIImpl &operator=(const WebAssemblyTTIImpl &RHS) {
- BaseT::operator=(static_cast<const BaseT &>(RHS));
- TM = RHS.TM;
- ST = RHS.ST;
- TLI = RHS.TLI;
- return *this;
- }
- WebAssemblyTTIImpl &operator=(WebAssemblyTTIImpl &&RHS) {
- BaseT::operator=(std::move(static_cast<BaseT &>(RHS)));
- TM = std::move(RHS.TM);
- ST = std::move(RHS.ST);
- TLI = std::move(RHS.TLI);
- return *this;
- }
+ : BaseT(std::move(static_cast<BaseT &>(Arg))), ST(std::move(Arg.ST)),
+ TLI(std::move(Arg.TLI)) {}
/// \name Scalar TTI Implementations
/// @{
diff --git a/lib/Target/X86/InstPrinter/X86InstComments.cpp b/lib/Target/X86/InstPrinter/X86InstComments.cpp
index 3cad9fa1e2ae..91b144a44824 100644
--- a/lib/Target/X86/InstPrinter/X86InstComments.cpp
+++ b/lib/Target/X86/InstPrinter/X86InstComments.cpp
@@ -878,6 +878,29 @@ bool llvm::EmitAnyX86InstComments(const MCInst *MI, raw_ostream &OS,
DestName = getRegName(MI->getOperand(0).getReg());
break;
+ case X86::EXTRQI:
+ if (MI->getOperand(2).isImm() &&
+ MI->getOperand(3).isImm())
+ DecodeEXTRQIMask(MI->getOperand(2).getImm(),
+ MI->getOperand(3).getImm(),
+ ShuffleMask);
+
+ DestName = getRegName(MI->getOperand(0).getReg());
+ Src1Name = getRegName(MI->getOperand(1).getReg());
+ break;
+
+ case X86::INSERTQI:
+ if (MI->getOperand(3).isImm() &&
+ MI->getOperand(4).isImm())
+ DecodeINSERTQIMask(MI->getOperand(3).getImm(),
+ MI->getOperand(4).getImm(),
+ ShuffleMask);
+
+ DestName = getRegName(MI->getOperand(0).getReg());
+ Src1Name = getRegName(MI->getOperand(1).getReg());
+ Src2Name = getRegName(MI->getOperand(2).getReg());
+ break;
+
case X86::PMOVZXBWrr:
case X86::PMOVZXBDrr:
case X86::PMOVZXBQrr:
diff --git a/lib/Target/X86/MCTargetDesc/X86AsmBackend.cpp b/lib/Target/X86/MCTargetDesc/X86AsmBackend.cpp
index 3e0dc1424609..629802f5dc5e 100644
--- a/lib/Target/X86/MCTargetDesc/X86AsmBackend.cpp
+++ b/lib/Target/X86/MCTargetDesc/X86AsmBackend.cpp
@@ -220,7 +220,6 @@ static unsigned getRelaxedOpcodeArith(unsigned Op) {
case X86::PUSH32i8: return X86::PUSHi32;
case X86::PUSH16i8: return X86::PUSHi16;
case X86::PUSH64i8: return X86::PUSH64i32;
- case X86::PUSH64i16: return X86::PUSH64i32;
}
}
diff --git a/lib/Target/X86/MCTargetDesc/X86ELFRelocationInfo.cpp b/lib/Target/X86/MCTargetDesc/X86ELFRelocationInfo.cpp
index 89f394582631..ddb764facdbf 100644
--- a/lib/Target/X86/MCTargetDesc/X86ELFRelocationInfo.cpp
+++ b/lib/Target/X86/MCTargetDesc/X86ELFRelocationInfo.cpp
@@ -34,14 +34,16 @@ public:
report_fatal_error(EC.message());
StringRef SymName = *SymNameOrErr;
- uint64_t SymAddr; SymI->getAddress(SymAddr);
+ ErrorOr<uint64_t> SymAddr = SymI->getAddress();
+ if (std::error_code EC = SymAddr.getError())
+ report_fatal_error(EC.message());
uint64_t SymSize = SymI->getSize();
int64_t Addend = *ELFRelocationRef(Rel).getAddend();
MCSymbol *Sym = Ctx.getOrCreateSymbol(SymName);
// FIXME: check that the value is actually the same.
if (!Sym->isVariable())
- Sym->setVariableValue(MCConstantExpr::create(SymAddr, Ctx));
+ Sym->setVariableValue(MCConstantExpr::create(*SymAddr, Ctx));
const MCExpr *Expr = nullptr;
// If hasAddend is true, then we need to add Addend (r_addend) to Expr.
diff --git a/lib/Target/X86/MCTargetDesc/X86MCTargetDesc.cpp b/lib/Target/X86/MCTargetDesc/X86MCTargetDesc.cpp
index 431010d4cbc2..83b4091d7665 100644
--- a/lib/Target/X86/MCTargetDesc/X86MCTargetDesc.cpp
+++ b/lib/Target/X86/MCTargetDesc/X86MCTargetDesc.cpp
@@ -88,9 +88,7 @@ MCSubtargetInfo *X86_MC::createX86MCSubtargetInfo(const Triple &TT,
if (CPUName.empty())
CPUName = "generic";
- MCSubtargetInfo *X = new MCSubtargetInfo();
- InitX86MCSubtargetInfo(X, TT, CPUName, ArchFS);
- return X;
+ return createX86MCSubtargetInfoImpl(TT, CPUName, ArchFS);
}
static MCInstrInfo *createX86MCInstrInfo() {
@@ -99,17 +97,14 @@ static MCInstrInfo *createX86MCInstrInfo() {
return X;
}
-static MCRegisterInfo *createX86MCRegisterInfo(StringRef TT) {
- Triple TheTriple(TT);
- unsigned RA = (TheTriple.getArch() == Triple::x86_64)
- ? X86::RIP // Should have dwarf #16.
- : X86::EIP; // Should have dwarf #8.
+static MCRegisterInfo *createX86MCRegisterInfo(const Triple &TT) {
+ unsigned RA = (TT.getArch() == Triple::x86_64)
+ ? X86::RIP // Should have dwarf #16.
+ : X86::EIP; // Should have dwarf #8.
MCRegisterInfo *X = new MCRegisterInfo();
- InitX86MCRegisterInfo(X, RA,
- X86_MC::getDwarfRegFlavour(TheTriple, false),
- X86_MC::getDwarfRegFlavour(TheTriple, true),
- RA);
+ InitX86MCRegisterInfo(X, RA, X86_MC::getDwarfRegFlavour(TT, false),
+ X86_MC::getDwarfRegFlavour(TT, true), RA);
X86_MC::InitLLVM2SEHRegisterMapping(X);
return X;
}
@@ -156,24 +151,23 @@ static MCAsmInfo *createX86MCAsmInfo(const MCRegisterInfo &MRI,
return MAI;
}
-static MCCodeGenInfo *createX86MCCodeGenInfo(StringRef TT, Reloc::Model RM,
+static MCCodeGenInfo *createX86MCCodeGenInfo(const Triple &TT, Reloc::Model RM,
CodeModel::Model CM,
CodeGenOpt::Level OL) {
MCCodeGenInfo *X = new MCCodeGenInfo();
- Triple T(TT);
- bool is64Bit = T.getArch() == Triple::x86_64;
+ bool is64Bit = TT.getArch() == Triple::x86_64;
if (RM == Reloc::Default) {
// Darwin defaults to PIC in 64 bit mode and dynamic-no-pic in 32 bit mode.
// Win64 requires rip-rel addressing, thus we force it to PIC. Otherwise we
// use static relocation model by default.
- if (T.isOSDarwin()) {
+ if (TT.isOSDarwin()) {
if (is64Bit)
RM = Reloc::PIC_;
else
RM = Reloc::DynamicNoPIC;
- } else if (T.isOSWindows() && is64Bit)
+ } else if (TT.isOSWindows() && is64Bit)
RM = Reloc::PIC_;
else
RM = Reloc::Static;
@@ -186,13 +180,13 @@ static MCCodeGenInfo *createX86MCCodeGenInfo(StringRef TT, Reloc::Model RM,
if (RM == Reloc::DynamicNoPIC) {
if (is64Bit)
RM = Reloc::PIC_;
- else if (!T.isOSDarwin())
+ else if (!TT.isOSDarwin())
RM = Reloc::Static;
}
// If we are on Darwin, disallow static relocation model in X86-64 mode, since
// the Mach-O file format doesn't support it.
- if (RM == Reloc::Static && T.isOSDarwin() && is64Bit)
+ if (RM == Reloc::Static && TT.isOSDarwin() && is64Bit)
RM = Reloc::PIC_;
// For static codegen, if we're not already set, use Small codegen.
diff --git a/lib/Target/X86/MCTargetDesc/X86MachORelocationInfo.cpp b/lib/Target/X86/MCTargetDesc/X86MachORelocationInfo.cpp
index c9479b62f7b6..9bfe999424fa 100644
--- a/lib/Target/X86/MCTargetDesc/X86MachORelocationInfo.cpp
+++ b/lib/Target/X86/MCTargetDesc/X86MachORelocationInfo.cpp
@@ -34,7 +34,7 @@ public:
if (std::error_code EC = SymNameOrErr.getError())
report_fatal_error(EC.message());
StringRef SymName = *SymNameOrErr;
- uint64_t SymAddr; SymI->getAddress(SymAddr);
+ uint64_t SymAddr = SymI->getValue();
any_relocation_info RE = Obj->getRelocation(Rel.getRawDataRefImpl());
bool isPCRel = Obj->getAnyRelocationPCRel(RE);
@@ -90,8 +90,7 @@ public:
const MCExpr *LHS = MCSymbolRefExpr::create(Sym, Ctx);
symbol_iterator RSymI = Rel.getSymbol();
- uint64_t RSymAddr;
- RSymI->getAddress(RSymAddr);
+ uint64_t RSymAddr = RSymI->getValue();
ErrorOr<StringRef> RSymName = RSymI->getName();
if (std::error_code EC = RSymName.getError())
report_fatal_error(EC.message());
diff --git a/lib/Target/X86/Utils/X86ShuffleDecode.cpp b/lib/Target/X86/Utils/X86ShuffleDecode.cpp
index ef3318ba7580..cae865a40819 100644
--- a/lib/Target/X86/Utils/X86ShuffleDecode.cpp
+++ b/lib/Target/X86/Utils/X86ShuffleDecode.cpp
@@ -255,15 +255,13 @@ void DecodeUNPCKLMask(MVT VT, SmallVectorImpl<int> &ShuffleMask) {
void DecodeVPERM2X128Mask(MVT VT, unsigned Imm,
SmallVectorImpl<int> &ShuffleMask) {
- if (Imm & 0x88)
- return; // Not a shuffle
-
unsigned HalfSize = VT.getVectorNumElements() / 2;
for (unsigned l = 0; l != 2; ++l) {
- unsigned HalfBegin = ((Imm >> (l * 4)) & 0x3) * HalfSize;
+ unsigned HalfMask = Imm >> (l * 4);
+ unsigned HalfBegin = (HalfMask & 0x3) * HalfSize;
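+    // If bit 3 of this nibble is set, this half of the result is zeroed
+    // rather than taken from either source.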
for (unsigned i = HalfBegin, e = HalfBegin + HalfSize; i != e; ++i)
- ShuffleMask.push_back(i);
+ ShuffleMask.push_back(HalfMask & 8 ? SM_SentinelZero : (int)i);
}
}
@@ -431,4 +429,78 @@ void DecodeScalarMoveMask(MVT VT, bool IsLoad, SmallVectorImpl<int> &Mask) {
for (unsigned i = 1; i < NumElts; i++)
Mask.push_back(IsLoad ? static_cast<int>(SM_SentinelZero) : i);
}
+
+void DecodeEXTRQIMask(int Len, int Idx,
+ SmallVectorImpl<int> &ShuffleMask) {
+ // Only the bottom 6 bits are valid for each immediate.
+ Len &= 0x3F;
+ Idx &= 0x3F;
+
+ // We can only decode this bit extraction instruction as a shuffle if both the
+ // length and index work with whole bytes.
+ if (0 != (Len % 8) || 0 != (Idx % 8))
+ return;
+
+ // A length of zero is equivalent to a bit length of 64.
+ if (Len == 0)
+ Len = 64;
+
+  // If the length + index exceeds the bottom 64 bits, the result is undefined.
+ if ((Len + Idx) > 64) {
+ ShuffleMask.append(16, SM_SentinelUndef);
+ return;
+ }
+
+  // Convert length and index to work with bytes.
+ Len /= 8;
+ Idx /= 8;
+
+ // EXTRQ: Extract Len bytes starting from Idx. Zero pad the remaining bytes
+ // of the lower 64-bits. The upper 64-bits are undefined.
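+  // For example, with Len = 2 and Idx = 1 (immediates of 16 and 8 bits), the
+  // decoded mask is <1, 2, Z, Z, Z, Z, Z, Z, U, U, U, U, U, U, U, U>, where
+  // Z is SM_SentinelZero and U is SM_SentinelUndef.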
+ for (int i = 0; i != Len; ++i)
+ ShuffleMask.push_back(i + Idx);
+ for (int i = Len; i != 8; ++i)
+ ShuffleMask.push_back(SM_SentinelZero);
+ for (int i = 8; i != 16; ++i)
+ ShuffleMask.push_back(SM_SentinelUndef);
+}
+
+void DecodeINSERTQIMask(int Len, int Idx,
+ SmallVectorImpl<int> &ShuffleMask) {
+ // Only the bottom 6 bits are valid for each immediate.
+ Len &= 0x3F;
+ Idx &= 0x3F;
+
+ // We can only decode this bit insertion instruction as a shuffle if both the
+ // length and index work with whole bytes.
+ if (0 != (Len % 8) || 0 != (Idx % 8))
+ return;
+
+ // A length of zero is equivalent to a bit length of 64.
+ if (Len == 0)
+ Len = 64;
+
+  // If the length + index exceeds the bottom 64 bits, the result is undefined.
+ if ((Len + Idx) > 64) {
+ ShuffleMask.append(16, SM_SentinelUndef);
+ return;
+ }
+
+  // Convert length and index to work with bytes.
+ Len /= 8;
+ Idx /= 8;
+
+ // INSERTQ: Extract lowest Len bytes from lower half of second source and
+ // insert over first source starting at Idx byte. The upper 64-bits are
+ // undefined.
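+  // For example, with Len = 2 and Idx = 1 (immediates of 16 and 8 bits), the
+  // decoded mask is <0, 16, 17, 3, 4, 5, 6, 7, U, U, U, U, U, U, U, U>, where
+  // elements 16 and up index the second source and U is SM_SentinelUndef.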
+ for (int i = 0; i != Idx; ++i)
+ ShuffleMask.push_back(i);
+ for (int i = 0; i != Len; ++i)
+ ShuffleMask.push_back(i + 16);
+ for (int i = Idx + Len; i != 8; ++i)
+ ShuffleMask.push_back(i);
+ for (int i = 8; i != 16; ++i)
+ ShuffleMask.push_back(SM_SentinelUndef);
+}
+
} // llvm namespace
diff --git a/lib/Target/X86/Utils/X86ShuffleDecode.h b/lib/Target/X86/Utils/X86ShuffleDecode.h
index 14b69434806e..3d10d18e860e 100644
--- a/lib/Target/X86/Utils/X86ShuffleDecode.h
+++ b/lib/Target/X86/Utils/X86ShuffleDecode.h
@@ -100,6 +100,14 @@ void DecodeZeroMoveLowMask(MVT VT, SmallVectorImpl<int> &ShuffleMask);
/// \brief Decode a scalar float move instruction as a shuffle mask.
void DecodeScalarMoveMask(MVT VT, bool IsLoad,
SmallVectorImpl<int> &ShuffleMask);
+
+/// \brief Decode a SSE4A EXTRQ instruction as a v16i8 shuffle mask.
+void DecodeEXTRQIMask(int Len, int Idx,
+ SmallVectorImpl<int> &ShuffleMask);
+
+/// \brief Decode a SSE4A INSERTQ instruction as a v16i8 shuffle mask.
+void DecodeINSERTQIMask(int Len, int Idx,
+ SmallVectorImpl<int> &ShuffleMask);
} // llvm namespace
#endif
diff --git a/lib/Target/X86/X86FastISel.cpp b/lib/Target/X86/X86FastISel.cpp
index 02645460b6a2..b4319c8bb04f 100644
--- a/lib/Target/X86/X86FastISel.cpp
+++ b/lib/Target/X86/X86FastISel.cpp
@@ -317,7 +317,7 @@ bool X86FastISel::foldX86XALUIntrinsic(X86::CondCode &CC, const Instruction *I,
}
bool X86FastISel::isTypeLegal(Type *Ty, MVT &VT, bool AllowI1) {
- EVT evt = TLI.getValueType(Ty, /*HandleUnknown=*/true);
+ EVT evt = TLI.getValueType(DL, Ty, /*HandleUnknown=*/true);
if (evt == MVT::Other || !evt.isSimple())
// Unhandled type. Halt "fast" selection and bail.
return false;
@@ -608,7 +608,7 @@ bool X86FastISel::handleConstantAddresses(const Value *V, X86AddressMode &AM) {
// Prepare for inserting code in the local-value area.
SavePoint SaveInsertPt = enterLocalValueArea();
- if (TLI.getPointerTy() == MVT::i64) {
+ if (TLI.getPointerTy(DL) == MVT::i64) {
Opc = X86::MOV64rm;
RC = &X86::GR64RegClass;
@@ -690,13 +690,14 @@ redo_gep:
case Instruction::IntToPtr:
// Look past no-op inttoptrs.
- if (TLI.getValueType(U->getOperand(0)->getType()) == TLI.getPointerTy())
+ if (TLI.getValueType(DL, U->getOperand(0)->getType()) ==
+ TLI.getPointerTy(DL))
return X86SelectAddress(U->getOperand(0), AM);
break;
case Instruction::PtrToInt:
// Look past no-op ptrtoints.
- if (TLI.getValueType(U->getType()) == TLI.getPointerTy())
+ if (TLI.getValueType(DL, U->getType()) == TLI.getPointerTy(DL))
return X86SelectAddress(U->getOperand(0), AM);
break;
@@ -866,14 +867,14 @@ bool X86FastISel::X86SelectCallAddress(const Value *V, X86AddressMode &AM) {
case Instruction::IntToPtr:
// Look past no-op inttoptrs if its operand is in the same BB.
if (InMBB &&
- TLI.getValueType(U->getOperand(0)->getType()) == TLI.getPointerTy())
+ TLI.getValueType(DL, U->getOperand(0)->getType()) ==
+ TLI.getPointerTy(DL))
return X86SelectCallAddress(U->getOperand(0), AM);
break;
case Instruction::PtrToInt:
// Look past no-op ptrtoints if its operand is in the same BB.
- if (InMBB &&
- TLI.getValueType(U->getType()) == TLI.getPointerTy())
+ if (InMBB && TLI.getValueType(DL, U->getType()) == TLI.getPointerTy(DL))
return X86SelectCallAddress(U->getOperand(0), AM);
break;
}
@@ -1000,7 +1001,7 @@ bool X86FastISel::X86SelectRet(const Instruction *I) {
if (Ret->getNumOperands() > 0) {
SmallVector<ISD::OutputArg, 4> Outs;
- GetReturnInfo(F.getReturnType(), F.getAttributes(), Outs, TLI);
+ GetReturnInfo(F.getReturnType(), F.getAttributes(), Outs, TLI, DL);
// Analyze operands of the call, assigning locations to each operand.
SmallVector<CCValAssign, 16> ValLocs;
@@ -1031,7 +1032,7 @@ bool X86FastISel::X86SelectRet(const Instruction *I) {
return false;
unsigned SrcReg = Reg + VA.getValNo();
- EVT SrcVT = TLI.getValueType(RV->getType());
+ EVT SrcVT = TLI.getValueType(DL, RV->getType());
EVT DstVT = VA.getValVT();
// Special handling for extended integers.
if (SrcVT != DstVT) {
@@ -1300,7 +1301,7 @@ bool X86FastISel::X86SelectCmp(const Instruction *I) {
}
bool X86FastISel::X86SelectZExt(const Instruction *I) {
- EVT DstVT = TLI.getValueType(I->getType());
+ EVT DstVT = TLI.getValueType(DL, I->getType());
if (!TLI.isTypeLegal(DstVT))
return false;
@@ -1309,7 +1310,7 @@ bool X86FastISel::X86SelectZExt(const Instruction *I) {
return false;
// Handle zero-extension from i1 to i8, which is common.
- MVT SrcVT = TLI.getSimpleValueType(I->getOperand(0)->getType());
+ MVT SrcVT = TLI.getSimpleValueType(DL, I->getOperand(0)->getType());
if (SrcVT.SimpleTy == MVT::i1) {
// Set the high bits to zero.
ResultReg = fastEmitZExtFromI1(MVT::i8, ResultReg, /*TODO: Kill=*/false);
@@ -1362,7 +1363,7 @@ bool X86FastISel::X86SelectBranch(const Instruction *I) {
X86::CondCode CC;
if (const CmpInst *CI = dyn_cast<CmpInst>(BI->getCondition())) {
if (CI->hasOneUse() && CI->getParent() == I->getParent()) {
- EVT VT = TLI.getValueType(CI->getOperand(0)->getType());
+ EVT VT = TLI.getValueType(DL, CI->getOperand(0)->getType());
// Try to optimize or fold the cmp.
CmpInst::Predicate Predicate = optimizeCmpPredicate(CI);
@@ -1802,7 +1803,7 @@ bool X86FastISel::X86FastEmitCMoveSelect(MVT RetVT, const Instruction *I) {
if (NeedSwap)
std::swap(CmpLHS, CmpRHS);
- EVT CmpVT = TLI.getValueType(CmpLHS->getType());
+ EVT CmpVT = TLI.getValueType(DL, CmpLHS->getType());
// Emit a compare of the LHS and RHS, setting the flags.
if (!X86FastEmitCompare(CmpLHS, CmpRHS, CmpVT, CI->getDebugLoc()))
return false;
@@ -2004,7 +2005,7 @@ bool X86FastISel::X86FastEmitPseudoSelect(MVT RetVT, const Instruction *I) {
if (NeedSwap)
std::swap(CmpLHS, CmpRHS);
- EVT CmpVT = TLI.getValueType(CmpLHS->getType());
+ EVT CmpVT = TLI.getValueType(DL, CmpLHS->getType());
if (!X86FastEmitCompare(CmpLHS, CmpRHS, CmpVT, CI->getDebugLoc()))
return false;
} else {
@@ -2166,8 +2167,8 @@ bool X86FastISel::X86SelectFPTrunc(const Instruction *I) {
}
bool X86FastISel::X86SelectTrunc(const Instruction *I) {
- EVT SrcVT = TLI.getValueType(I->getOperand(0)->getType());
- EVT DstVT = TLI.getValueType(I->getType());
+ EVT SrcVT = TLI.getValueType(DL, I->getOperand(0)->getType());
+ EVT DstVT = TLI.getValueType(DL, I->getType());
// This code only handles truncation to byte.
if (DstVT != MVT::i8 && DstVT != MVT::i1)
@@ -2416,7 +2417,7 @@ bool X86FastISel::fastLowerIntrinsicCall(const IntrinsicInst *II) {
}
case Intrinsic::stackprotector: {
// Emit code to store the stack guard onto the stack.
- EVT PtrTy = TLI.getPointerTy();
+ EVT PtrTy = TLI.getPointerTy(DL);
const Value *Op1 = II->getArgOperand(0); // The guard's value.
const AllocaInst *Slot = cast<AllocaInst>(II->getArgOperand(1));
@@ -2735,7 +2736,7 @@ bool X86FastISel::fastLowerArguments() {
if (ArgTy->isStructTy() || ArgTy->isArrayTy() || ArgTy->isVectorTy())
return false;
- EVT ArgVT = TLI.getValueType(ArgTy);
+ EVT ArgVT = TLI.getValueType(DL, ArgTy);
if (!ArgVT.isSimple()) return false;
switch (ArgVT.getSimpleVT().SimpleTy) {
default: return false;
@@ -2772,7 +2773,7 @@ bool X86FastISel::fastLowerArguments() {
unsigned GPRIdx = 0;
unsigned FPRIdx = 0;
for (auto const &Arg : F->args()) {
- MVT VT = TLI.getSimpleValueType(Arg.getType());
+ MVT VT = TLI.getSimpleValueType(DL, Arg.getType());
const TargetRegisterClass *RC = TLI.getRegClassFor(VT);
unsigned SrcReg;
switch (VT.SimpleTy) {
@@ -3108,7 +3109,7 @@ bool X86FastISel::fastLowerCall(CallLoweringInfo &CLI) {
GV->hasDefaultVisibility() && !GV->hasLocalLinkage()) {
OpFlags = X86II::MO_PLT;
} else if (Subtarget->isPICStyleStubAny() &&
- (GV->isDeclaration() || GV->isWeakForLinker()) &&
+ !GV->isStrongDefinitionForLinker() &&
(!Subtarget->getTargetTriple().isMacOSX() ||
Subtarget->getTargetTriple().isMacOSXVersionLT(10, 5))) {
// PC-relative references to external symbols should go through $stub,
@@ -3240,8 +3241,8 @@ X86FastISel::fastSelectInstruction(const Instruction *I) {
return X86SelectSIToFP(I);
case Instruction::IntToPtr: // Deliberate fall-through.
case Instruction::PtrToInt: {
- EVT SrcVT = TLI.getValueType(I->getOperand(0)->getType());
- EVT DstVT = TLI.getValueType(I->getType());
+ EVT SrcVT = TLI.getValueType(DL, I->getOperand(0)->getType());
+ EVT DstVT = TLI.getValueType(DL, I->getType());
if (DstVT.bitsGT(SrcVT))
return X86SelectZExt(I);
if (DstVT.bitsLT(SrcVT))
@@ -3384,7 +3385,7 @@ unsigned X86FastISel::X86MaterializeFP(const ConstantFP *CFP, MVT VT) {
addDirectMem(MIB, AddrReg);
MachineMemOperand *MMO = FuncInfo.MF->getMachineMemOperand(
MachinePointerInfo::getConstantPool(), MachineMemOperand::MOLoad,
- TM.getDataLayout()->getPointerSize(), Align);
+ DL.getPointerSize(), Align);
MIB->addMemOperand(*FuncInfo.MF, MMO);
return ResultReg;
}
@@ -3411,17 +3412,17 @@ unsigned X86FastISel::X86MaterializeGV(const GlobalValue *GV, MVT VT) {
unsigned ResultReg = createResultReg(TLI.getRegClassFor(VT));
if (TM.getRelocationModel() == Reloc::Static &&
- TLI.getPointerTy() == MVT::i64) {
+ TLI.getPointerTy(DL) == MVT::i64) {
// The displacement code could be more than 32 bits away so we need to use
// an instruction with a 64 bit immediate
BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(X86::MOV64ri),
ResultReg)
.addGlobalAddress(GV);
} else {
- unsigned Opc = TLI.getPointerTy() == MVT::i32
- ? (Subtarget->isTarget64BitILP32()
- ? X86::LEA64_32r : X86::LEA32r)
- : X86::LEA64r;
+ unsigned Opc =
+ TLI.getPointerTy(DL) == MVT::i32
+ ? (Subtarget->isTarget64BitILP32() ? X86::LEA64_32r : X86::LEA32r)
+ : X86::LEA64r;
addFullAddress(BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
TII.get(Opc), ResultReg), AM);
}
@@ -3431,7 +3432,7 @@ unsigned X86FastISel::X86MaterializeGV(const GlobalValue *GV, MVT VT) {
}
unsigned X86FastISel::fastMaterializeConstant(const Constant *C) {
- EVT CEVT = TLI.getValueType(C->getType(), true);
+ EVT CEVT = TLI.getValueType(DL, C->getType(), true);
// Only handle simple types.
if (!CEVT.isSimple())
@@ -3463,11 +3464,11 @@ unsigned X86FastISel::fastMaterializeAlloca(const AllocaInst *C) {
X86AddressMode AM;
if (!X86SelectAddress(C, AM))
return 0;
- unsigned Opc = TLI.getPointerTy() == MVT::i32
- ? (Subtarget->isTarget64BitILP32()
- ? X86::LEA64_32r : X86::LEA32r)
- : X86::LEA64r;
- const TargetRegisterClass* RC = TLI.getRegClassFor(TLI.getPointerTy());
+ unsigned Opc =
+ TLI.getPointerTy(DL) == MVT::i32
+ ? (Subtarget->isTarget64BitILP32() ? X86::LEA64_32r : X86::LEA32r)
+ : X86::LEA64r;
+ const TargetRegisterClass *RC = TLI.getRegClassFor(TLI.getPointerTy(DL));
unsigned ResultReg = createResultReg(RC);
addFullAddress(BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
TII.get(Opc), ResultReg), AM);
diff --git a/lib/Target/X86/X86FloatingPoint.cpp b/lib/Target/X86/X86FloatingPoint.cpp
index 40b9c8a863a3..36a8cdbab55b 100644
--- a/lib/Target/X86/X86FloatingPoint.cpp
+++ b/lib/Target/X86/X86FloatingPoint.cpp
@@ -301,8 +301,9 @@ bool FPS::runOnMachineFunction(MachineFunction &MF) {
bool FPIsUsed = false;
static_assert(X86::FP6 == X86::FP0+6, "Register enums aren't sorted right!");
+ const MachineRegisterInfo &MRI = MF.getRegInfo();
for (unsigned i = 0; i <= 6; ++i)
- if (MF.getRegInfo().isPhysRegUsed(X86::FP0+i)) {
+ if (!MRI.reg_nodbg_empty(X86::FP0 + i)) {
FPIsUsed = true;
break;
}
diff --git a/lib/Target/X86/X86FrameLowering.cpp b/lib/Target/X86/X86FrameLowering.cpp
index 85c5b6499131..2a35c4cf31f3 100644
--- a/lib/Target/X86/X86FrameLowering.cpp
+++ b/lib/Target/X86/X86FrameLowering.cpp
@@ -90,7 +90,7 @@ bool X86FrameLowering::hasFP(const MachineFunction &MF) const {
return (MF.getTarget().Options.DisableFramePointerElim(MF) ||
TRI->needsStackRealignment(MF) ||
MFI->hasVarSizedObjects() ||
- MFI->isFrameAddressTaken() || MFI->hasInlineAsmWithSPAdjust() ||
+ MFI->isFrameAddressTaken() || MFI->hasOpaqueSPAdjustment() ||
MF.getInfo<X86MachineFunctionInfo>()->getForceFramePointer() ||
MMI.callsUnwindInit() || MMI.callsEHReturn() ||
MFI->hasStackMap() || MFI->hasPatchPoint());
@@ -967,13 +967,26 @@ void X86FrameLowering::emitPrologue(MachineFunction &MF,
.addReg(StackPtr)
.setMIFlag(MachineInstr::FrameSetup);
if (X86FI->getRestoreBasePointer()) {
- // Stash value of base pointer. Saving RSP instead of EBP shortens dependence chain.
+      // Stash value of base pointer. Saving RSP instead of EBP shortens the
+      // dependence chain. Used by SjLj EH.
unsigned Opm = Uses64BitFramePtr ? X86::MOV64mr : X86::MOV32mr;
addRegOffset(BuildMI(MBB, MBBI, DL, TII.get(Opm)),
FramePtr, true, X86FI->getRestoreBasePointerOffset())
.addReg(StackPtr)
.setMIFlag(MachineInstr::FrameSetup);
}
+
+ if (X86FI->getHasSEHFramePtrSave()) {
+ // Stash the value of the frame pointer relative to the base pointer for
+      // Win32 EH, which does the inverse of the above: it recovers the frame
+      // pointer from the base pointer rather than the other way around.
+ unsigned Opm = Uses64BitFramePtr ? X86::MOV64mr : X86::MOV32mr;
+ addRegOffset(BuildMI(MBB, MBBI, DL, TII.get(Opm)), BasePtr, true,
+ getFrameIndexOffset(MF, X86FI->getSEHFramePtrSaveIndex()))
+ .addReg(FramePtr)
+ .setMIFlag(MachineInstr::FrameSetup);
+ }
}
if (((!HasFP && NumBytes) || PushedRegs) && NeedsDwarfCFI) {
@@ -1412,9 +1425,11 @@ bool X86FrameLowering::restoreCalleeSavedRegisters(MachineBasicBlock &MBB,
return true;
}
-void
-X86FrameLowering::processFunctionBeforeCalleeSavedScan(MachineFunction &MF,
- RegScavenger *RS) const {
+void X86FrameLowering::determineCalleeSaves(MachineFunction &MF,
+ BitVector &SavedRegs,
+ RegScavenger *RS) const {
+ TargetFrameLowering::determineCalleeSaves(MF, SavedRegs, RS);
+
MachineFrameInfo *MFI = MF.getFrameInfo();
X86MachineFunctionInfo *X86FI = MF.getInfo<X86MachineFunctionInfo>();
@@ -1436,7 +1451,7 @@ X86FrameLowering::processFunctionBeforeCalleeSavedScan(MachineFunction &MF,
// Spill the BasePtr if it's used.
if (TRI->hasBasePointer(MF))
- MF.getRegInfo().setPhysRegUsed(TRI->getBaseRegister());
+ SavedRegs.set(TRI->getBaseRegister());
}
static bool
@@ -1667,8 +1682,6 @@ void X86FrameLowering::adjustForSegmentedStacks(
.addImm(StackSize);
BuildMI(allocMBB, DL, TII.get(MOVri), Reg11)
.addImm(X86FI->getArgumentStackSize());
- MF.getRegInfo().setPhysRegUsed(Reg10);
- MF.getRegInfo().setPhysRegUsed(Reg11);
} else {
BuildMI(allocMBB, DL, TII.get(X86::PUSHi32))
.addImm(X86FI->getArgumentStackSize());
diff --git a/lib/Target/X86/X86FrameLowering.h b/lib/Target/X86/X86FrameLowering.h
index c274c8820149..495cfcd1c3f7 100644
--- a/lib/Target/X86/X86FrameLowering.h
+++ b/lib/Target/X86/X86FrameLowering.h
@@ -68,8 +68,8 @@ public:
void adjustForHiPEPrologue(MachineFunction &MF,
MachineBasicBlock &PrologueMBB) const override;
- void processFunctionBeforeCalleeSavedScan(MachineFunction &MF,
- RegScavenger *RS = nullptr) const override;
+ void determineCalleeSaves(MachineFunction &MF, BitVector &SavedRegs,
+ RegScavenger *RS = nullptr) const override;
bool
assignCalleeSavedSpillSlots(MachineFunction &MF,
diff --git a/lib/Target/X86/X86ISelDAGToDAG.cpp b/lib/Target/X86/X86ISelDAGToDAG.cpp
index 6b23e62a2d35..d5351d25d6ed 100644
--- a/lib/Target/X86/X86ISelDAGToDAG.cpp
+++ b/lib/Target/X86/X86ISelDAGToDAG.cpp
@@ -246,8 +246,9 @@ namespace {
SDValue &Index, SDValue &Disp,
SDValue &Segment) {
Base = (AM.BaseType == X86ISelAddressMode::FrameIndexBase)
- ? CurDAG->getTargetFrameIndex(AM.Base_FrameIndex,
- TLI->getPointerTy())
+ ? CurDAG->getTargetFrameIndex(
+ AM.Base_FrameIndex,
+ TLI->getPointerTy(CurDAG->getDataLayout()))
: AM.Base_Reg;
Scale = getI8Imm(AM.Scale, DL);
Index = AM.IndexReg;
@@ -581,11 +582,12 @@ void X86DAGToDAGISel::PreprocessISelDAG() {
void X86DAGToDAGISel::EmitSpecialCodeForMain() {
if (Subtarget->isTargetCygMing()) {
TargetLowering::ArgListTy Args;
+ auto &DL = CurDAG->getDataLayout();
TargetLowering::CallLoweringInfo CLI(*CurDAG);
CLI.setChain(CurDAG->getRoot())
.setCallee(CallingConv::C, Type::getVoidTy(*CurDAG->getContext()),
- CurDAG->getExternalSymbol("__main", TLI->getPointerTy()),
+ CurDAG->getExternalSymbol("__main", TLI->getPointerTy(DL)),
std::move(Args), 0);
const TargetLowering &TLI = CurDAG->getTargetLoweringInfo();
std::pair<SDValue, SDValue> Result = TLI.LowerCallTo(CLI);
@@ -1025,7 +1027,7 @@ bool X86DAGToDAGISel::MatchAddressRecursively(SDValue N, X86ISelAddressMode &AM,
switch (N.getOpcode()) {
default: break;
- case ISD::FRAME_ALLOC_RECOVER: {
+ case ISD::LOCAL_RECOVER: {
if (!AM.hasSymbolicDisplacement() && AM.Disp == 0)
if (const auto *ESNode = dyn_cast<MCSymbolSDNode>(N.getOperand(0))) {
// Use the symbol and don't prefix it.
@@ -1638,7 +1640,8 @@ bool X86DAGToDAGISel::TryFoldLoad(SDNode *P, SDValue N,
///
SDNode *X86DAGToDAGISel::getGlobalBaseReg() {
unsigned GlobalBaseReg = getInstrInfo()->getGlobalBaseReg(MF);
- return CurDAG->getRegister(GlobalBaseReg, TLI->getPointerTy()).getNode();
+ auto &DL = MF->getDataLayout();
+ return CurDAG->getRegister(GlobalBaseReg, TLI->getPointerTy(DL)).getNode();
}
/// Atomic opcode table
diff --git a/lib/Target/X86/X86ISelLowering.cpp b/lib/Target/X86/X86ISelLowering.cpp
index b16bd18aefaa..6e22ab30057c 100644
--- a/lib/Target/X86/X86ISelLowering.cpp
+++ b/lib/Target/X86/X86ISelLowering.cpp
@@ -76,7 +76,7 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
: TargetLowering(TM), Subtarget(&STI) {
X86ScalarSSEf64 = Subtarget->hasSSE2();
X86ScalarSSEf32 = Subtarget->hasSSE1();
- TD = getDataLayout();
+ TD = TM.getDataLayout();
// Set up the TargetLowering object.
static const MVT IntVTs[] = { MVT::i8, MVT::i16, MVT::i32, MVT::i64 };
@@ -505,7 +505,7 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
setOperationAction(ISD::STACKSAVE, MVT::Other, Expand);
setOperationAction(ISD::STACKRESTORE, MVT::Other, Expand);
- setOperationAction(ISD::DYNAMIC_STACKALLOC, getPointerTy(), Custom);
+ setOperationAction(ISD::DYNAMIC_STACKALLOC, getPointerTy(*TD), Custom);
// GC_TRANSITION_START and GC_TRANSITION_END need custom lowering.
setOperationAction(ISD::GC_TRANSITION_START, MVT::Other, Custom);
@@ -825,6 +825,11 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
setOperationAction(ISD::FNEG, MVT::v2f64, Custom);
setOperationAction(ISD::FABS, MVT::v2f64, Custom);
+ setOperationAction(ISD::SMAX, MVT::v8i16, Legal);
+ setOperationAction(ISD::UMAX, MVT::v16i8, Legal);
+ setOperationAction(ISD::SMIN, MVT::v8i16, Legal);
+ setOperationAction(ISD::UMIN, MVT::v16i8, Legal);
+
setOperationAction(ISD::SETCC, MVT::v2i64, Custom);
setOperationAction(ISD::SETCC, MVT::v16i8, Custom);
setOperationAction(ISD::SETCC, MVT::v8i16, Custom);
@@ -944,6 +949,15 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
setOperationAction(ISD::FNEARBYINT, RoundedTy, Legal);
}
+ setOperationAction(ISD::SMAX, MVT::v16i8, Legal);
+ setOperationAction(ISD::SMAX, MVT::v4i32, Legal);
+ setOperationAction(ISD::UMAX, MVT::v8i16, Legal);
+ setOperationAction(ISD::UMAX, MVT::v4i32, Legal);
+ setOperationAction(ISD::SMIN, MVT::v16i8, Legal);
+ setOperationAction(ISD::SMIN, MVT::v4i32, Legal);
+ setOperationAction(ISD::UMIN, MVT::v8i16, Legal);
+ setOperationAction(ISD::UMIN, MVT::v4i32, Legal);
+
// FIXME: Do we need to handle scalar-to-vector here?
setOperationAction(ISD::MUL, MVT::v4i32, Legal);
@@ -1018,6 +1032,7 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
setOperationAction(ISD::SHL, MVT::v2i64, Custom);
setOperationAction(ISD::SHL, MVT::v4i32, Custom);
+ setOperationAction(ISD::SRA, MVT::v2i64, Custom);
setOperationAction(ISD::SRA, MVT::v4i32, Custom);
}
@@ -1141,6 +1156,19 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
setOperationAction(ISD::MULHU, MVT::v16i16, Legal);
setOperationAction(ISD::MULHS, MVT::v16i16, Legal);
+ setOperationAction(ISD::SMAX, MVT::v32i8, Legal);
+ setOperationAction(ISD::SMAX, MVT::v16i16, Legal);
+ setOperationAction(ISD::SMAX, MVT::v8i32, Legal);
+ setOperationAction(ISD::UMAX, MVT::v32i8, Legal);
+ setOperationAction(ISD::UMAX, MVT::v16i16, Legal);
+ setOperationAction(ISD::UMAX, MVT::v8i32, Legal);
+ setOperationAction(ISD::SMIN, MVT::v32i8, Legal);
+ setOperationAction(ISD::SMIN, MVT::v16i16, Legal);
+ setOperationAction(ISD::SMIN, MVT::v8i32, Legal);
+ setOperationAction(ISD::UMIN, MVT::v32i8, Legal);
+ setOperationAction(ISD::UMIN, MVT::v16i16, Legal);
+ setOperationAction(ISD::UMIN, MVT::v8i32, Legal);
+
// The custom lowering for UINT_TO_FP for v8i32 becomes interesting
// when we have a 256bit-wide blend with immediate.
setOperationAction(ISD::UINT_TO_FP, MVT::v8i32, Custom);
@@ -1184,6 +1212,7 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
setOperationAction(ISD::SHL, MVT::v4i64, Custom);
setOperationAction(ISD::SHL, MVT::v8i32, Custom);
+ setOperationAction(ISD::SRA, MVT::v4i64, Custom);
setOperationAction(ISD::SRA, MVT::v8i32, Custom);
// Custom lower several nodes for 256-bit types.
@@ -1376,6 +1405,15 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
setOperationAction(ISD::SELECT, MVT::v16i1, Custom);
setOperationAction(ISD::SELECT, MVT::v8i1, Custom);
+ setOperationAction(ISD::SMAX, MVT::v16i32, Legal);
+ setOperationAction(ISD::SMAX, MVT::v8i64, Legal);
+ setOperationAction(ISD::UMAX, MVT::v16i32, Legal);
+ setOperationAction(ISD::UMAX, MVT::v8i64, Legal);
+ setOperationAction(ISD::SMIN, MVT::v16i32, Legal);
+ setOperationAction(ISD::SMIN, MVT::v8i64, Legal);
+ setOperationAction(ISD::UMIN, MVT::v16i32, Legal);
+ setOperationAction(ISD::UMIN, MVT::v8i64, Legal);
+
setOperationAction(ISD::ADD, MVT::v8i64, Legal);
setOperationAction(ISD::ADD, MVT::v16i32, Legal);
@@ -1473,6 +1511,8 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
setOperationAction(ISD::SUB, MVT::v32i16, Legal);
setOperationAction(ISD::SUB, MVT::v64i8, Legal);
setOperationAction(ISD::MUL, MVT::v32i16, Legal);
+ setOperationAction(ISD::MULHS, MVT::v32i16, Legal);
+ setOperationAction(ISD::MULHU, MVT::v32i16, Legal);
setOperationAction(ISD::CONCAT_VECTORS, MVT::v32i1, Custom);
setOperationAction(ISD::CONCAT_VECTORS, MVT::v64i1, Custom);
setOperationAction(ISD::INSERT_SUBVECTOR, MVT::v32i1, Custom);
@@ -1492,6 +1532,15 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
setOperationAction(ISD::TRUNCATE, MVT::v32i1, Custom);
setOperationAction(ISD::TRUNCATE, MVT::v64i1, Custom);
+ setOperationAction(ISD::SMAX, MVT::v64i8, Legal);
+ setOperationAction(ISD::SMAX, MVT::v32i16, Legal);
+ setOperationAction(ISD::UMAX, MVT::v64i8, Legal);
+ setOperationAction(ISD::UMAX, MVT::v32i16, Legal);
+ setOperationAction(ISD::SMIN, MVT::v64i8, Legal);
+ setOperationAction(ISD::SMIN, MVT::v32i16, Legal);
+ setOperationAction(ISD::UMIN, MVT::v64i8, Legal);
+ setOperationAction(ISD::UMIN, MVT::v32i16, Legal);
+
for (int i = MVT::v32i8; i != MVT::v8i64; ++i) {
const MVT VT = (MVT::SimpleValueType)i;
@@ -1531,6 +1580,15 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
setOperationAction(ISD::XOR, MVT::v4i32, Legal);
setOperationAction(ISD::SRA, MVT::v2i64, Custom);
setOperationAction(ISD::SRA, MVT::v4i64, Custom);
+
+ setOperationAction(ISD::SMAX, MVT::v2i64, Legal);
+ setOperationAction(ISD::SMAX, MVT::v4i64, Legal);
+ setOperationAction(ISD::UMAX, MVT::v2i64, Legal);
+ setOperationAction(ISD::UMAX, MVT::v4i64, Legal);
+ setOperationAction(ISD::SMIN, MVT::v2i64, Legal);
+ setOperationAction(ISD::SMIN, MVT::v4i64, Legal);
+ setOperationAction(ISD::UMIN, MVT::v2i64, Legal);
+ setOperationAction(ISD::UMIN, MVT::v4i64, Legal);
}
// We want to custom lower some of our intrinsics.
@@ -1611,6 +1669,7 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
setTargetDAGCombine(ISD::SIGN_EXTEND);
setTargetDAGCombine(ISD::SIGN_EXTEND_INREG);
setTargetDAGCombine(ISD::SINT_TO_FP);
+ setTargetDAGCombine(ISD::UINT_TO_FP);
setTargetDAGCombine(ISD::SETCC);
setTargetDAGCombine(ISD::INTRINSIC_WO_CHAIN);
setTargetDAGCombine(ISD::BUILD_VECTOR);
@@ -1652,7 +1711,8 @@ X86TargetLowering::getPreferredVectorAction(EVT VT) const {
return TargetLoweringBase::getPreferredVectorAction(VT);
}
-EVT X86TargetLowering::getSetCCResultType(LLVMContext &, EVT VT) const {
+EVT X86TargetLowering::getSetCCResultType(const DataLayout &DL, LLVMContext &,
+ EVT VT) const {
if (!VT.isVector())
return Subtarget->hasAVX512() ? MVT::i1: MVT::i8;
@@ -1724,10 +1784,11 @@ static void getMaxByValAlign(Type *Ty, unsigned &MaxAlign) {
/// function arguments in the caller parameter area. For X86, aggregates
/// that contain SSE vectors are placed at 16-byte boundaries while the rest
/// are at 4-byte boundaries.
-unsigned X86TargetLowering::getByValTypeAlignment(Type *Ty) const {
+unsigned X86TargetLowering::getByValTypeAlignment(Type *Ty,
+ const DataLayout &DL) const {
if (Subtarget->is64Bit()) {
// Max of 8 and alignment of type.
- unsigned TyAlign = TD->getABITypeAlignment(Ty);
+ unsigned TyAlign = DL.getABITypeAlignment(Ty);
if (TyAlign > 8)
return TyAlign;
return 8;
@@ -1840,7 +1901,8 @@ SDValue X86TargetLowering::getPICJumpTableRelocBase(SDValue Table,
if (!Subtarget->is64Bit())
// This doesn't have SDLoc associated with it, but is not really the
// same as a Register.
- return DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(), getPointerTy());
+ return DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(),
+ getPointerTy(DAG.getDataLayout()));
return Table;
}
@@ -2032,7 +2094,8 @@ X86TargetLowering::LowerReturn(SDValue Chain,
// false, then an sret argument may be implicitly inserted in the SelDAG. In
// either case FuncInfo->setSRetReturnReg() will have been called.
if (unsigned SRetReg = FuncInfo->getSRetReturnReg()) {
- SDValue Val = DAG.getCopyFromReg(Chain, dl, SRetReg, getPointerTy());
+ SDValue Val = DAG.getCopyFromReg(Chain, dl, SRetReg,
+ getPointerTy(MF.getDataLayout()));
unsigned RetValReg
= (Subtarget->is64Bit() && !Subtarget->isTarget64BitILP32()) ?
@@ -2041,7 +2104,8 @@ X86TargetLowering::LowerReturn(SDValue Chain,
Flag = Chain.getValue(1);
// RAX/EAX now acts like a return value.
- RetOps.push_back(DAG.getRegister(RetValReg, getPointerTy()));
+ RetOps.push_back(
+ DAG.getRegister(RetValReg, getPointerTy(DAG.getDataLayout())));
}
RetOps[0] = Chain; // Update chain.
@@ -2288,11 +2352,11 @@ X86TargetLowering::LowerMemArgument(SDValue Chain,
unsigned Bytes = Flags.getByValSize();
if (Bytes == 0) Bytes = 1; // Don't create zero-sized stack objects.
int FI = MFI->CreateFixedObject(Bytes, VA.getLocMemOffset(), isImmutable);
- return DAG.getFrameIndex(FI, getPointerTy());
+ return DAG.getFrameIndex(FI, getPointerTy(DAG.getDataLayout()));
} else {
int FI = MFI->CreateFixedObject(ValVT.getSizeInBits()/8,
VA.getLocMemOffset(), isImmutable);
- SDValue FIN = DAG.getFrameIndex(FI, getPointerTy());
+ SDValue FIN = DAG.getFrameIndex(FI, getPointerTy(DAG.getDataLayout()));
SDValue Val = DAG.getLoad(ValVT, dl, Chain, FIN,
MachinePointerInfo::getFixedStack(FI),
false, false, false, 0);
@@ -2471,7 +2535,7 @@ X86TargetLowering::LowerFormalArguments(SDValue Chain,
if (Ins[i].Flags.isSRet()) {
unsigned Reg = FuncInfo->getSRetReturnReg();
if (!Reg) {
- MVT PtrTy = getPointerTy();
+ MVT PtrTy = getPointerTy(DAG.getDataLayout());
Reg = MF.getRegInfo().createVirtualRegister(getRegClassFor(PtrTy));
FuncInfo->setSRetReturnReg(Reg);
}
@@ -2499,7 +2563,7 @@ X86TargetLowering::LowerFormalArguments(SDValue Chain,
MachineModuleInfo &MMI = MF.getMMI();
const Function *WinEHParent = nullptr;
- if (IsWin64 && MMI.hasWinEHFuncInfo(Fn))
+ if (MMI.hasWinEHFuncInfo(Fn))
WinEHParent = MMI.getWinEHParent(Fn);
bool IsWinEHOutlined = WinEHParent && WinEHParent != Fn;
bool IsWinEHParent = WinEHParent && WinEHParent == Fn;
@@ -2561,11 +2625,11 @@ X86TargetLowering::LowerFormalArguments(SDValue Chain,
// Store the integer parameter registers.
SmallVector<SDValue, 8> MemOps;
SDValue RSFIN = DAG.getFrameIndex(FuncInfo->getRegSaveFrameIndex(),
- getPointerTy());
+ getPointerTy(DAG.getDataLayout()));
unsigned Offset = FuncInfo->getVarArgsGPOffset();
for (SDValue Val : LiveGPRs) {
- SDValue FIN = DAG.getNode(ISD::ADD, dl, getPointerTy(), RSFIN,
- DAG.getIntPtrConstant(Offset, dl));
+ SDValue FIN = DAG.getNode(ISD::ADD, dl, getPointerTy(DAG.getDataLayout()),
+ RSFIN, DAG.getIntPtrConstant(Offset, dl));
SDValue Store =
DAG.getStore(Val.getValue(1), dl, Val, FIN,
MachinePointerInfo::getFixedStack(
@@ -2592,7 +2656,7 @@ X86TargetLowering::LowerFormalArguments(SDValue Chain,
if (!MemOps.empty())
Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, MemOps);
- } else if (IsWinEHOutlined) {
+ } else if (IsWin64 && IsWinEHOutlined) {
// Get to the caller-allocated home save location. Add 8 to account
// for the return address.
int HomeOffset = TFI.getOffsetOfLocalArea() + 8;
@@ -2605,8 +2669,8 @@ X86TargetLowering::LowerFormalArguments(SDValue Chain,
// Store the second integer parameter (rdx) into rsp+16 relative to the
// stack pointer at the entry of the function.
- SDValue RSFIN =
- DAG.getFrameIndex(FuncInfo->getRegSaveFrameIndex(), getPointerTy());
+ SDValue RSFIN = DAG.getFrameIndex(FuncInfo->getRegSaveFrameIndex(),
+ getPointerTy(DAG.getDataLayout()));
unsigned GPR = MF.addLiveIn(X86::RDX, &X86::GR64RegClass);
SDValue Val = DAG.getCopyFromReg(Chain, dl, GPR, MVT::i64);
Chain = DAG.getStore(
@@ -2680,14 +2744,21 @@ X86TargetLowering::LowerFormalArguments(SDValue Chain,
FuncInfo->setArgumentStackSize(StackSize);
if (IsWinEHParent) {
- int UnwindHelpFI = MFI->CreateStackObject(8, 8, /*isSS=*/false);
- SDValue StackSlot = DAG.getFrameIndex(UnwindHelpFI, MVT::i64);
- MMI.getWinEHFuncInfo(MF.getFunction()).UnwindHelpFrameIdx = UnwindHelpFI;
- SDValue Neg2 = DAG.getConstant(-2, dl, MVT::i64);
- Chain = DAG.getStore(Chain, dl, Neg2, StackSlot,
- MachinePointerInfo::getFixedStack(UnwindHelpFI),
- /*isVolatile=*/true,
- /*isNonTemporal=*/false, /*Alignment=*/0);
+ if (Is64Bit) {
+ int UnwindHelpFI = MFI->CreateStackObject(8, 8, /*isSS=*/false);
+ SDValue StackSlot = DAG.getFrameIndex(UnwindHelpFI, MVT::i64);
+ MMI.getWinEHFuncInfo(MF.getFunction()).UnwindHelpFrameIdx = UnwindHelpFI;
+ SDValue Neg2 = DAG.getConstant(-2, dl, MVT::i64);
+ Chain = DAG.getStore(Chain, dl, Neg2, StackSlot,
+ MachinePointerInfo::getFixedStack(UnwindHelpFI),
+ /*isVolatile=*/true,
+ /*isNonTemporal=*/false, /*Alignment=*/0);
+ } else {
+ // Functions using Win32 EH are considered to have opaque SP adjustments
+ // to force local variables to be addressed from the frame or base
+ // pointers.
+ MFI->setHasOpaqueSPAdjustment(true);
+ }
}
return Chain;
@@ -2701,7 +2772,8 @@ X86TargetLowering::LowerMemOpCallTo(SDValue Chain,
ISD::ArgFlagsTy Flags) const {
unsigned LocMemOffset = VA.getLocMemOffset();
SDValue PtrOff = DAG.getIntPtrConstant(LocMemOffset, dl);
- PtrOff = DAG.getNode(ISD::ADD, dl, getPointerTy(), StackPtr, PtrOff);
+ PtrOff = DAG.getNode(ISD::ADD, dl, getPointerTy(DAG.getDataLayout()),
+ StackPtr, PtrOff);
if (Flags.isByVal())
return CreateCopyOfByValArgument(Arg, PtrOff, Chain, Flags, DAG, dl);
@@ -2718,7 +2790,7 @@ X86TargetLowering::EmitTailCallLoadRetAddr(SelectionDAG &DAG,
bool IsTailCall, bool Is64Bit,
int FPDiff, SDLoc dl) const {
// Adjust the Return address stack slot.
- EVT VT = getPointerTy();
+ EVT VT = getPointerTy(DAG.getDataLayout());
OutRetAddr = getReturnAddressFrameIndex(DAG);
// Load the "old" Return address.
@@ -2942,7 +3014,7 @@ X86TargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
assert(VA.isMemLoc());
if (!StackPtr.getNode())
StackPtr = DAG.getCopyFromReg(Chain, dl, RegInfo->getStackRegister(),
- getPointerTy());
+ getPointerTy(DAG.getDataLayout()));
MemOpChains.push_back(LowerMemOpCallTo(Chain, StackPtr, Arg,
dl, DAG, VA, Flags));
}
@@ -2955,8 +3027,9 @@ X86TargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
// ELF / PIC requires GOT in the EBX register before function calls via PLT
// GOT pointer.
if (!isTailCall) {
- RegsToPass.push_back(std::make_pair(unsigned(X86::EBX),
- DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(), getPointerTy())));
+ RegsToPass.push_back(std::make_pair(
+ unsigned(X86::EBX), DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(),
+ getPointerTy(DAG.getDataLayout()))));
} else {
// If we are tail calling and generating PIC/GOT style code load the
// address of the callee into ECX. The value in ecx is used as target of
@@ -3036,16 +3109,16 @@ X86TargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
int32_t Offset = VA.getLocMemOffset()+FPDiff;
uint32_t OpSize = (VA.getLocVT().getSizeInBits()+7)/8;
FI = MF.getFrameInfo()->CreateFixedObject(OpSize, Offset, true);
- FIN = DAG.getFrameIndex(FI, getPointerTy());
+ FIN = DAG.getFrameIndex(FI, getPointerTy(DAG.getDataLayout()));
if (Flags.isByVal()) {
// Copy relative to framepointer.
SDValue Source = DAG.getIntPtrConstant(VA.getLocMemOffset(), dl);
if (!StackPtr.getNode())
- StackPtr = DAG.getCopyFromReg(Chain, dl,
- RegInfo->getStackRegister(),
- getPointerTy());
- Source = DAG.getNode(ISD::ADD, dl, getPointerTy(), StackPtr, Source);
+ StackPtr = DAG.getCopyFromReg(Chain, dl, RegInfo->getStackRegister(),
+ getPointerTy(DAG.getDataLayout()));
+ Source = DAG.getNode(ISD::ADD, dl, getPointerTy(DAG.getDataLayout()),
+ StackPtr, Source);
MemOpChains2.push_back(CreateCopyOfByValArgument(Source, FIN,
ArgChain,
@@ -3064,8 +3137,8 @@ X86TargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
// Store the return address to the appropriate stack slot.
Chain = EmitTailCallStoreRetAddr(DAG, MF, Chain, RetAddrFrIdx,
- getPointerTy(), RegInfo->getSlotSize(),
- FPDiff, dl);
+ getPointerTy(DAG.getDataLayout()),
+ RegInfo->getSlotSize(), FPDiff, dl);
}
// Build a sequence of copy-to-reg nodes chained together with token chain
@@ -3106,7 +3179,7 @@ X86TargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
GV->hasDefaultVisibility() && !GV->hasLocalLinkage()) {
OpFlags = X86II::MO_PLT;
} else if (Subtarget->isPICStyleStubAny() &&
- (GV->isDeclaration() || GV->isWeakForLinker()) &&
+ !GV->isStrongDefinitionForLinker() &&
(!Subtarget->getTargetTriple().isMacOSX() ||
Subtarget->getTargetTriple().isMacOSXVersionLT(10, 5))) {
// PC-relative references to external symbols should go through $stub,
@@ -3123,17 +3196,18 @@ X86TargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
ExtraLoad = true;
}
- Callee = DAG.getTargetGlobalAddress(GV, dl, getPointerTy(),
- G->getOffset(), OpFlags);
+ Callee = DAG.getTargetGlobalAddress(
+ GV, dl, getPointerTy(DAG.getDataLayout()), G->getOffset(), OpFlags);
// Add a wrapper if needed.
if (WrapperKind != ISD::DELETED_NODE)
- Callee = DAG.getNode(X86ISD::WrapperRIP, dl, getPointerTy(), Callee);
+ Callee = DAG.getNode(X86ISD::WrapperRIP, dl,
+ getPointerTy(DAG.getDataLayout()), Callee);
// Add extra indirection if needed.
if (ExtraLoad)
- Callee = DAG.getLoad(getPointerTy(), dl, DAG.getEntryNode(), Callee,
- MachinePointerInfo::getGOT(),
- false, false, false, 0);
+ Callee = DAG.getLoad(
+ getPointerTy(DAG.getDataLayout()), dl, DAG.getEntryNode(), Callee,
+ MachinePointerInfo::getGOT(), false, false, false, 0);
}
} else if (ExternalSymbolSDNode *S = dyn_cast<ExternalSymbolSDNode>(Callee)) {
unsigned char OpFlags = 0;
@@ -3152,8 +3226,8 @@ X86TargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
OpFlags = X86II::MO_DARWIN_STUB;
}
- Callee = DAG.getTargetExternalSymbol(S->getSymbol(), getPointerTy(),
- OpFlags);
+ Callee = DAG.getTargetExternalSymbol(
+ S->getSymbol(), getPointerTy(DAG.getDataLayout()), OpFlags);
} else if (Subtarget->isTarget64BitILP32() &&
Callee->getValueType(0) == MVT::i32) {
// Zero-extend the 32-bit Callee address into a 64-bit according to x32 ABI
@@ -3184,9 +3258,24 @@ X86TargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
RegsToPass[i].second.getValueType()));
// Add a register mask operand representing the call-preserved registers.
- const TargetRegisterInfo *TRI = Subtarget->getRegisterInfo();
- const uint32_t *Mask = TRI->getCallPreservedMask(MF, CallConv);
+ const uint32_t *Mask = RegInfo->getCallPreservedMask(MF, CallConv);
assert(Mask && "Missing call preserved mask for calling convention");
+
+ // If this is an invoke in a 32-bit function using an MSVC personality, assume
+ // the function clobbers all registers. If an exception is thrown, the runtime
+ // will not restore CSRs.
+ // FIXME: Model this more precisely so that we can register allocate across
+ // the normal edge and spill and fill across the exceptional edge.
+ if (!Is64Bit && CLI.CS && CLI.CS->isInvoke()) {
+ const Function *CallerFn = MF.getFunction();
+ EHPersonality Pers =
+ CallerFn->hasPersonalityFn()
+ ? classifyEHPersonality(CallerFn->getPersonalityFn())
+ : EHPersonality::Unknown;
+ if (isMSVCEHPersonality(Pers))
+ Mask = RegInfo->getNoPreservedMask();
+ }
+
Ops.push_back(DAG.getRegisterMask(Mask));
if (InFlag.getNode())
@@ -3650,7 +3739,7 @@ SDValue X86TargetLowering::getReturnAddressFrameIndex(SelectionDAG &DAG) const {
FuncInfo->setRAIndex(ReturnAddrIndex);
}
- return DAG.getFrameIndex(ReturnAddrIndex, getPointerTy());
+ return DAG.getFrameIndex(ReturnAddrIndex, getPointerTy(DAG.getDataLayout()));
}
bool X86::isOffsetSuitableForCodeModel(int64_t Offset, CodeModel::Model M,
@@ -3881,6 +3970,15 @@ bool X86TargetLowering::isCheapToSpeculateCtlz() const {
return Subtarget->hasLZCNT();
}
+/// isUndefInRange - Return true if every element in Mask, beginning
+/// at position Pos and ending at Pos+Size (exclusive), is undef.
+static bool isUndefInRange(ArrayRef<int> Mask, unsigned Pos, unsigned Size) {
+ for (unsigned i = Pos, e = Pos + Size; i != e; ++i)
+ if (0 <= Mask[i])
+ return false;
+ return true;
+}
+
/// isUndefOrInRange - Return true if Val is undef or if its value falls within
/// the specified range [Low, Hi).
static bool isUndefOrInRange(int Val, int Low, int Hi) {
@@ -4322,6 +4420,7 @@ static SDValue getShuffleVectorZeroOrUndef(SDValue V2, unsigned Idx,
/// IsUnary to true if it only uses one source. Note that this will set IsUnary for
/// shuffles which use a single input multiple times, and in those cases it will
/// adjust the mask to only have indices within that single input.
+/// FIXME: Add support for Decode*Mask functions that return SM_SentinelZero.
static bool getTargetShuffleMask(SDNode *N, MVT VT,
SmallVectorImpl<int> &Mask, bool &IsUnary) {
unsigned NumElems = VT.getVectorNumElements();
@@ -4451,6 +4550,10 @@ static bool getTargetShuffleMask(SDNode *N, MVT VT,
ImmN = N->getOperand(N->getNumOperands()-1);
DecodeVPERM2X128Mask(VT, cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask);
if (Mask.empty()) return false;
+ // Mask only contains negative index if an element is zero.
+ if (std::any_of(Mask.begin(), Mask.end(),
+ [](int M){ return M == SM_SentinelZero; }))
+ return false;
break;
case X86ISD::MOVSLDUP:
DecodeMOVSLDUPMask(VT, Mask);
@@ -4764,7 +4867,7 @@ static SDValue getVShift(bool isLeft, EVT VT, SDValue SrcOp,
MVT ShVT = MVT::v2i64;
unsigned Opc = isLeft ? X86ISD::VSHLDQ : X86ISD::VSRLDQ;
SrcOp = DAG.getBitcast(ShVT, SrcOp);
- MVT ScalarShiftTy = TLI.getScalarShiftAmountTy(SrcOp.getValueType());
+ MVT ScalarShiftTy = TLI.getScalarShiftAmountTy(DAG.getDataLayout(), VT);
assert(NumBits % 8 == 0 && "Only support byte sized shifts");
SDValue ShiftVal = DAG.getConstant(NumBits/8, dl, ScalarShiftTy);
return DAG.getBitcast(VT, DAG.getNode(Opc, dl, ShVT, SrcOp, ShiftVal));
@@ -5082,7 +5185,8 @@ static SDValue LowerVectorBroadcast(SDValue Op, const X86Subtarget* Subtarget,
assert(C && "Invalid constant type");
const TargetLowering &TLI = DAG.getTargetLoweringInfo();
- SDValue CP = DAG.getConstantPool(C, TLI.getPointerTy());
+ SDValue CP =
+ DAG.getConstantPool(C, TLI.getPointerTy(DAG.getDataLayout()));
unsigned Alignment = cast<ConstantPoolSDNode>(CP)->getAlignment();
Ld = DAG.getLoad(CVT, dl, DAG.getEntryNode(), CP,
MachinePointerInfo::getConstantPool(),
@@ -6857,6 +6961,136 @@ static SDValue lowerVectorShuffleAsShift(SDLoc DL, MVT VT, SDValue V1,
return SDValue();
}
+/// \brief Try to lower a vector shuffle using SSE4a EXTRQ/INSERTQ.
+static SDValue lowerVectorShuffleWithSSE4A(SDLoc DL, MVT VT, SDValue V1,
+ SDValue V2, ArrayRef<int> Mask,
+ SelectionDAG &DAG) {
+ SmallBitVector Zeroable = computeZeroableShuffleElements(Mask, V1, V2);
+ assert(!Zeroable.all() && "Fully zeroable shuffle mask");
+
+ int Size = Mask.size();
+ int HalfSize = Size / 2;
+ assert(Size == (int)VT.getVectorNumElements() && "Unexpected mask size");
+
+ // Upper half must be undefined.
+ if (!isUndefInRange(Mask, HalfSize, HalfSize))
+ return SDValue();
+
+ // EXTRQ: Extract Len elements from lower half of source, starting at Idx.
+ // Remainder of lower half result is zero and upper half is all undef.
+ auto LowerAsEXTRQ = [&]() {
+ // Determine the extraction length from the part of the
+ // lower half that isn't zeroable.
+ int Len = HalfSize;
+ for (; Len >= 0; --Len)
+ if (!Zeroable[Len - 1])
+ break;
+ assert(Len > 0 && "Zeroable shuffle mask");
+
+ // Attempt to match first Len sequential elements from the lower half.
+ SDValue Src;
+ int Idx = -1;
+ for (int i = 0; i != Len; ++i) {
+ int M = Mask[i];
+ if (M < 0)
+ continue;
+ SDValue &V = (M < Size ? V1 : V2);
+ M = M % Size;
+
+ // All mask elements must be in the lower half.
+ if (M > HalfSize)
+ return SDValue();
+
+ if (Idx < 0 || (Src == V && Idx == (M - i))) {
+ Src = V;
+ Idx = M - i;
+ continue;
+ }
+ return SDValue();
+ }
+
+ if (Idx < 0)
+ return SDValue();
+
+ assert((Idx + Len) <= HalfSize && "Illegal extraction mask");
+ int BitLen = (Len * VT.getScalarSizeInBits()) & 0x3f;
+ int BitIdx = (Idx * VT.getScalarSizeInBits()) & 0x3f;
+ return DAG.getNode(X86ISD::EXTRQI, DL, VT, Src,
+ DAG.getConstant(BitLen, DL, MVT::i8),
+ DAG.getConstant(BitIdx, DL, MVT::i8));
+ };
+
+ if (SDValue ExtrQ = LowerAsEXTRQ())
+ return ExtrQ;
+
+ // INSERTQ: Extract lowest Len elements from lower half of second source and
+ // insert over first source, starting at Idx.
+ // { A[0], .., A[Idx-1], B[0], .., B[Len-1], A[Idx+Len], .., UNDEF, ... }
+ auto LowerAsInsertQ = [&]() {
+ for (int Idx = 0; Idx != HalfSize; ++Idx) {
+ SDValue Base;
+
+ // Attempt to match first source from mask before insertion point.
+ if (isUndefInRange(Mask, 0, Idx)) {
+ /* EMPTY */
+ } else if (isSequentialOrUndefInRange(Mask, 0, Idx, 0)) {
+ Base = V1;
+ } else if (isSequentialOrUndefInRange(Mask, 0, Idx, Size)) {
+ Base = V2;
+ } else {
+ continue;
+ }
+
+ // Extend the extraction length looking to match both the insertion of
+ // the second source and the remaining elements of the first.
+ for (int Hi = Idx + 1; Hi <= HalfSize; ++Hi) {
+ SDValue Insert;
+ int Len = Hi - Idx;
+
+ // Match insertion.
+ if (isSequentialOrUndefInRange(Mask, Idx, Len, 0)) {
+ Insert = V1;
+ } else if (isSequentialOrUndefInRange(Mask, Idx, Len, Size)) {
+ Insert = V2;
+ } else {
+ continue;
+ }
+
+ // Match the remaining elements of the lower half.
+ if (isUndefInRange(Mask, Hi, HalfSize - Hi)) {
+ /* EMPTY */
+ } else if ((!Base || (Base == V1)) &&
+ isSequentialOrUndefInRange(Mask, Hi, HalfSize - Hi, Hi)) {
+ Base = V1;
+ } else if ((!Base || (Base == V2)) &&
+ isSequentialOrUndefInRange(Mask, Hi, HalfSize - Hi,
+ Size + Hi)) {
+ Base = V2;
+ } else {
+ continue;
+ }
+
+ // We may not have a base (first source) - this can safely be undefined.
+ if (!Base)
+ Base = DAG.getUNDEF(VT);
+
+ int BitLen = (Len * VT.getScalarSizeInBits()) & 0x3f;
+ int BitIdx = (Idx * VT.getScalarSizeInBits()) & 0x3f;
+ return DAG.getNode(X86ISD::INSERTQI, DL, VT, Base, Insert,
+ DAG.getConstant(BitLen, DL, MVT::i8),
+ DAG.getConstant(BitIdx, DL, MVT::i8));
+ }
+ }
+
+ return SDValue();
+ };
+
+ if (SDValue InsertQ = LowerAsInsertQ())
+ return InsertQ;
+
+ return SDValue();
+}
+
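(Illustrative sketch, not part of the patch: how the BitLen/BitIdx immediates
computed above encode a hypothetical v8i16 extraction of two sequential
elements starting at index 3. Only standard C++ is assumed.)

#include <cstdio>

int main() {
  int ScalarSizeInBits = 16; // v8i16 element width
  int Len = 2, Idx = 3;      // extract 2 sequential elements starting at 3
  int BitLen = (Len * ScalarSizeInBits) & 0x3f; // 32: field length in bits
  int BitIdx = (Idx * ScalarSizeInBits) & 0x3f; // 48: field start in bits
  printf("EXTRQI imm: BitLen=%d BitIdx=%d\n", BitLen, BitIdx);
  return 0;
}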
/// \brief Lower a vector shuffle as a zero or any extension.
///
/// Given a specific number of elements, element bit width, and extension
@@ -6864,7 +7098,7 @@ static SDValue lowerVectorShuffleAsShift(SDLoc DL, MVT VT, SDValue V1,
/// features of the subtarget.
static SDValue lowerVectorShuffleAsSpecificZeroOrAnyExtend(
SDLoc DL, MVT VT, int Scale, bool AnyExt, SDValue InputV,
- const X86Subtarget *Subtarget, SelectionDAG &DAG) {
+ ArrayRef<int> Mask, const X86Subtarget *Subtarget, SelectionDAG &DAG) {
assert(Scale > 1 && "Need a scale to extend.");
int NumElements = VT.getVectorNumElements();
int EltBits = VT.getScalarSizeInBits();
@@ -6901,6 +7135,28 @@ static SDValue lowerVectorShuffleAsSpecificZeroOrAnyExtend(
getV4X86ShuffleImm8ForMask(PSHUFHWMask, DL, DAG)));
}
+ // The SSE4A EXTRQ instruction can efficiently extend the first 2 lanes
+ // to 64-bits.
+ if ((Scale * EltBits) == 64 && EltBits < 32 && Subtarget->hasSSE4A()) {
+ assert(NumElements == (int)Mask.size() && "Unexpected shuffle mask size!");
+ assert(VT.getSizeInBits() == 128 && "Unexpected vector width!");
+
+ SDValue Lo = DAG.getNode(ISD::BITCAST, DL, MVT::v2i64,
+ DAG.getNode(X86ISD::EXTRQI, DL, VT, InputV,
+ DAG.getConstant(EltBits, DL, MVT::i8),
+ DAG.getConstant(0, DL, MVT::i8)));
+ if (isUndefInRange(Mask, NumElements/2, NumElements/2))
+ return DAG.getNode(ISD::BITCAST, DL, VT, Lo);
+
+ SDValue Hi =
+ DAG.getNode(ISD::BITCAST, DL, MVT::v2i64,
+ DAG.getNode(X86ISD::EXTRQI, DL, VT, InputV,
+ DAG.getConstant(EltBits, DL, MVT::i8),
+ DAG.getConstant(EltBits, DL, MVT::i8)));
+ return DAG.getNode(ISD::BITCAST, DL, VT,
+ DAG.getNode(X86ISD::UNPCKL, DL, MVT::v2i64, Lo, Hi));
+ }
+
// If this would require more than 2 unpack instructions to expand, use
// pshufb when available. We can only use more than 2 unpack instructions
// when zero extending i8 elements which also makes it easier to use pshufb.
@@ -6991,7 +7247,7 @@ static SDValue lowerVectorShuffleAsZeroOrAnyExtend(
return SDValue();
return lowerVectorShuffleAsSpecificZeroOrAnyExtend(
- DL, VT, Scale, AnyExt, InputV, Subtarget, DAG);
+ DL, VT, Scale, AnyExt, InputV, Mask, Subtarget, DAG);
};
// The widest scale possible for extending is to a 64-bit integer.
@@ -7166,9 +7422,9 @@ static SDValue lowerVectorShuffleAsElementInsertion(
V2 = DAG.getBitcast(MVT::v2i64, V2);
V2 = DAG.getNode(
X86ISD::VSHLDQ, DL, MVT::v2i64, V2,
- DAG.getConstant(
- V2Index * EltVT.getSizeInBits()/8, DL,
- DAG.getTargetLoweringInfo().getScalarShiftAmountTy(MVT::v2i64)));
+ DAG.getConstant(V2Index * EltVT.getSizeInBits() / 8, DL,
+ DAG.getTargetLoweringInfo().getScalarShiftAmountTy(
+ DAG.getDataLayout(), VT)));
V2 = DAG.getBitcast(VT, V2);
}
}
@@ -8518,6 +8774,11 @@ static SDValue lowerV8I16VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
lowerVectorShuffleAsShift(DL, MVT::v8i16, V1, V2, Mask, DAG))
return Shift;
+ // See if we can use SSE4A Extraction / Insertion.
+ if (Subtarget->hasSSE4A())
+ if (SDValue V = lowerVectorShuffleWithSSE4A(DL, MVT::v8i16, V1, V2, Mask, DAG))
+ return V;
+
// There are special ways we can lower some single-element blends.
if (NumV2Inputs == 1)
if (SDValue V = lowerVectorShuffleAsElementInsertion(DL, MVT::v8i16, V1, V2,
@@ -8670,6 +8931,11 @@ static SDValue lowerV16I8VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
DL, MVT::v16i8, V1, V2, Mask, Subtarget, DAG))
return ZExt;
+ // See if we can use SSE4A Extraction / Insertion.
+ if (Subtarget->hasSSE4A())
+ if (SDValue V = lowerVectorShuffleWithSSE4A(DL, MVT::v16i8, V1, V2, Mask, DAG))
+ return V;
+
int NumV2Elements =
std::count_if(Mask.begin(), Mask.end(), [](int M) { return M >= 16; });
@@ -10613,12 +10879,13 @@ X86TargetLowering::LowerEXTRACT_VECTOR_ELT(SDValue Op,
MaskEltVT.getSizeInBits());
Idx = DAG.getZExtOrTrunc(Idx, dl, MaskEltVT);
+ auto PtrVT = getPointerTy(DAG.getDataLayout());
SDValue Mask = DAG.getNode(X86ISD::VINSERT, dl, MaskVT,
- getZeroVector(MaskVT, Subtarget, DAG, dl),
- Idx, DAG.getConstant(0, dl, getPointerTy()));
+ getZeroVector(MaskVT, Subtarget, DAG, dl), Idx,
+ DAG.getConstant(0, dl, PtrVT));
SDValue Perm = DAG.getNode(X86ISD::VPERMV, dl, VecVT, Mask, Vec);
- return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, Op.getValueType(),
- Perm, DAG.getConstant(0, dl, getPointerTy()));
+ return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, Op.getValueType(), Perm,
+ DAG.getConstant(0, dl, PtrVT));
}
return SDValue();
}
@@ -11009,17 +11276,16 @@ X86TargetLowering::LowerConstantPool(SDValue Op, SelectionDAG &DAG) const {
else if (Subtarget->isPICStyleStubPIC())
OpFlag = X86II::MO_PIC_BASE_OFFSET;
- SDValue Result = DAG.getTargetConstantPool(CP->getConstVal(), getPointerTy(),
- CP->getAlignment(),
- CP->getOffset(), OpFlag);
+ auto PtrVT = getPointerTy(DAG.getDataLayout());
+ SDValue Result = DAG.getTargetConstantPool(
+ CP->getConstVal(), PtrVT, CP->getAlignment(), CP->getOffset(), OpFlag);
SDLoc DL(CP);
- Result = DAG.getNode(WrapperKind, DL, getPointerTy(), Result);
+ Result = DAG.getNode(WrapperKind, DL, PtrVT, Result);
// With PIC, the address is actually $g + Offset.
if (OpFlag) {
- Result = DAG.getNode(ISD::ADD, DL, getPointerTy(),
- DAG.getNode(X86ISD::GlobalBaseReg,
- SDLoc(), getPointerTy()),
- Result);
+ Result =
+ DAG.getNode(ISD::ADD, DL, PtrVT,
+ DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(), PtrVT), Result);
}
return Result;
@@ -11042,17 +11308,16 @@ SDValue X86TargetLowering::LowerJumpTable(SDValue Op, SelectionDAG &DAG) const {
else if (Subtarget->isPICStyleStubPIC())
OpFlag = X86II::MO_PIC_BASE_OFFSET;
- SDValue Result = DAG.getTargetJumpTable(JT->getIndex(), getPointerTy(),
- OpFlag);
+ auto PtrVT = getPointerTy(DAG.getDataLayout());
+ SDValue Result = DAG.getTargetJumpTable(JT->getIndex(), PtrVT, OpFlag);
SDLoc DL(JT);
- Result = DAG.getNode(WrapperKind, DL, getPointerTy(), Result);
+ Result = DAG.getNode(WrapperKind, DL, PtrVT, Result);
// With PIC, the address is actually $g + Offset.
if (OpFlag)
- Result = DAG.getNode(ISD::ADD, DL, getPointerTy(),
- DAG.getNode(X86ISD::GlobalBaseReg,
- SDLoc(), getPointerTy()),
- Result);
+ Result =
+ DAG.getNode(ISD::ADD, DL, PtrVT,
+ DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(), PtrVT), Result);
return Result;
}
@@ -11080,24 +11345,24 @@ X86TargetLowering::LowerExternalSymbol(SDValue Op, SelectionDAG &DAG) const {
OpFlag = X86II::MO_DARWIN_NONLAZY;
}
- SDValue Result = DAG.getTargetExternalSymbol(Sym, getPointerTy(), OpFlag);
+ auto PtrVT = getPointerTy(DAG.getDataLayout());
+ SDValue Result = DAG.getTargetExternalSymbol(Sym, PtrVT, OpFlag);
SDLoc DL(Op);
- Result = DAG.getNode(WrapperKind, DL, getPointerTy(), Result);
+ Result = DAG.getNode(WrapperKind, DL, PtrVT, Result);
// With PIC, the address is actually $g + Offset.
if (DAG.getTarget().getRelocationModel() == Reloc::PIC_ &&
!Subtarget->is64Bit()) {
- Result = DAG.getNode(ISD::ADD, DL, getPointerTy(),
- DAG.getNode(X86ISD::GlobalBaseReg,
- SDLoc(), getPointerTy()),
- Result);
+ Result =
+ DAG.getNode(ISD::ADD, DL, PtrVT,
+ DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(), PtrVT), Result);
}
// For symbols that require a load from a stub to get the address, emit the
// load.
if (isGlobalStubReference(OpFlag))
- Result = DAG.getLoad(getPointerTy(), DL, DAG.getEntryNode(), Result,
+ Result = DAG.getLoad(PtrVT, DL, DAG.getEntryNode(), Result,
MachinePointerInfo::getGOT(), false, false, false, 0);
return Result;
@@ -11112,20 +11377,19 @@ X86TargetLowering::LowerBlockAddress(SDValue Op, SelectionDAG &DAG) const {
const BlockAddress *BA = cast<BlockAddressSDNode>(Op)->getBlockAddress();
int64_t Offset = cast<BlockAddressSDNode>(Op)->getOffset();
SDLoc dl(Op);
- SDValue Result = DAG.getTargetBlockAddress(BA, getPointerTy(), Offset,
- OpFlags);
+ auto PtrVT = getPointerTy(DAG.getDataLayout());
+ SDValue Result = DAG.getTargetBlockAddress(BA, PtrVT, Offset, OpFlags);
if (Subtarget->isPICStyleRIPRel() &&
(M == CodeModel::Small || M == CodeModel::Kernel))
- Result = DAG.getNode(X86ISD::WrapperRIP, dl, getPointerTy(), Result);
+ Result = DAG.getNode(X86ISD::WrapperRIP, dl, PtrVT, Result);
else
- Result = DAG.getNode(X86ISD::Wrapper, dl, getPointerTy(), Result);
+ Result = DAG.getNode(X86ISD::Wrapper, dl, PtrVT, Result);
// With PIC, the address is actually $g + Offset.
if (isGlobalRelativeToPICBase(OpFlags)) {
- Result = DAG.getNode(ISD::ADD, dl, getPointerTy(),
- DAG.getNode(X86ISD::GlobalBaseReg, dl, getPointerTy()),
- Result);
+ Result = DAG.getNode(ISD::ADD, dl, PtrVT,
+ DAG.getNode(X86ISD::GlobalBaseReg, dl, PtrVT), Result);
}
return Result;
@@ -11139,40 +11403,40 @@ X86TargetLowering::LowerGlobalAddress(const GlobalValue *GV, SDLoc dl,
unsigned char OpFlags =
Subtarget->ClassifyGlobalReference(GV, DAG.getTarget());
CodeModel::Model M = DAG.getTarget().getCodeModel();
+ auto PtrVT = getPointerTy(DAG.getDataLayout());
SDValue Result;
if (OpFlags == X86II::MO_NO_FLAG &&
X86::isOffsetSuitableForCodeModel(Offset, M)) {
// A direct static reference to a global.
- Result = DAG.getTargetGlobalAddress(GV, dl, getPointerTy(), Offset);
+ Result = DAG.getTargetGlobalAddress(GV, dl, PtrVT, Offset);
Offset = 0;
} else {
- Result = DAG.getTargetGlobalAddress(GV, dl, getPointerTy(), 0, OpFlags);
+ Result = DAG.getTargetGlobalAddress(GV, dl, PtrVT, 0, OpFlags);
}
if (Subtarget->isPICStyleRIPRel() &&
(M == CodeModel::Small || M == CodeModel::Kernel))
- Result = DAG.getNode(X86ISD::WrapperRIP, dl, getPointerTy(), Result);
+ Result = DAG.getNode(X86ISD::WrapperRIP, dl, PtrVT, Result);
else
- Result = DAG.getNode(X86ISD::Wrapper, dl, getPointerTy(), Result);
+ Result = DAG.getNode(X86ISD::Wrapper, dl, PtrVT, Result);
// With PIC, the address is actually $g + Offset.
if (isGlobalRelativeToPICBase(OpFlags)) {
- Result = DAG.getNode(ISD::ADD, dl, getPointerTy(),
- DAG.getNode(X86ISD::GlobalBaseReg, dl, getPointerTy()),
- Result);
+ Result = DAG.getNode(ISD::ADD, dl, PtrVT,
+ DAG.getNode(X86ISD::GlobalBaseReg, dl, PtrVT), Result);
}
// For globals that require a load from a stub to get the address, emit the
// load.
if (isGlobalStubReference(OpFlags))
- Result = DAG.getLoad(getPointerTy(), dl, DAG.getEntryNode(), Result,
+ Result = DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), Result,
MachinePointerInfo::getGOT(), false, false, false, 0);
// If there was a non-zero offset that we didn't fold, create an explicit
// addition for it.
if (Offset != 0)
- Result = DAG.getNode(ISD::ADD, dl, getPointerTy(), Result,
- DAG.getConstant(Offset, dl, getPointerTy()));
+ Result = DAG.getNode(ISD::ADD, dl, PtrVT, Result,
+ DAG.getConstant(Offset, dl, PtrVT));
return Result;
}
@@ -11336,22 +11600,23 @@ X86TargetLowering::LowerGlobalTLSAddress(SDValue Op, SelectionDAG &DAG) const {
GlobalAddressSDNode *GA = cast<GlobalAddressSDNode>(Op);
const GlobalValue *GV = GA->getGlobal();
+ auto PtrVT = getPointerTy(DAG.getDataLayout());
if (Subtarget->isTargetELF()) {
TLSModel::Model model = DAG.getTarget().getTLSModel(GV);
switch (model) {
case TLSModel::GeneralDynamic:
if (Subtarget->is64Bit())
- return LowerToTLSGeneralDynamicModel64(GA, DAG, getPointerTy());
- return LowerToTLSGeneralDynamicModel32(GA, DAG, getPointerTy());
+ return LowerToTLSGeneralDynamicModel64(GA, DAG, PtrVT);
+ return LowerToTLSGeneralDynamicModel32(GA, DAG, PtrVT);
case TLSModel::LocalDynamic:
- return LowerToTLSLocalDynamicModel(GA, DAG, getPointerTy(),
+ return LowerToTLSLocalDynamicModel(GA, DAG, PtrVT,
Subtarget->is64Bit());
case TLSModel::InitialExec:
case TLSModel::LocalExec:
- return LowerToTLSExecModel(
- GA, DAG, getPointerTy(), model, Subtarget->is64Bit(),
- DAG.getTarget().getRelocationModel() == Reloc::PIC_);
+ return LowerToTLSExecModel(GA, DAG, PtrVT, model, Subtarget->is64Bit(),
+ DAG.getTarget().getRelocationModel() ==
+ Reloc::PIC_);
}
llvm_unreachable("Unknown TLS model.");
}
@@ -11374,13 +11639,12 @@ X86TargetLowering::LowerGlobalTLSAddress(SDValue Op, SelectionDAG &DAG) const {
SDValue Result = DAG.getTargetGlobalAddress(GA->getGlobal(), DL,
GA->getValueType(0),
GA->getOffset(), OpFlag);
- SDValue Offset = DAG.getNode(WrapperKind, DL, getPointerTy(), Result);
+ SDValue Offset = DAG.getNode(WrapperKind, DL, PtrVT, Result);
// With PIC32, the address is actually $g + Offset.
if (PIC32)
- Offset = DAG.getNode(ISD::ADD, DL, getPointerTy(),
- DAG.getNode(X86ISD::GlobalBaseReg,
- SDLoc(), getPointerTy()),
+ Offset = DAG.getNode(ISD::ADD, DL, PtrVT,
+ DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(), PtrVT),
Offset);
// Lowering the machine isd will make sure everything is in the right
@@ -11397,8 +11661,7 @@ X86TargetLowering::LowerGlobalTLSAddress(SDValue Op, SelectionDAG &DAG) const {
// And our return value (tls address) is in the standard call return value
// location.
unsigned Reg = Subtarget->is64Bit() ? X86::RAX : X86::EAX;
- return DAG.getCopyFromReg(Chain, DL, Reg, getPointerTy(),
- Chain.getValue(1));
+ return DAG.getCopyFromReg(Chain, DL, Reg, PtrVT, Chain.getValue(1));
}
if (Subtarget->isTargetKnownWindowsMSVC() ||
@@ -11426,50 +11689,50 @@ X86TargetLowering::LowerGlobalTLSAddress(SDValue Op, SelectionDAG &DAG) const {
: Type::getInt32PtrTy(*DAG.getContext(),
257));
- SDValue TlsArray =
- Subtarget->is64Bit()
- ? DAG.getIntPtrConstant(0x58, dl)
- : (Subtarget->isTargetWindowsGNU()
- ? DAG.getIntPtrConstant(0x2C, dl)
- : DAG.getExternalSymbol("_tls_array", getPointerTy()));
+ SDValue TlsArray = Subtarget->is64Bit()
+ ? DAG.getIntPtrConstant(0x58, dl)
+ : (Subtarget->isTargetWindowsGNU()
+ ? DAG.getIntPtrConstant(0x2C, dl)
+ : DAG.getExternalSymbol("_tls_array", PtrVT));
SDValue ThreadPointer =
- DAG.getLoad(getPointerTy(), dl, Chain, TlsArray,
- MachinePointerInfo(Ptr), false, false, false, 0);
+ DAG.getLoad(PtrVT, dl, Chain, TlsArray, MachinePointerInfo(Ptr), false,
+ false, false, 0);
SDValue res;
if (GV->getThreadLocalMode() == GlobalVariable::LocalExecTLSModel) {
res = ThreadPointer;
} else {
// Load the _tls_index variable
- SDValue IDX = DAG.getExternalSymbol("_tls_index", getPointerTy());
+ SDValue IDX = DAG.getExternalSymbol("_tls_index", PtrVT);
if (Subtarget->is64Bit())
- IDX = DAG.getExtLoad(ISD::ZEXTLOAD, dl, getPointerTy(), Chain, IDX,
+ IDX = DAG.getExtLoad(ISD::ZEXTLOAD, dl, PtrVT, Chain, IDX,
MachinePointerInfo(), MVT::i32, false, false,
false, 0);
else
- IDX = DAG.getLoad(getPointerTy(), dl, Chain, IDX, MachinePointerInfo(),
- false, false, false, 0);
+ IDX = DAG.getLoad(PtrVT, dl, Chain, IDX, MachinePointerInfo(), false,
+ false, false, 0);
- SDValue Scale = DAG.getConstant(Log2_64_Ceil(TD->getPointerSize()), dl,
- getPointerTy());
- IDX = DAG.getNode(ISD::SHL, dl, getPointerTy(), IDX, Scale);
+ auto &DL = DAG.getDataLayout();
+ SDValue Scale =
+ DAG.getConstant(Log2_64_Ceil(DL.getPointerSize()), dl, PtrVT);
+ IDX = DAG.getNode(ISD::SHL, dl, PtrVT, IDX, Scale);
- res = DAG.getNode(ISD::ADD, dl, getPointerTy(), ThreadPointer, IDX);
+ res = DAG.getNode(ISD::ADD, dl, PtrVT, ThreadPointer, IDX);
}
- res = DAG.getLoad(getPointerTy(), dl, Chain, res, MachinePointerInfo(),
- false, false, false, 0);
+ res = DAG.getLoad(PtrVT, dl, Chain, res, MachinePointerInfo(), false, false,
+ false, 0);
// Get the offset of start of .tls section
SDValue TGA = DAG.getTargetGlobalAddress(GA->getGlobal(), dl,
GA->getValueType(0),
GA->getOffset(), X86II::MO_SECREL);
- SDValue Offset = DAG.getNode(X86ISD::Wrapper, dl, getPointerTy(), TGA);
+ SDValue Offset = DAG.getNode(X86ISD::Wrapper, dl, PtrVT, TGA);
// The address of the thread local variable is the add of the thread
// pointer with the offset of the variable.
- return DAG.getNode(ISD::ADD, dl, getPointerTy(), res, Offset);
+ return DAG.getNode(ISD::ADD, dl, PtrVT, res, Offset);
}
llvm_unreachable("TLS not implemented for this target.");
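(Illustrative sketch, not part of the patch: a scalar model of the Windows TLS
address computation lowered above. All concrete values are hypothetical.)

#include <cstdint>
#include <cstdio>

int main() {
  // ThreadLocalStoragePointer (TEB+0x58 on x64, TEB+0x2C on x86) points at an
  // array of per-module TLS blocks.
  uint64_t TlsBlocks[2] = {0x10000, 0x20000};
  uint64_t ThreadPointer = (uint64_t)(uintptr_t)TlsBlocks;
  uint64_t TlsIndex = 1; // value loaded from _tls_index
  unsigned PtrSize = 8;  // the DAG scales by 1 << Log2_64_Ceil(PtrSize)
  uint64_t Slot = ThreadPointer + TlsIndex * PtrSize;
  uint64_t ModuleTlsBase = *(const uint64_t *)(uintptr_t)Slot;
  uint64_t SecRelOffset = 0x40; // @SECREL32 offset of the variable in .tls
  printf("tls address = 0x%llx\n",
         (unsigned long long)(ModuleTlsBase + SecRelOffset));
  return 0;
}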
@@ -11564,8 +11827,9 @@ SDValue X86TargetLowering::LowerSINT_TO_FP(SDValue Op,
unsigned Size = SrcVT.getSizeInBits()/8;
MachineFunction &MF = DAG.getMachineFunction();
+ auto PtrVT = getPointerTy(MF.getDataLayout());
int SSFI = MF.getFrameInfo()->CreateStackObject(Size, Size, false);
- SDValue StackSlot = DAG.getFrameIndex(SSFI, getPointerTy());
+ SDValue StackSlot = DAG.getFrameIndex(SSFI, PtrVT);
SDValue Chain = DAG.getStore(DAG.getEntryNode(), dl, Op.getOperand(0),
StackSlot,
MachinePointerInfo::getFixedStack(SSFI),
@@ -11614,7 +11878,8 @@ SDValue X86TargetLowering::BuildFILD(SDValue Op, EVT SrcVT, SDValue Chain,
MachineFunction &MF = DAG.getMachineFunction();
unsigned SSFISize = Op.getValueType().getSizeInBits()/8;
int SSFI = MF.getFrameInfo()->CreateStackObject(SSFISize, SSFISize, false);
- SDValue StackSlot = DAG.getFrameIndex(SSFI, getPointerTy());
+ auto PtrVT = getPointerTy(MF.getDataLayout());
+ SDValue StackSlot = DAG.getFrameIndex(SSFI, PtrVT);
Tys = DAG.getVTList(MVT::Other);
SDValue Ops[] = {
Chain, Result, StackSlot, DAG.getValueType(Op.getValueType()), InFlag
@@ -11656,7 +11921,8 @@ SDValue X86TargetLowering::LowerUINT_TO_FP_i64(SDValue Op,
// Build some magic constants.
static const uint32_t CV0[] = { 0x43300000, 0x45300000, 0, 0 };
Constant *C0 = ConstantDataVector::get(*Context, CV0);
- SDValue CPIdx0 = DAG.getConstantPool(C0, getPointerTy(), 16);
+ auto PtrVT = getPointerTy(DAG.getDataLayout());
+ SDValue CPIdx0 = DAG.getConstantPool(C0, PtrVT, 16);
SmallVector<Constant*,2> CV1;
CV1.push_back(
@@ -11666,7 +11932,7 @@ SDValue X86TargetLowering::LowerUINT_TO_FP_i64(SDValue Op,
ConstantFP::get(*Context, APFloat(APFloat::IEEEdouble,
APInt(64, 0x4530000000000000ULL))));
Constant *C1 = ConstantVector::get(CV1);
- SDValue CPIdx1 = DAG.getConstantPool(C1, getPointerTy(), 16);
+ SDValue CPIdx1 = DAG.getConstantPool(C1, PtrVT, 16);
// Load the 64-bit value into an XMM register.
SDValue XR1 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2i64,
@@ -11882,6 +12148,7 @@ SDValue X86TargetLowering::LowerUINT_TO_FP(SDValue Op,
SelectionDAG &DAG) const {
SDValue N0 = Op.getOperand(0);
SDLoc dl(Op);
+ auto PtrVT = getPointerTy(DAG.getDataLayout());
if (Op.getValueType().isVector())
return lowerUINT_TO_FP_vec(Op, DAG);
@@ -11904,9 +12171,8 @@ SDValue X86TargetLowering::LowerUINT_TO_FP(SDValue Op,
// Make a 64-bit buffer, and use it to build an FILD.
SDValue StackSlot = DAG.CreateStackTemporary(MVT::i64);
if (SrcVT == MVT::i32) {
- SDValue WordOff = DAG.getConstant(4, dl, getPointerTy());
- SDValue OffsetSlot = DAG.getNode(ISD::ADD, dl,
- getPointerTy(), StackSlot, WordOff);
+ SDValue WordOff = DAG.getConstant(4, dl, PtrVT);
+ SDValue OffsetSlot = DAG.getNode(ISD::ADD, dl, PtrVT, StackSlot, WordOff);
SDValue Store1 = DAG.getStore(DAG.getEntryNode(), dl, Op.getOperand(0),
StackSlot, MachinePointerInfo(),
false, false, 0);
@@ -11940,22 +12206,20 @@ SDValue X86TargetLowering::LowerUINT_TO_FP(SDValue Op,
APInt FF(32, 0x5F800000ULL);
// Check whether the sign bit is set.
- SDValue SignSet = DAG.getSetCC(dl,
- getSetCCResultType(*DAG.getContext(), MVT::i64),
- Op.getOperand(0),
- DAG.getConstant(0, dl, MVT::i64), ISD::SETLT);
+ SDValue SignSet = DAG.getSetCC(
+ dl, getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), MVT::i64),
+ Op.getOperand(0), DAG.getConstant(0, dl, MVT::i64), ISD::SETLT);
// Build a 64 bit pair (0, FF) in the constant pool, with FF in the lo bits.
SDValue FudgePtr = DAG.getConstantPool(
- ConstantInt::get(*DAG.getContext(), FF.zext(64)),
- getPointerTy());
+ ConstantInt::get(*DAG.getContext(), FF.zext(64)), PtrVT);
// Get a pointer to FF if the sign bit was set, or to 0 otherwise.
SDValue Zero = DAG.getIntPtrConstant(0, dl);
SDValue Four = DAG.getIntPtrConstant(4, dl);
SDValue Offset = DAG.getNode(ISD::SELECT, dl, Zero.getValueType(), SignSet,
Zero, Four);
- FudgePtr = DAG.getNode(ISD::ADD, dl, getPointerTy(), FudgePtr, Offset);
+ FudgePtr = DAG.getNode(ISD::ADD, dl, PtrVT, FudgePtr, Offset);
// Load the value out, extending it from f32 to f80.
// FIXME: Avoid the extend by constructing the right constant pool?
@@ -11974,6 +12238,7 @@ X86TargetLowering:: FP_TO_INTHelper(SDValue Op, SelectionDAG &DAG,
SDLoc DL(Op);
EVT DstTy = Op.getValueType();
+ auto PtrVT = getPointerTy(DAG.getDataLayout());
if (!IsSigned && !isIntegerTypeFTOL(DstTy)) {
assert(DstTy == MVT::i32 && "Unexpected FP_TO_UINT");
@@ -11998,7 +12263,7 @@ X86TargetLowering:: FP_TO_INTHelper(SDValue Op, SelectionDAG &DAG,
MachineFunction &MF = DAG.getMachineFunction();
unsigned MemSize = DstTy.getSizeInBits()/8;
int SSFI = MF.getFrameInfo()->CreateStackObject(MemSize, MemSize, false);
- SDValue StackSlot = DAG.getFrameIndex(SSFI, getPointerTy());
+ SDValue StackSlot = DAG.getFrameIndex(SSFI, PtrVT);
unsigned Opc;
if (!IsSigned && isIntegerTypeFTOL(DstTy))
@@ -12032,7 +12297,7 @@ X86TargetLowering:: FP_TO_INTHelper(SDValue Op, SelectionDAG &DAG,
Value = DAG.getMemIntrinsicNode(X86ISD::FLD, DL, Tys, Ops, DstTy, MMO);
Chain = Value.getValue(1);
SSFI = MF.getFrameInfo()->CreateStackObject(MemSize, MemSize, false);
- StackSlot = DAG.getFrameIndex(SSFI, getPointerTy());
+ StackSlot = DAG.getFrameIndex(SSFI, PtrVT);
}
MachineMemOperand *MMO =
@@ -12403,7 +12668,7 @@ static SDValue LowerFABSorFNEG(SDValue Op, SelectionDAG &DAG) {
Constant *C = ConstantInt::get(*Context, MaskElt);
C = ConstantVector::getSplat(NumElts, C);
const TargetLowering &TLI = DAG.getTargetLoweringInfo();
- SDValue CPIdx = DAG.getConstantPool(C, TLI.getPointerTy());
+ SDValue CPIdx = DAG.getConstantPool(C, TLI.getPointerTy(DAG.getDataLayout()));
unsigned Alignment = cast<ConstantPoolSDNode>(CPIdx)->getAlignment();
SDValue Mask = DAG.getLoad(VT, dl, DAG.getEntryNode(), CPIdx,
MachinePointerInfo::getConstantPool(),
@@ -12462,7 +12727,8 @@ static SDValue LowerFCOPYSIGN(SDValue Op, SelectionDAG &DAG) {
CV[0] = ConstantFP::get(*Context,
APFloat(Sem, APInt::getHighBitsSet(SizeInBits, 1)));
Constant *C = ConstantVector::get(CV);
- SDValue CPIdx = DAG.getConstantPool(C, TLI.getPointerTy(), 16);
+ auto PtrVT = TLI.getPointerTy(DAG.getDataLayout());
+ SDValue CPIdx = DAG.getConstantPool(C, PtrVT, 16);
SDValue Mask1 = DAG.getLoad(SrcVT, dl, DAG.getEntryNode(), CPIdx,
MachinePointerInfo::getConstantPool(),
false, false, false, 16);
@@ -12483,7 +12749,7 @@ static SDValue LowerFCOPYSIGN(SDValue Op, SelectionDAG &DAG) {
APFloat(Sem, APInt::getLowBitsSet(SizeInBits, SizeInBits - 1)));
}
C = ConstantVector::get(CV);
- CPIdx = DAG.getConstantPool(C, TLI.getPointerTy(), 16);
+ CPIdx = DAG.getConstantPool(C, PtrVT, 16);
SDValue Val = DAG.getLoad(VT, dl, DAG.getEntryNode(), CPIdx,
MachinePointerInfo::getConstantPool(),
false, false, false, 16);
@@ -13352,8 +13618,8 @@ static SDValue LowerVSETCC(SDValue Op, const X86Subtarget *Subtarget,
if (hasMinMax) {
switch (SetCCOpcode) {
default: break;
- case ISD::SETULE: Opc = X86ISD::UMIN; MinMax = true; break;
- case ISD::SETUGE: Opc = X86ISD::UMAX; MinMax = true; break;
+ case ISD::SETULE: Opc = ISD::UMIN; MinMax = true; break;
+ case ISD::SETUGE: Opc = ISD::UMAX; MinMax = true; break;
}
if (MinMax) { Swap = false; Invert = false; FlipSigns = false; }
@@ -14172,8 +14438,8 @@ static SDValue LowerExtendedLoad(SDValue Op, const X86Subtarget *Subtarget,
SmallVector<SDValue, 8> Chains;
SDValue Ptr = Ld->getBasePtr();
- SDValue Increment =
- DAG.getConstant(SclrLoadTy.getSizeInBits() / 8, dl, TLI.getPointerTy());
+ SDValue Increment = DAG.getConstant(SclrLoadTy.getSizeInBits() / 8, dl,
+ TLI.getPointerTy(DAG.getDataLayout()));
SDValue Res = DAG.getUNDEF(LoadUnitVecVT);
for (unsigned i = 0; i < NumLoads; ++i) {
@@ -14613,7 +14879,7 @@ X86TargetLowering::LowerDYNAMIC_STACKALLOC(SDValue Op,
EVT VT = Op.getNode()->getValueType(0);
bool Is64Bit = Subtarget->is64Bit();
- EVT SPTy = getPointerTy();
+ MVT SPTy = getPointerTy(DAG.getDataLayout());
if (SplitStack) {
MachineRegisterInfo &MRI = MF.getRegInfo();
@@ -14630,8 +14896,7 @@ X86TargetLowering::LowerDYNAMIC_STACKALLOC(SDValue Op,
"have nested arguments.");
}
- const TargetRegisterClass *AddrRegClass =
- getRegClassFor(getPointerTy());
+ const TargetRegisterClass *AddrRegClass = getRegClassFor(SPTy);
unsigned Vreg = MRI.createVirtualRegister(AddrRegClass);
Chain = DAG.getCopyToReg(Chain, dl, Vreg, Size);
SDValue Value = DAG.getNode(X86ISD::SEG_ALLOCA, dl, SPTy, Chain,
@@ -14666,6 +14931,7 @@ X86TargetLowering::LowerDYNAMIC_STACKALLOC(SDValue Op,
SDValue X86TargetLowering::LowerVASTART(SDValue Op, SelectionDAG &DAG) const {
MachineFunction &MF = DAG.getMachineFunction();
+ auto PtrVT = getPointerTy(MF.getDataLayout());
X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>();
const Value *SV = cast<SrcValueSDNode>(Op.getOperand(2))->getValue();
@@ -14674,8 +14940,7 @@ SDValue X86TargetLowering::LowerVASTART(SDValue Op, SelectionDAG &DAG) const {
if (!Subtarget->is64Bit() || Subtarget->isTargetWin64()) {
// vastart just stores the address of the VarArgsFrameIndex slot into the
// memory location argument.
- SDValue FR = DAG.getFrameIndex(FuncInfo->getVarArgsFrameIndex(),
- getPointerTy());
+ SDValue FR = DAG.getFrameIndex(FuncInfo->getVarArgsFrameIndex(), PtrVT);
return DAG.getStore(Op.getOperand(0), DL, FR, Op.getOperand(1),
MachinePointerInfo(SV), false, false, 0);
}
@@ -14695,8 +14960,7 @@ SDValue X86TargetLowering::LowerVASTART(SDValue Op, SelectionDAG &DAG) const {
MemOps.push_back(Store);
// Store fp_offset
- FIN = DAG.getNode(ISD::ADD, DL, getPointerTy(),
- FIN, DAG.getIntPtrConstant(4, DL));
+ FIN = DAG.getNode(ISD::ADD, DL, PtrVT, FIN, DAG.getIntPtrConstant(4, DL));
Store = DAG.getStore(Op.getOperand(0), DL,
DAG.getConstant(FuncInfo->getVarArgsFPOffset(), DL,
MVT::i32),
@@ -14704,20 +14968,16 @@ SDValue X86TargetLowering::LowerVASTART(SDValue Op, SelectionDAG &DAG) const {
MemOps.push_back(Store);
// Store ptr to overflow_arg_area
- FIN = DAG.getNode(ISD::ADD, DL, getPointerTy(),
- FIN, DAG.getIntPtrConstant(4, DL));
- SDValue OVFIN = DAG.getFrameIndex(FuncInfo->getVarArgsFrameIndex(),
- getPointerTy());
+ FIN = DAG.getNode(ISD::ADD, DL, PtrVT, FIN, DAG.getIntPtrConstant(4, DL));
+ SDValue OVFIN = DAG.getFrameIndex(FuncInfo->getVarArgsFrameIndex(), PtrVT);
Store = DAG.getStore(Op.getOperand(0), DL, OVFIN, FIN,
MachinePointerInfo(SV, 8),
false, false, 0);
MemOps.push_back(Store);
// Store ptr to reg_save_area.
- FIN = DAG.getNode(ISD::ADD, DL, getPointerTy(),
- FIN, DAG.getIntPtrConstant(8, DL));
- SDValue RSFIN = DAG.getFrameIndex(FuncInfo->getRegSaveFrameIndex(),
- getPointerTy());
+ FIN = DAG.getNode(ISD::ADD, DL, PtrVT, FIN, DAG.getIntPtrConstant(8, DL));
+ SDValue RSFIN = DAG.getFrameIndex(FuncInfo->getRegSaveFrameIndex(), PtrVT);
Store = DAG.getStore(Op.getOperand(0), DL, RSFIN, FIN,
MachinePointerInfo(SV, 16), false, false, 0);
MemOps.push_back(Store);
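(For reference, an illustrative sketch, not part of the patch: the SysV x86-64
va_list record whose fields the stores above populate; the offsets match the
MachinePointerInfo(SV, 8) / (SV, 16) and +4/+8 increments applied to FIN.)

#include <cstddef>
#include <cstdio>

struct X86_64VAList {
  unsigned gp_offset;      // offset 0
  unsigned fp_offset;      // offset 4
  void *overflow_arg_area; // offset 8
  void *reg_save_area;     // offset 16
};

int main() {
  printf("%zu %zu %zu %zu\n", offsetof(X86_64VAList, gp_offset),
         offsetof(X86_64VAList, fp_offset),
         offsetof(X86_64VAList, overflow_arg_area),
         offsetof(X86_64VAList, reg_save_area));
  return 0;
}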
@@ -14739,7 +14999,7 @@ SDValue X86TargetLowering::LowerVAARG(SDValue Op, SelectionDAG &DAG) const {
EVT ArgVT = Op.getNode()->getValueType(0);
Type *ArgTy = ArgVT.getTypeForEVT(*DAG.getContext());
- uint32_t ArgSize = getDataLayout()->getTypeAllocSize(ArgTy);
+ uint32_t ArgSize = DAG.getDataLayout().getTypeAllocSize(ArgTy);
uint8_t ArgMode;
// Decide which area this value should be read from.
@@ -14768,7 +15028,7 @@ SDValue X86TargetLowering::LowerVAARG(SDValue Op, SelectionDAG &DAG) const {
SDValue InstOps[] = {Chain, SrcPtr, DAG.getConstant(ArgSize, dl, MVT::i32),
DAG.getConstant(ArgMode, dl, MVT::i8),
DAG.getConstant(Align, dl, MVT::i32)};
- SDVTList VTs = DAG.getVTList(getPointerTy(), MVT::Other);
+ SDVTList VTs = DAG.getVTList(getPointerTy(DAG.getDataLayout()), MVT::Other);
SDValue VAARG = DAG.getMemIntrinsicNode(X86ISD::VAARG_64, dl,
VTs, InstOps, MVT::i64,
MachinePointerInfo(SV),
@@ -14995,6 +15255,20 @@ static SDValue getScalarMaskingNode(SDValue Op, SDValue Mask,
return DAG.getNode(X86ISD::SELECT, dl, VT, IMask, Op, PreservedSrc);
}
+static int getSEHRegistrationNodeSize(const Function *Fn) {
+ if (!Fn->hasPersonalityFn())
+ report_fatal_error(
+ "querying registration node size for function without personality");
+ // The RegNodeSize is 6 32-bit words for SEH and 4 for C++ EH. See
+ // WinEHStatePass for the full struct definition.
+ switch (classifyEHPersonality(Fn->getPersonalityFn())) {
+ case EHPersonality::MSVC_X86SEH: return 24;
+ case EHPersonality::MSVC_CXX: return 16;
+ default: break;
+ }
+ report_fatal_error("can only recover FP for MSVC EH personality functions");
+}
+
/// When the 32-bit MSVC runtime transfers control to us, either to an outlined
/// function or when returning to a parent frame after catching an exception, we
/// recover the parent frame pointer by doing arithmetic on the incoming EBP.
@@ -15009,7 +15283,7 @@ static SDValue recoverFramePointer(SelectionDAG &DAG, const Function *Fn,
SDLoc dl;
const TargetLowering &TLI = DAG.getTargetLoweringInfo();
- MVT PtrVT = TLI.getPointerTy();
+ MVT PtrVT = TLI.getPointerTy(DAG.getDataLayout());
// It's possible that the parent function no longer has a personality function
// if the exceptional code was optimized away, in which case we just return
@@ -15017,15 +15291,7 @@ static SDValue recoverFramePointer(SelectionDAG &DAG, const Function *Fn,
if (!Fn->hasPersonalityFn())
return EntryEBP;
- // The RegNodeSize is 6 32-bit words for SEH and 4 for C++ EH. See
- // WinEHStatePass for the full struct definition.
- int RegNodeSize;
- switch (classifyEHPersonality(Fn->getPersonalityFn())) {
- default:
- report_fatal_error("can only recover FP for MSVC EH personality functions");
- case EHPersonality::MSVC_X86SEH: RegNodeSize = 24; break;
- case EHPersonality::MSVC_CXX: RegNodeSize = 16; break;
- }
+ int RegNodeSize = getSEHRegistrationNodeSize(Fn);
// Get an MCSymbol that will ultimately resolve to the frame offset of the EH
// registration.
@@ -15034,7 +15300,7 @@ static SDValue recoverFramePointer(SelectionDAG &DAG, const Function *Fn,
GlobalValue::getRealLinkageName(Fn->getName()));
SDValue OffsetSymVal = DAG.getMCSymbol(OffsetSym, PtrVT);
SDValue RegNodeFrameOffset =
- DAG.getNode(ISD::FRAME_ALLOC_RECOVER, dl, PtrVT, OffsetSymVal);
+ DAG.getNode(ISD::LOCAL_RECOVER, dl, PtrVT, OffsetSymVal);
// RegNodeBase = EntryEBP - RegNodeSize
// ParentFP = RegNodeBase - RegNodeFrameOffset
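(A worked example, not from the patch, of the two-step arithmetic in the
comment above; EntryEBP and RegNodeFrameOffset are hypothetical values.)

#include <cstdint>
#include <cstdio>

int main() {
  uint32_t EntryEBP = 0x0019FF70; // hypothetical incoming EBP
  int RegNodeSize = 24;           // MSVC_X86SEH registration node (see above)
  int RegNodeFrameOffset = 40;    // hypothetical LOCAL_RECOVER result
  uint32_t RegNodeBase = EntryEBP - RegNodeSize;
  uint32_t ParentFP = RegNodeBase - RegNodeFrameOffset;
  printf("ParentFP = 0x%08X\n", ParentFP); // 0x0019FF30
  return 0;
}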
@@ -15059,6 +15325,9 @@ static SDValue LowerINTRINSIC_WO_CHAIN(SDValue Op, const X86Subtarget *Subtarget
case INTR_TYPE_3OP:
return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(), Op.getOperand(1),
Op.getOperand(2), Op.getOperand(3));
+ case INTR_TYPE_4OP:
+ return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(), Op.getOperand(1),
+ Op.getOperand(2), Op.getOperand(3), Op.getOperand(4));
case INTR_TYPE_1OP_MASK_RM: {
SDValue Src = Op.getOperand(1);
SDValue PassThru = Op.getOperand(2);
@@ -15143,7 +15412,7 @@ static SDValue LowerINTRINSIC_WO_CHAIN(SDValue Op, const X86Subtarget *Subtarget
SDValue Rnd;
if (Op.getNumOperands() == 6)
Rnd = Op.getOperand(5);
- else
+ else
Rnd = DAG.getConstant(X86::STATIC_ROUNDING::CUR_DIRECTION, dl, MVT::i32);
return getVectorMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT,
Src1, Src2, Rnd),
@@ -15173,7 +15442,7 @@ static SDValue LowerINTRINSIC_WO_CHAIN(SDValue Op, const X86Subtarget *Subtarget
Src1, Src2, Src3),
Mask, PassThru, Subtarget, DAG);
}
- case VPERM_3OP_MASKZ:
+ case VPERM_3OP_MASKZ:
case VPERM_3OP_MASK:
case FMA_OP_MASK3:
case FMA_OP_MASKZ:
@@ -15499,6 +15768,19 @@ static SDValue LowerINTRINSIC_WO_CHAIN(SDValue Op, const X86Subtarget *Subtarget
"llvm.x86.seh.recoverfp must take a function as the first argument");
return recoverFramePointer(DAG, Fn, IncomingFPOp);
}
+
+ case Intrinsic::localaddress: {
+ // Returns one of the stack, base, or frame pointer registers, depending on
+ // which is used to reference local variables.
+ MachineFunction &MF = DAG.getMachineFunction();
+ const X86RegisterInfo *RegInfo = Subtarget->getRegisterInfo();
+ unsigned Reg;
+ if (RegInfo->hasBasePointer(MF))
+ Reg = RegInfo->getBaseRegister();
+ else // This function handles the SP or FP case.
+ Reg = RegInfo->getPtrSizedFrameRegister(MF);
+ return DAG.getCopyFromReg(DAG.getEntryNode(), dl, Reg, VT);
+ }
}
}
@@ -15712,34 +15994,60 @@ static SDValue LowerREADCYCLECOUNTER(SDValue Op, const X86Subtarget *Subtarget,
static SDValue LowerSEHRESTOREFRAME(SDValue Op, const X86Subtarget *Subtarget,
SelectionDAG &DAG) {
MachineFunction &MF = DAG.getMachineFunction();
+ const Function *Fn = MF.getFunction();
SDLoc dl(Op);
SDValue Chain = Op.getOperand(0);
+ assert(Subtarget->getFrameLowering()->hasFP(MF) &&
+ "using llvm.x86.seh.restoreframe requires a frame pointer");
+
const TargetLowering &TLI = DAG.getTargetLoweringInfo();
- MVT VT = TLI.getPointerTy();
+ MVT VT = TLI.getPointerTy(DAG.getDataLayout());
const X86RegisterInfo *RegInfo = Subtarget->getRegisterInfo();
unsigned FrameReg =
RegInfo->getPtrSizedFrameRegister(DAG.getMachineFunction());
unsigned SPReg = RegInfo->getStackRegister();
+ unsigned SlotSize = RegInfo->getSlotSize();
// Get incoming EBP.
SDValue IncomingEBP =
DAG.getCopyFromReg(Chain, dl, FrameReg, VT);
- // Load [EBP-24] into SP.
- SDValue SPAddr =
- DAG.getNode(ISD::ADD, dl, VT, IncomingEBP, DAG.getConstant(-24, dl, VT));
+ // SP is saved in the first field of every registration node, so load
+ // [EBP-RegNodeSize] into SP.
+ int RegNodeSize = getSEHRegistrationNodeSize(Fn);
+ SDValue SPAddr = DAG.getNode(ISD::ADD, dl, VT, IncomingEBP,
+ DAG.getConstant(-RegNodeSize, dl, VT));
SDValue NewSP =
DAG.getLoad(VT, dl, Chain, SPAddr, MachinePointerInfo(), false, false,
false, VT.getScalarSizeInBits() / 8);
Chain = DAG.getCopyToReg(Chain, dl, SPReg, NewSP);
- // FIXME: Restore the base pointer in case of stack realignment!
+ if (!RegInfo->needsStackRealignment(MF)) {
+ // Adjust EBP to point back to the original frame position.
+ SDValue NewFP = recoverFramePointer(DAG, Fn, IncomingEBP);
+ Chain = DAG.getCopyToReg(Chain, dl, FrameReg, NewFP);
+ } else {
+ assert(RegInfo->hasBasePointer(MF) &&
+ "functions with Win32 EH must use frame or base pointer register");
+
+ // Reload the base pointer (ESI) with the adjusted incoming EBP.
+ SDValue NewBP = recoverFramePointer(DAG, Fn, IncomingEBP);
+ Chain = DAG.getCopyToReg(Chain, dl, RegInfo->getBaseRegister(), NewBP);
+
+ // Reload the spilled EBP value, now that the stack and base pointers are
+ // set up.
+ X86MachineFunctionInfo *X86FI = MF.getInfo<X86MachineFunctionInfo>();
+ X86FI->setHasSEHFramePtrSave(true);
+ int FI = MF.getFrameInfo()->CreateSpillStackObject(SlotSize, SlotSize);
+ X86FI->setSEHFramePtrSaveIndex(FI);
+ SDValue NewFP = DAG.getLoad(VT, dl, Chain, DAG.getFrameIndex(FI, VT),
+ MachinePointerInfo(), false, false, false,
+ VT.getScalarSizeInBits() / 8);
+ Chain = DAG.getCopyToReg(NewFP, dl, FrameReg, NewFP);
+ }
- // Adjust EBP to point back to the original frame position.
- SDValue NewFP = recoverFramePointer(DAG, MF.getFunction(), IncomingEBP);
- Chain = DAG.getCopyToReg(Chain, dl, FrameReg, NewFP);
return Chain;
}
@@ -15910,7 +16218,7 @@ SDValue X86TargetLowering::LowerRETURNADDR(SDValue Op,
unsigned Depth = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
SDLoc dl(Op);
- EVT PtrVT = getPointerTy();
+ EVT PtrVT = getPointerTy(DAG.getDataLayout());
if (Depth > 0) {
SDValue FrameAddr = LowerFRAMEADDR(Op, DAG);
@@ -15969,14 +16277,36 @@ SDValue X86TargetLowering::LowerFRAMEADDR(SDValue Op, SelectionDAG &DAG) const {
// FIXME? Maybe this could be a TableGen attribute on some registers and
// this table could be generated automatically from RegInfo.
-unsigned X86TargetLowering::getRegisterByName(const char* RegName,
- EVT VT) const {
+unsigned X86TargetLowering::getRegisterByName(const char* RegName, EVT VT,
+ SelectionDAG &DAG) const {
+ const TargetFrameLowering &TFI = *Subtarget->getFrameLowering();
+ const MachineFunction &MF = DAG.getMachineFunction();
+
unsigned Reg = StringSwitch<unsigned>(RegName)
.Case("esp", X86::ESP)
.Case("rsp", X86::RSP)
+ .Case("ebp", X86::EBP)
+ .Case("rbp", X86::RBP)
.Default(0);
+
+ if (Reg == X86::EBP || Reg == X86::RBP) {
+ if (!TFI.hasFP(MF))
+ report_fatal_error("register " + StringRef(RegName) +
+ " is allocatable: function has no frame pointer");
+#ifndef NDEBUG
+ else {
+ const X86RegisterInfo *RegInfo = Subtarget->getRegisterInfo();
+ unsigned FrameReg =
+ RegInfo->getPtrSizedFrameRegister(DAG.getMachineFunction());
+ assert((FrameReg == X86::EBP || FrameReg == X86::RBP) &&
+ "Invalid Frame Register!");
+ }
+#endif
+ }
+
if (Reg)
return Reg;
+
report_fatal_error("Invalid register name global variable");
}
@@ -15992,7 +16322,7 @@ SDValue X86TargetLowering::LowerEH_RETURN(SDValue Op, SelectionDAG &DAG) const {
SDValue Handler = Op.getOperand(2);
SDLoc dl (Op);
- EVT PtrVT = getPointerTy();
+ EVT PtrVT = getPointerTy(DAG.getDataLayout());
const X86RegisterInfo *RegInfo = Subtarget->getRegisterInfo();
unsigned FrameReg = RegInfo->getFrameRegister(DAG.getMachineFunction());
assert(((FrameReg == X86::RBP && PtrVT == MVT::i64) ||
@@ -16211,7 +16541,8 @@ SDValue X86TargetLowering::LowerFLT_ROUNDS_(SDValue Op,
// Save FP Control Word to stack slot
int SSFI = MF.getFrameInfo()->CreateStackObject(2, StackAlignment, false);
- SDValue StackSlot = DAG.getFrameIndex(SSFI, getPointerTy());
+ SDValue StackSlot =
+ DAG.getFrameIndex(SSFI, getPointerTy(DAG.getDataLayout()));
MachineMemOperand *MMO =
MF.getMachineMemOperand(MachinePointerInfo::getFixedStack(SSFI),
@@ -16572,7 +16903,7 @@ SDValue X86TargetLowering::LowerWin64_i128OP(SDValue Op, SelectionDAG &DAG) cons
}
SDValue Callee = DAG.getExternalSymbol(getLibcallName(LC),
- getPointerTy());
+ getPointerTy(DAG.getDataLayout()));
TargetLowering::CallLoweringInfo CLI(DAG);
CLI.setDebugLoc(dl).setChain(InChain)
@@ -16642,9 +16973,9 @@ static SDValue LowerMUL_LOHI(SDValue Op, const X86Subtarget *Subtarget,
// If we have a signed multiply but no PMULDQ fix up the high parts of a
// unsigned multiply.
if (IsSigned && !Subtarget->hasSSE41()) {
- SDValue ShAmt =
- DAG.getConstant(31, dl,
- DAG.getTargetLoweringInfo().getShiftAmountTy(VT));
+ SDValue ShAmt = DAG.getConstant(
+ 31, dl,
+ DAG.getTargetLoweringInfo().getShiftAmountTy(VT, DAG.getDataLayout()));
SDValue T1 = DAG.getNode(ISD::AND, dl, VT,
DAG.getNode(ISD::SRA, dl, VT, Op0, ShAmt), Op1);
SDValue T2 = DAG.getNode(ISD::AND, dl, VT,
@@ -16717,6 +17048,38 @@ static SDValue LowerScalarImmediateShift(SDValue Op, SelectionDAG &DAG,
unsigned X86Opc = (Op.getOpcode() == ISD::SHL) ? X86ISD::VSHLI :
(Op.getOpcode() == ISD::SRL) ? X86ISD::VSRLI : X86ISD::VSRAI;
+ auto ArithmeticShiftRight64 = [&](uint64_t ShiftAmt) {
+ assert((VT == MVT::v2i64 || VT == MVT::v4i64) && "Unexpected SRA type");
+ MVT ExVT = MVT::getVectorVT(MVT::i32, VT.getVectorNumElements() * 2);
+ SDValue Ex = DAG.getBitcast(ExVT, R);
+
+ if (ShiftAmt >= 32) {
+ // Splat sign to upper i32 dst, and SRA upper i32 src to lower i32.
+ SDValue Upper =
+ getTargetVShiftByConstNode(X86ISD::VSRAI, dl, ExVT, Ex, 31, DAG);
+ SDValue Lower = getTargetVShiftByConstNode(X86ISD::VSRAI, dl, ExVT, Ex,
+ ShiftAmt - 32, DAG);
+ if (VT == MVT::v2i64)
+ Ex = DAG.getVectorShuffle(ExVT, dl, Upper, Lower, {5, 1, 7, 3});
+ if (VT == MVT::v4i64)
+ Ex = DAG.getVectorShuffle(ExVT, dl, Upper, Lower,
+ {9, 1, 11, 3, 13, 5, 15, 7});
+ } else {
+      // SRA upper i32, SRL whole i64 and select lower i32.
+ SDValue Upper = getTargetVShiftByConstNode(X86ISD::VSRAI, dl, ExVT, Ex,
+ ShiftAmt, DAG);
+ SDValue Lower =
+ getTargetVShiftByConstNode(X86ISD::VSRLI, dl, VT, R, ShiftAmt, DAG);
+ Lower = DAG.getBitcast(ExVT, Lower);
+ if (VT == MVT::v2i64)
+ Ex = DAG.getVectorShuffle(ExVT, dl, Upper, Lower, {4, 1, 6, 3});
+ if (VT == MVT::v4i64)
+ Ex = DAG.getVectorShuffle(ExVT, dl, Upper, Lower,
+ {8, 1, 10, 3, 12, 5, 14, 7});
+ }
+ return DAG.getBitcast(VT, Ex);
+ };
+
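(Scalar sketch, not part of the patch, of the ShiftAmt >= 32 case in the
ArithmeticShiftRight64 lambda above, treating one i64 lane as a Lo/Hi pair of
i32s; the test value and shift amount are arbitrary.)

#include <cstdint>
#include <cstdio>

int main() {
  int64_t V = INT64_C(-81985529216486896); // arbitrary negative lane value
  unsigned ShiftAmt = 36;                  // any constant amount >= 32
  int32_t Hi = (int32_t)((uint64_t)V >> 32);
  int32_t NewHi = Hi >> 31;              // splat the sign into the upper i32
  int32_t NewLo = Hi >> (ShiftAmt - 32); // SRA the old upper i32
  int64_t Emulated =
      (int64_t)(((uint64_t)(uint32_t)NewHi << 32) | (uint32_t)NewLo);
  printf("match = %d\n", Emulated == (V >> ShiftAmt)); // prints match = 1
  return 0;
}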
// Optimize shl/srl/sra with constant shift amount.
if (auto *BVAmt = dyn_cast<BuildVectorSDNode>(Amt)) {
if (auto *ShiftConst = BVAmt->getConstantSplatNode()) {
@@ -16725,6 +17088,11 @@ static SDValue LowerScalarImmediateShift(SDValue Op, SelectionDAG &DAG,
if (SupportedVectorShiftWithImm(VT, Subtarget, Op.getOpcode()))
return getTargetVShiftByConstNode(X86Opc, dl, VT, R, ShiftAmt, DAG);
+ // i64 SRA needs to be performed as partial shifts.
+ if ((VT == MVT::v2i64 || (Subtarget->hasInt256() && VT == MVT::v4i64)) &&
+ Op.getOpcode() == ISD::SRA)
+ return ArithmeticShiftRight64(ShiftAmt);
+
if (VT == MVT::v16i8 || (Subtarget->hasInt256() && VT == MVT::v32i8)) {
unsigned NumElts = VT.getVectorNumElements();
MVT ShiftVT = MVT::getVectorVT(MVT::i16, NumElts / 2);
@@ -16808,7 +17176,12 @@ static SDValue LowerScalarImmediateShift(SDValue Op, SelectionDAG &DAG,
if (ShAmt != ShiftAmt)
return SDValue();
}
- return getTargetVShiftByConstNode(X86Opc, dl, VT, R, ShiftAmt, DAG);
+
+ if (SupportedVectorShiftWithImm(VT, Subtarget, Op.getOpcode()))
+ return getTargetVShiftByConstNode(X86Opc, dl, VT, R, ShiftAmt, DAG);
+
+ if (Op.getOpcode() == ISD::SRA)
+ return ArithmeticShiftRight64(ShiftAmt);
}
return SDValue();
@@ -16890,7 +17263,9 @@ static SDValue LowerScalarVariableShift(SDValue Op, SelectionDAG &DAG,
if (Vals[j] != Amt.getOperand(i + j))
return SDValue();
}
- return DAG.getNode(X86OpcV, dl, VT, R, Op.getOperand(1));
+
+ if (SupportedVectorShiftWithBaseAmnt(VT, Subtarget, Op.getOpcode()))
+ return DAG.getNode(X86OpcV, dl, VT, R, Op.getOperand(1));
}
return SDValue();
}
@@ -17042,6 +17417,53 @@ static SDValue LowerShift(SDValue Op, const X86Subtarget* Subtarget,
}
}
+ // v4i32 Non Uniform Shifts.
+ // If the shift amount is constant we can shift each lane using the SSE2
+ // immediate shifts, else we need to zero-extend each lane to the lower i64
+ // and shift using the SSE2 variable shifts.
+ // The separate results can then be blended together.
+ if (VT == MVT::v4i32) {
+ unsigned Opc = Op.getOpcode();
+ SDValue Amt0, Amt1, Amt2, Amt3;
+ if (ISD::isBuildVectorOfConstantSDNodes(Amt.getNode())) {
+ Amt0 = DAG.getVectorShuffle(VT, dl, Amt, DAG.getUNDEF(VT), {0, 0, 0, 0});
+ Amt1 = DAG.getVectorShuffle(VT, dl, Amt, DAG.getUNDEF(VT), {1, 1, 1, 1});
+ Amt2 = DAG.getVectorShuffle(VT, dl, Amt, DAG.getUNDEF(VT), {2, 2, 2, 2});
+ Amt3 = DAG.getVectorShuffle(VT, dl, Amt, DAG.getUNDEF(VT), {3, 3, 3, 3});
+ } else {
+ // ISD::SHL is handled above but we include it here for completeness.
+ switch (Opc) {
+ default:
+ llvm_unreachable("Unknown target vector shift node");
+ case ISD::SHL:
+ Opc = X86ISD::VSHL;
+ break;
+ case ISD::SRL:
+ Opc = X86ISD::VSRL;
+ break;
+ case ISD::SRA:
+ Opc = X86ISD::VSRA;
+ break;
+ }
+      // The SSE2 shifts use the lower i64 as the shift amount for all
+      // lanes and ignore the upper i64. These shuffle masks optimally
+      // zero-extend each lane on SSE2/SSE41/AVX targets.
+ SDValue Z = getZeroVector(VT, Subtarget, DAG, dl);
+ Amt0 = DAG.getVectorShuffle(VT, dl, Amt, Z, {0, 4, -1, -1});
+ Amt1 = DAG.getVectorShuffle(VT, dl, Amt, Z, {1, 5, -1, -1});
+ Amt2 = DAG.getVectorShuffle(VT, dl, Amt, Z, {2, 6, -1, -1});
+ Amt3 = DAG.getVectorShuffle(VT, dl, Amt, Z, {3, 7, -1, -1});
+ }
+
+ SDValue R0 = DAG.getNode(Opc, dl, VT, R, Amt0);
+ SDValue R1 = DAG.getNode(Opc, dl, VT, R, Amt1);
+ SDValue R2 = DAG.getNode(Opc, dl, VT, R, Amt2);
+ SDValue R3 = DAG.getNode(Opc, dl, VT, R, Amt3);
+ SDValue R02 = DAG.getVectorShuffle(VT, dl, R0, R2, {0, -1, 6, -1});
+ SDValue R13 = DAG.getVectorShuffle(VT, dl, R1, R3, {-1, 1, -1, 7});
+ return DAG.getVectorShuffle(VT, dl, R02, R13, {0, 5, 2, 7});
+ }
+
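(A small model, not part of the patch, of how the three shuffle masks above
blend lane i of Ri into lane i of the final result; the lane values are tags,
not real shift results.)

#include <cstdio>

// shuffle(A, B, M): mask index 0-3 selects A[i], 4-7 selects B[i-4], -1 undef.
static void shuffle(const int *A, const int *B, const int *M, int *Out) {
  for (int i = 0; i != 4; ++i)
    Out[i] = M[i] < 0 ? -1 : (M[i] < 4 ? A[M[i]] : B[M[i] - 4]);
}

int main() {
  // Tag lane j of Ri with 10*i + j so the blend is easy to follow.
  int R0[4] = {0, 1, 2, 3}, R1[4] = {10, 11, 12, 13};
  int R2[4] = {20, 21, 22, 23}, R3[4] = {30, 31, 32, 33};
  int M02[4] = {0, -1, 6, -1}, M13[4] = {-1, 1, -1, 7}, MF[4] = {0, 5, 2, 7};
  int R02[4], R13[4], Res[4];
  shuffle(R0, R2, M02, R02);  // {R0[0], undef, R2[2], undef}
  shuffle(R1, R3, M13, R13);  // {undef, R1[1], undef, R3[3]}
  shuffle(R02, R13, MF, Res); // {R0[0], R1[1], R2[2], R3[3]}
  printf("%d %d %d %d\n", Res[0], Res[1], Res[2], Res[3]); // 0 11 22 33
  return 0;
}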
if (VT == MVT::v16i8 || (VT == MVT::v32i8 && Subtarget->hasInt256())) {
MVT ExtVT = MVT::getVectorVT(MVT::i16, VT.getVectorNumElements() / 2);
unsigned ShiftOpcode = Op->getOpcode();
@@ -17944,7 +18366,8 @@ static SDValue LowerFSINCOS(SDValue Op, const X86Subtarget *Subtarget,
// the results are returned via SRet in memory.
const char *LibcallName = isF64 ? "__sincos_stret" : "__sincosf_stret";
const TargetLowering &TLI = DAG.getTargetLoweringInfo();
- SDValue Callee = DAG.getExternalSymbol(LibcallName, TLI.getPointerTy());
+ SDValue Callee =
+ DAG.getExternalSymbol(LibcallName, TLI.getPointerTy(DAG.getDataLayout()));
Type *RetTy = isF64
? (Type*)StructType::get(ArgTy, ArgTy, nullptr)
@@ -18443,10 +18866,6 @@ const char *X86TargetLowering::getTargetNodeName(unsigned Opcode) const {
case X86ISD::HSUB: return "X86ISD::HSUB";
case X86ISD::FHADD: return "X86ISD::FHADD";
case X86ISD::FHSUB: return "X86ISD::FHSUB";
- case X86ISD::UMAX: return "X86ISD::UMAX";
- case X86ISD::UMIN: return "X86ISD::UMIN";
- case X86ISD::SMAX: return "X86ISD::SMAX";
- case X86ISD::SMIN: return "X86ISD::SMIN";
case X86ISD::ABS: return "X86ISD::ABS";
case X86ISD::FMAX: return "X86ISD::FMAX";
case X86ISD::FMAX_RND: return "X86ISD::FMAX_RND";
@@ -18456,6 +18875,8 @@ const char *X86TargetLowering::getTargetNodeName(unsigned Opcode) const {
case X86ISD::FMINC: return "X86ISD::FMINC";
case X86ISD::FRSQRT: return "X86ISD::FRSQRT";
case X86ISD::FRCP: return "X86ISD::FRCP";
+ case X86ISD::EXTRQI: return "X86ISD::EXTRQI";
+ case X86ISD::INSERTQI: return "X86ISD::INSERTQI";
case X86ISD::TLSADDR: return "X86ISD::TLSADDR";
case X86ISD::TLSBASEADDR: return "X86ISD::TLSBASEADDR";
case X86ISD::TLSCALL: return "X86ISD::TLSCALL";
@@ -18478,6 +18899,7 @@ const char *X86TargetLowering::getTargetNodeName(unsigned Opcode) const {
case X86ISD::VFPEXT: return "X86ISD::VFPEXT";
case X86ISD::VFPROUND: return "X86ISD::VFPROUND";
case X86ISD::CVTDQ2PD: return "X86ISD::CVTDQ2PD";
+ case X86ISD::CVTUDQ2PD: return "X86ISD::CVTUDQ2PD";
case X86ISD::VSHLDQ: return "X86ISD::VSHLDQ";
case X86ISD::VSRLDQ: return "X86ISD::VSRLDQ";
case X86ISD::VSHL: return "X86ISD::VSHL";
@@ -18594,16 +19016,19 @@ const char *X86TargetLowering::getTargetNodeName(unsigned Opcode) const {
case X86ISD::ADDS: return "X86ISD::ADDS";
case X86ISD::SUBS: return "X86ISD::SUBS";
case X86ISD::AVG: return "X86ISD::AVG";
+ case X86ISD::MULHRS: return "X86ISD::MULHRS";
case X86ISD::SINT_TO_FP_RND: return "X86ISD::SINT_TO_FP_RND";
case X86ISD::UINT_TO_FP_RND: return "X86ISD::UINT_TO_FP_RND";
+ case X86ISD::FP_TO_SINT_RND: return "X86ISD::FP_TO_SINT_RND";
+ case X86ISD::FP_TO_UINT_RND: return "X86ISD::FP_TO_UINT_RND";
}
return nullptr;
}
// isLegalAddressingMode - Return true if the addressing mode represented
// by AM is legal for this target, for a load/store of the specified type.
-bool X86TargetLowering::isLegalAddressingMode(const AddrMode &AM,
- Type *Ty,
+bool X86TargetLowering::isLegalAddressingMode(const DataLayout &DL,
+ const AddrMode &AM, Type *Ty,
unsigned AS) const {
// X86 supports extremely general addressing modes.
CodeModel::Model M = getTargetMachine().getCodeModel();
@@ -19555,7 +19980,7 @@ X86TargetLowering::EmitLoweredSegAlloca(MachineInstr *MI,
MachineRegisterInfo &MRI = MF->getRegInfo();
const TargetRegisterClass *AddrRegClass =
- getRegClassFor(getPointerTy());
+ getRegClassFor(getPointerTy(MF->getDataLayout()));
unsigned mallocPtrVReg = MRI.createVirtualRegister(AddrRegClass),
bumpSPPtrVReg = MRI.createVirtualRegister(AddrRegClass),
@@ -19750,7 +20175,7 @@ X86TargetLowering::emitEHSjLjSetJmp(MachineInstr *MI,
MemOpndSlot = CurOp;
- MVT PVT = getPointerTy();
+ MVT PVT = getPointerTy(MF->getDataLayout());
assert((PVT == MVT::i64 || PVT == MVT::i32) &&
"Invalid Pointer Size!");
@@ -19882,7 +20307,7 @@ X86TargetLowering::emitEHSjLjLongJmp(MachineInstr *MI,
MachineInstr::mmo_iterator MMOBegin = MI->memoperands_begin();
MachineInstr::mmo_iterator MMOEnd = MI->memoperands_end();
- MVT PVT = getPointerTy();
+ MVT PVT = getPointerTy(MF->getDataLayout());
assert((PVT == MVT::i64 || PVT == MVT::i32) &&
"Invalid Pointer Size!");
@@ -21377,7 +21802,7 @@ static SDValue XFormVExtractWithShuffleIntoLoad(SDNode *N, SelectionDAG &DAG,
// alignment is valid.
unsigned Align = LN0->getAlignment();
const TargetLowering &TLI = DAG.getTargetLoweringInfo();
- unsigned NewAlign = TLI.getDataLayout()->getABITypeAlignment(
+ unsigned NewAlign = DAG.getDataLayout().getABITypeAlignment(
EltVT.getTypeForEVT(*DAG.getContext()));
if (NewAlign > Align || !TLI.isOperationLegalOrCustom(ISD::LOAD, EltVT))
@@ -21513,14 +21938,15 @@ static SDValue PerformEXTRACT_VECTOR_ELTCombine(SDNode *N, SelectionDAG &DAG,
if (TLI.isOperationLegal(ISD::SRA, MVT::i64)) {
SDValue Cst = DAG.getBitcast(MVT::v2i64, InputVector);
- EVT VecIdxTy = DAG.getTargetLoweringInfo().getVectorIdxTy();
+ auto &DL = DAG.getDataLayout();
+ EVT VecIdxTy = DAG.getTargetLoweringInfo().getVectorIdxTy(DL);
SDValue BottomHalf = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i64, Cst,
DAG.getConstant(0, dl, VecIdxTy));
SDValue TopHalf = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i64, Cst,
DAG.getConstant(1, dl, VecIdxTy));
- SDValue ShAmt = DAG.getConstant(32, dl,
- DAG.getTargetLoweringInfo().getShiftAmountTy(MVT::i64));
+ SDValue ShAmt = DAG.getConstant(
+ 32, dl, DAG.getTargetLoweringInfo().getShiftAmountTy(MVT::i64, DL));
Vals[0] = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32, BottomHalf);
Vals[1] = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32,
DAG.getNode(ISD::SRA, dl, MVT::i64, BottomHalf, ShAmt));
@@ -21539,10 +21965,11 @@ static SDValue PerformEXTRACT_VECTOR_ELTCombine(SDNode *N, SelectionDAG &DAG,
// Replace each use (extract) with a load of the appropriate element.
for (unsigned i = 0; i < 4; ++i) {
uint64_t Offset = EltSize * i;
- SDValue OffsetVal = DAG.getConstant(Offset, dl, TLI.getPointerTy());
+ auto PtrVT = TLI.getPointerTy(DAG.getDataLayout());
+ SDValue OffsetVal = DAG.getConstant(Offset, dl, PtrVT);
- SDValue ScalarAddr = DAG.getNode(ISD::ADD, dl, TLI.getPointerTy(),
- StackPtr, OffsetVal);
+ SDValue ScalarAddr =
+ DAG.getNode(ISD::ADD, dl, PtrVT, StackPtr, OffsetVal);
// Load the scalar.
Vals[i] = DAG.getLoad(ElementType, dl, Ch,
@@ -21622,16 +22049,16 @@ matchIntegerMINMAX(SDValue Cond, EVT VT, SDValue LHS, SDValue RHS,
default: break;
case ISD::SETULT:
case ISD::SETULE:
- Opc = hasUnsigned ? X86ISD::UMIN : 0u; break;
+ Opc = hasUnsigned ? ISD::UMIN : 0; break;
case ISD::SETUGT:
case ISD::SETUGE:
- Opc = hasUnsigned ? X86ISD::UMAX : 0u; break;
+ Opc = hasUnsigned ? ISD::UMAX : 0; break;
case ISD::SETLT:
case ISD::SETLE:
- Opc = hasSigned ? X86ISD::SMIN : 0u; break;
+ Opc = hasSigned ? ISD::SMIN : 0; break;
case ISD::SETGT:
case ISD::SETGE:
- Opc = hasSigned ? X86ISD::SMAX : 0u; break;
+ Opc = hasSigned ? ISD::SMAX : 0; break;
}
// Check for x CC y ? y : x -- a min/max with reversed arms.
} else if (DAG.isEqualTo(LHS, Cond.getOperand(1)) &&
@@ -21640,16 +22067,16 @@ matchIntegerMINMAX(SDValue Cond, EVT VT, SDValue LHS, SDValue RHS,
default: break;
case ISD::SETULT:
case ISD::SETULE:
- Opc = hasUnsigned ? X86ISD::UMAX : 0u; break;
+ Opc = hasUnsigned ? ISD::UMAX : 0; break;
case ISD::SETUGT:
case ISD::SETUGE:
- Opc = hasUnsigned ? X86ISD::UMIN : 0u; break;
+ Opc = hasUnsigned ? ISD::UMIN : 0; break;
case ISD::SETLT:
case ISD::SETLE:
- Opc = hasSigned ? X86ISD::SMAX : 0u; break;
+ Opc = hasSigned ? ISD::SMAX : 0; break;
case ISD::SETGT:
case ISD::SETGE:
- Opc = hasSigned ? X86ISD::SMIN : 0u; break;
+ Opc = hasSigned ? ISD::SMIN : 0; break;
}
}
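matchIntegerMINMAX now hands back the generic ISD::UMIN/UMAX/SMIN/SMAX opcodes instead of X86-specific nodes. The patterns it recognizes are the usual select-of-setcc idioms; a standalone C++ sketch (illustration only, not the SelectionDAG matching code) of why reversed select arms turn a min into a max:

#include <cstdio>

// "x < y ? x : y" selects the smaller value (SMIN); with the arms reversed,
// "x < y ? y : x" selects the larger one (SMAX).
static int selectLT(int X, int Y, bool ReversedArms) {
  return ReversedArms ? (X < Y ? Y : X)   // maps to SMAX
                      : (X < Y ? X : Y);  // maps to SMIN
}

int main() {
  std::printf("%d %d\n", selectLT(3, 7, false), selectLT(3, 7, true)); // 3 7
  return 0;
}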
@@ -22106,7 +22533,8 @@ static SDValue PerformSELECTCombine(SDNode *N, SelectionDAG &DAG,
// Check if the selector will be produced by CMPP*/PCMP*
Cond.getOpcode() == ISD::SETCC &&
// Check if SETCC has already been promoted
- TLI.getSetCCResultType(*DAG.getContext(), VT) == CondVT) {
+ TLI.getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT) ==
+ CondVT) {
bool TValIsAllZeros = ISD::isBuildVectorAllZeros(LHS.getNode());
bool FValIsAllOnes = ISD::isBuildVectorAllOnes(RHS.getNode());
@@ -22826,7 +23254,7 @@ static SDValue PerformSHLCombine(SDNode *N, SelectionDAG &DAG) {
// We shift all of the values by one. In many cases we do not have
// hardware support for this operation. This is better expressed as an ADD
// of two values.
- if (N1SplatC->getZExtValue() == 1)
+ if (N1SplatC->getAPIntValue() == 1)
return DAG.getNode(ISD::ADD, SDLoc(N), VT, N0, N0);
}
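The shift combine above relies on the identity x << 1 == x + x, which also holds lane-wise for vector shifts. A standalone check in plain C++ (illustration only, including the wrap-around cases):

#include <cassert>
#include <cstdint>

int main() {
  const uint32_t Vals[] = {0u, 1u, 0x7FFFFFFFu, 0xFFFFFFFFu};
  for (uint32_t X : Vals)
    assert((X << 1) == X + X);  // shift-by-one is the same as self-add
  return 0;
}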
@@ -23478,7 +23906,8 @@ static SDValue PerformLOADCombine(SDNode *N, SelectionDAG &DAG,
return SDValue();
SDValue Ptr = Ld->getBasePtr();
- SDValue Increment = DAG.getConstant(16, dl, TLI.getPointerTy());
+ SDValue Increment =
+ DAG.getConstant(16, dl, TLI.getPointerTy(DAG.getDataLayout()));
EVT HalfVT = EVT::getVectorVT(*DAG.getContext(), MemVT.getScalarType(),
NumElems/2);
@@ -23687,7 +24116,8 @@ static SDValue PerformSTORECombine(SDNode *N, SelectionDAG &DAG,
SDValue Value0 = Extract128BitVector(StoredVal, 0, DAG, dl);
SDValue Value1 = Extract128BitVector(StoredVal, NumElems/2, DAG, dl);
- SDValue Stride = DAG.getConstant(16, dl, TLI.getPointerTy());
+ SDValue Stride =
+ DAG.getConstant(16, dl, TLI.getPointerTy(DAG.getDataLayout()));
SDValue Ptr0 = St->getBasePtr();
SDValue Ptr1 = DAG.getNode(ISD::ADD, dl, Ptr0.getValueType(), Ptr0, Stride);
@@ -23760,8 +24190,8 @@ static SDValue PerformSTORECombine(SDNode *N, SelectionDAG &DAG,
assert(StoreVecVT.getSizeInBits() == VT.getSizeInBits());
SDValue ShuffWide = DAG.getBitcast(StoreVecVT, Shuff);
SmallVector<SDValue, 8> Chains;
- SDValue Increment = DAG.getConstant(StoreType.getSizeInBits()/8, dl,
- TLI.getPointerTy());
+ SDValue Increment = DAG.getConstant(StoreType.getSizeInBits() / 8, dl,
+ TLI.getPointerTy(DAG.getDataLayout()));
SDValue Ptr = St->getBasePtr();
// Perform one or more big stores into memory.
@@ -24659,6 +25089,31 @@ static SDValue performVectorCompareAndMaskUnaryOpCombine(SDNode *N,
return SDValue();
}
+static SDValue PerformUINT_TO_FPCombine(SDNode *N, SelectionDAG &DAG,
+ const X86Subtarget *Subtarget) {
+ SDValue Op0 = N->getOperand(0);
+ EVT VT = N->getValueType(0);
+ EVT InVT = Op0.getValueType();
+ EVT InSVT = InVT.getScalarType();
+ const TargetLowering &TLI = DAG.getTargetLoweringInfo();
+
+ // UINT_TO_FP(vXi8) -> SINT_TO_FP(ZEXT(vXi8 to vXi32))
+ // UINT_TO_FP(vXi16) -> SINT_TO_FP(ZEXT(vXi16 to vXi32))
+ if (InVT.isVector() && (InSVT == MVT::i8 || InSVT == MVT::i16)) {
+ SDLoc dl(N);
+ EVT DstVT = EVT::getVectorVT(*DAG.getContext(), MVT::i32,
+ InVT.getVectorNumElements());
+ SDValue P = DAG.getNode(ISD::ZERO_EXTEND, dl, DstVT, Op0);
+
+ if (TLI.isOperationLegal(ISD::UINT_TO_FP, DstVT))
+ return DAG.getNode(ISD::UINT_TO_FP, dl, VT, P);
+
+ return DAG.getNode(ISD::SINT_TO_FP, dl, VT, P);
+ }
+
+ return SDValue();
+}
+
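The new combine rests on a simple identity: an i8 or i16 unsigned value zero-extended to 32 bits is always non-negative, so converting the widened value as signed gives the same result as an unsigned conversion of the original. A standalone check in plain C++ (illustration only, exhaustive over all 16-bit inputs):

#include <cassert>
#include <cstdint>

int main() {
  for (uint32_t V = 0; V <= 0xFFFF; ++V) {
    uint16_t Narrow = static_cast<uint16_t>(V);
    float AsUnsigned = static_cast<float>(Narrow);                        // UINT_TO_FP
    float ViaSigned  = static_cast<float>(static_cast<int32_t>(Narrow));  // ZEXT + SINT_TO_FP
    assert(AsUnsigned == ViaSigned);
  }
  return 0;
}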
static SDValue PerformSINT_TO_FPCombine(SDNode *N, SelectionDAG &DAG,
const X86Subtarget *Subtarget) {
// First try to optimize away the conversion entirely when it's
@@ -24913,6 +25368,7 @@ SDValue X86TargetLowering::PerformDAGCombine(SDNode *N,
case ISD::STORE: return PerformSTORECombine(N, DAG, Subtarget);
case ISD::MSTORE: return PerformMSTORECombine(N, DAG, Subtarget);
case ISD::SINT_TO_FP: return PerformSINT_TO_FPCombine(N, DAG, Subtarget);
+ case ISD::UINT_TO_FP: return PerformUINT_TO_FPCombine(N, DAG, Subtarget);
case ISD::FADD: return PerformFADDCombine(N, DAG, Subtarget);
case ISD::FSUB: return PerformFSUBCombine(N, DAG, Subtarget);
case X86ISD::FXOR:
@@ -25135,7 +25591,7 @@ bool X86TargetLowering::ExpandInlineAsm(CallInst *CI) const {
(matchAsm(AsmPieces[0], {"rorw", "$$8,", "${0:w}"}) ||
matchAsm(AsmPieces[0], {"rolw", "$$8,", "${0:w}"}))) {
AsmPieces.clear();
- const std::string &ConstraintsStr = IA->getConstraintString();
+ StringRef ConstraintsStr = IA->getConstraintString();
SplitString(StringRef(ConstraintsStr).substr(5), AsmPieces, ",");
array_pod_sort(AsmPieces.begin(), AsmPieces.end());
if (clobbersFlagRegisters(AsmPieces))
@@ -25149,7 +25605,7 @@ bool X86TargetLowering::ExpandInlineAsm(CallInst *CI) const {
matchAsm(AsmPieces[1], {"rorl", "$$16,", "$0"}) &&
matchAsm(AsmPieces[2], {"rorw", "$$8,", "${0:w}"})) {
AsmPieces.clear();
- const std::string &ConstraintsStr = IA->getConstraintString();
+ StringRef ConstraintsStr = IA->getConstraintString();
SplitString(StringRef(ConstraintsStr).substr(5), AsmPieces, ",");
array_pod_sort(AsmPieces.begin(), AsmPieces.end());
if (clobbersFlagRegisters(AsmPieces))
@@ -25176,7 +25632,7 @@ bool X86TargetLowering::ExpandInlineAsm(CallInst *CI) const {
/// getConstraintType - Given a constraint letter, return the type of
/// constraint it is for this target.
X86TargetLowering::ConstraintType
-X86TargetLowering::getConstraintType(const std::string &Constraint) const {
+X86TargetLowering::getConstraintType(StringRef Constraint) const {
if (Constraint.size() == 1) {
switch (Constraint[0]) {
case 'R':
@@ -25508,7 +25964,7 @@ void X86TargetLowering::LowerAsmOperandForConstraint(SDValue Op,
std::pair<unsigned, const TargetRegisterClass *>
X86TargetLowering::getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI,
- const std::string &Constraint,
+ StringRef Constraint,
MVT VT) const {
// First, see if this is a constraint that directly corresponds to an LLVM
// register class.
@@ -25717,8 +26173,8 @@ X86TargetLowering::getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI,
return Res;
}
-int X86TargetLowering::getScalingFactorCost(const AddrMode &AM,
- Type *Ty,
+int X86TargetLowering::getScalingFactorCost(const DataLayout &DL,
+ const AddrMode &AM, Type *Ty,
unsigned AS) const {
// Scaling factors are not free at all.
// An indexed folded instruction, i.e., inst (reg1, reg2, scale),
@@ -25738,7 +26194,7 @@ int X86TargetLowering::getScalingFactorCost(const AddrMode &AM,
// E.g., on Haswell:
// vmovaps %ymm1, (%r8, %rdi) can use port 2 or 3.
// vmovaps %ymm1, (%r8) can use port 2, 3, or 7.
- if (isLegalAddressingMode(AM, Ty, AS))
+ if (isLegalAddressingMode(DL, AM, Ty, AS))
// Scale represents reg2 * scale, thus account for 1
// as soon as we use a second register.
return AM.Scale != 0;
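The getScalingFactorCost/isLegalAddressingMode changes are part of a wider cleanup in this patch: the DataLayout is passed in explicitly so pointer and index types are derived from the module being compiled rather than from state cached on TargetLowering. A hedged sketch of the kind of query this enables, assuming LLVM headers are available (the layout string below is illustrative, not taken from this patch):

#include "llvm/IR/DataLayout.h"
#include <cstdio>

int main() {
  // An x86-64-style layout string with an explicit 64-bit pointer entry.
  llvm::DataLayout DL("e-m:e-p:64:64-i64:64-f80:128-n8:16:32:64-S128");
  std::printf("pointer width: %u bits\n", DL.getPointerSizeInBits(0));  // 64
  return 0;
}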
diff --git a/lib/Target/X86/X86ISelLowering.h b/lib/Target/X86/X86ISelLowering.h
index 17660891635c..723d5304495c 100644
--- a/lib/Target/X86/X86ISelLowering.h
+++ b/lib/Target/X86/X86ISelLowering.h
@@ -233,12 +233,6 @@ namespace llvm {
/// Floating point horizontal sub.
FHSUB,
- /// Unsigned integer max and min.
- UMAX, UMIN,
-
- /// Signed integer max and min.
- SMAX, SMIN,
-
// Integer absolute value
ABS,
@@ -298,8 +292,8 @@ namespace llvm {
// Vector FP round.
VFPROUND,
- // Vector signed integer to double.
- CVTDQ2PD,
+ // Vector signed/unsigned integer to double.
+ CVTDQ2PD, CVTUDQ2PD,
// 128-bit vector logical left / right shift
VSHLDQ, VSRLDQ,
@@ -400,10 +394,15 @@ namespace llvm {
VINSERT,
VEXTRACT,
+ /// SSE4A Extraction and Insertion.
+ EXTRQI, INSERTQI,
+
// Vector multiply packed unsigned doubleword integers
PMULUDQ,
// Vector multiply packed signed doubleword integers
PMULDQ,
+    // Vector multiply packed unsigned integers with round and scale
+ MULHRS,
// FMA nodes
FMADD,
@@ -429,6 +428,9 @@ namespace llvm {
//with rounding mode
SINT_TO_FP_RND,
UINT_TO_FP_RND,
+
+ // Vector float/double to signed/unsigned integer.
+ FP_TO_SINT_RND, FP_TO_UINT_RND,
// Save xmm argument registers to the stack, according to %al. An operator
// is needed so that this can be expanded with control flow.
VASTART_SAVE_XMM_REGS,
@@ -599,7 +601,9 @@ namespace llvm {
unsigned getJumpTableEncoding() const override;
bool useSoftFloat() const override;
- MVT getScalarShiftAmountTy(EVT LHSTy) const override { return MVT::i8; }
+ MVT getScalarShiftAmountTy(const DataLayout &, EVT) const override {
+ return MVT::i8;
+ }
const MCExpr *
LowerCustomJumpTableEntry(const MachineJumpTableInfo *MJTI,
@@ -617,7 +621,8 @@ namespace llvm {
/// function arguments in the caller parameter area. For X86, aggregates
    /// that contain SSE vectors are placed at 16-byte boundaries while the rest are at
/// 4-byte boundaries.
- unsigned getByValTypeAlignment(Type *Ty) const override;
+ unsigned getByValTypeAlignment(Type *Ty,
+ const DataLayout &DL) const override;
/// Returns the target specific optimal type for load
/// and store operations as a result of memset, memcpy, and memmove
@@ -685,7 +690,8 @@ namespace llvm {
bool isCheapToSpeculateCtlz() const override;
/// Return the value type to use for ISD::SETCC.
- EVT getSetCCResultType(LLVMContext &Context, EVT VT) const override;
+ EVT getSetCCResultType(const DataLayout &DL, LLVMContext &Context,
+ EVT VT) const override;
/// Determine which of the bits specified in Mask are known to be either
/// zero or one and return them in the KnownZero/KnownOne bitsets.
@@ -707,8 +713,7 @@ namespace llvm {
bool ExpandInlineAsm(CallInst *CI) const override;
- ConstraintType
- getConstraintType(const std::string &Constraint) const override;
+ ConstraintType getConstraintType(StringRef Constraint) const override;
/// Examine constraint string and operand type and determine a weight value.
/// The operand object must already have been set up with the operand type.
@@ -726,8 +731,8 @@ namespace llvm {
std::vector<SDValue> &Ops,
SelectionDAG &DAG) const override;
- unsigned getInlineAsmMemConstraint(
- const std::string &ConstraintCode) const override {
+ unsigned
+ getInlineAsmMemConstraint(StringRef ConstraintCode) const override {
if (ConstraintCode == "i")
return InlineAsm::Constraint_i;
else if (ConstraintCode == "o")
@@ -745,13 +750,12 @@ namespace llvm {
/// error, this returns a register number of 0.
std::pair<unsigned, const TargetRegisterClass *>
getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI,
- const std::string &Constraint,
- MVT VT) const override;
+ StringRef Constraint, MVT VT) const override;
/// Return true if the addressing mode represented
/// by AM is legal for this target, for a load/store of the specified type.
- bool isLegalAddressingMode(const AddrMode &AM, Type *Ty,
- unsigned AS) const override;
+ bool isLegalAddressingMode(const DataLayout &DL, const AddrMode &AM,
+ Type *Ty, unsigned AS) const override;
/// Return true if the specified immediate is legal
/// icmp immediate, that is the target has icmp instructions which can
@@ -770,7 +774,7 @@ namespace llvm {
/// of the specified type.
/// If the AM is supported, the return value must be >= 0.
/// If the AM is not supported, it returns a negative value.
- int getScalingFactorCost(const AddrMode &AM, Type *Ty,
+ int getScalingFactorCost(const DataLayout &DL, const AddrMode &AM, Type *Ty,
unsigned AS) const override;
bool isVectorShiftByScalarCheap(Type *Ty) const override;
@@ -872,7 +876,8 @@ namespace llvm {
return nullptr; // nothing to do, move along.
}
- unsigned getRegisterByName(const char* RegName, EVT VT) const override;
+ unsigned getRegisterByName(const char* RegName, EVT VT,
+ SelectionDAG &DAG) const override;
/// This method returns a target specific FastISel object,
/// or null if the target does not support "fast" ISel.
diff --git a/lib/Target/X86/X86InstrAVX512.td b/lib/Target/X86/X86InstrAVX512.td
index b309b8210851..faa91500b181 100644
--- a/lib/Target/X86/X86InstrAVX512.td
+++ b/lib/Target/X86/X86InstrAVX512.td
@@ -3136,6 +3136,12 @@ defm VPMULLW : avx512_binop_rm_vl_w<0xD5, "vpmull", mul,
SSE_INTALU_ITINS_P, HasBWI, 1>;
defm VPMULLQ : avx512_binop_rm_vl_q<0x40, "vpmull", mul,
SSE_INTALU_ITINS_P, HasDQI, 1>, T8PD;
+defm VPMULHW : avx512_binop_rm_vl_w<0xE5, "vpmulh", mulhs, SSE_INTALU_ITINS_P,
+ HasBWI, 1>;
+defm VPMULHUW : avx512_binop_rm_vl_w<0xE4, "vpmulhu", mulhu, SSE_INTMUL_ITINS_P,
+ HasBWI, 1>;
+defm VPMULHRSW : avx512_binop_rm_vl_w<0x0B, "vpmulhrs", X86mulhrs, SSE_INTMUL_ITINS_P,
+ HasBWI, 1>, T8PD;
defm VPAVG : avx512_binop_rm_vl_bw<0xE0, 0xE3, "vpavg", X86avg,
SSE_INTALU_ITINS_P, HasBWI, 1>;
@@ -3230,32 +3236,32 @@ let Predicates = [HasBWI] in {
defm VPACKUSWB : avx512_packs_all_i16_i8 <0x67, "vpackuswb", X86Packus>, AVX512BIBase, VEX_W;
}
-defm VPMAXSB : avx512_binop_rm_vl_b<0x3C, "vpmaxs", X86smax,
+defm VPMAXSB : avx512_binop_rm_vl_b<0x3C, "vpmaxs", smax,
SSE_INTALU_ITINS_P, HasBWI, 1>, T8PD;
-defm VPMAXSW : avx512_binop_rm_vl_w<0xEE, "vpmaxs", X86smax,
+defm VPMAXSW : avx512_binop_rm_vl_w<0xEE, "vpmaxs", smax,
SSE_INTALU_ITINS_P, HasBWI, 1>;
-defm VPMAXS : avx512_binop_rm_vl_dq<0x3D, 0x3D, "vpmaxs", X86smax,
+defm VPMAXS : avx512_binop_rm_vl_dq<0x3D, 0x3D, "vpmaxs", smax,
SSE_INTALU_ITINS_P, HasAVX512, 1>, T8PD;
-defm VPMAXUB : avx512_binop_rm_vl_b<0xDE, "vpmaxu", X86umax,
+defm VPMAXUB : avx512_binop_rm_vl_b<0xDE, "vpmaxu", umax,
SSE_INTALU_ITINS_P, HasBWI, 1>;
-defm VPMAXUW : avx512_binop_rm_vl_w<0x3E, "vpmaxu", X86umax,
+defm VPMAXUW : avx512_binop_rm_vl_w<0x3E, "vpmaxu", umax,
SSE_INTALU_ITINS_P, HasBWI, 1>, T8PD;
-defm VPMAXU : avx512_binop_rm_vl_dq<0x3F, 0x3F, "vpmaxu", X86umax,
+defm VPMAXU : avx512_binop_rm_vl_dq<0x3F, 0x3F, "vpmaxu", umax,
SSE_INTALU_ITINS_P, HasAVX512, 1>, T8PD;
-defm VPMINSB : avx512_binop_rm_vl_b<0x38, "vpmins", X86smin,
+defm VPMINSB : avx512_binop_rm_vl_b<0x38, "vpmins", smin,
SSE_INTALU_ITINS_P, HasBWI, 1>, T8PD;
-defm VPMINSW : avx512_binop_rm_vl_w<0xEA, "vpmins", X86smin,
+defm VPMINSW : avx512_binop_rm_vl_w<0xEA, "vpmins", smin,
SSE_INTALU_ITINS_P, HasBWI, 1>;
-defm VPMINS : avx512_binop_rm_vl_dq<0x39, 0x39, "vpmins", X86smin,
+defm VPMINS : avx512_binop_rm_vl_dq<0x39, 0x39, "vpmins", smin,
SSE_INTALU_ITINS_P, HasAVX512, 1>, T8PD;
-defm VPMINUB : avx512_binop_rm_vl_b<0xDA, "vpminu", X86umin,
+defm VPMINUB : avx512_binop_rm_vl_b<0xDA, "vpminu", umin,
SSE_INTALU_ITINS_P, HasBWI, 1>;
-defm VPMINUW : avx512_binop_rm_vl_w<0x3A, "vpminu", X86umin,
+defm VPMINUW : avx512_binop_rm_vl_w<0x3A, "vpminu", umin,
SSE_INTALU_ITINS_P, HasBWI, 1>, T8PD;
-defm VPMINU : avx512_binop_rm_vl_dq<0x3B, 0x3B, "vpminu", X86umin,
+defm VPMINU : avx512_binop_rm_vl_dq<0x3B, 0x3B, "vpminu", umin,
SSE_INTALU_ITINS_P, HasAVX512, 1>, T8PD;
//===----------------------------------------------------------------------===//
@@ -4035,7 +4041,7 @@ multiclass avx512_fma3p_213_rm<bits<8> opc, string OpcodeStr, SDNode OpNode,
OpcodeStr, !strconcat("${src3}", _.BroadcastStr,", $src2"),
!strconcat("$src2, ${src3}", _.BroadcastStr ),
(OpNode _.RC:$src1,
- _.RC:$src2,(_.VT (X86VBroadcast (_.ScalarLdFrag addr:$src3))))>,
+ _.RC:$src2,(_.VT (X86VBroadcast (_.ScalarLdFrag addr:$src3))))>,
AVX512FMA3Base, EVEX_B;
}
}
@@ -4394,16 +4400,16 @@ def : Pat<(f64 (sint_to_fp GR32:$src)),
def : Pat<(f64 (sint_to_fp GR64:$src)),
(VCVTSI642SDZrr (f64 (IMPLICIT_DEF)), GR64:$src)>;
-defm VCVTUSI2SSZ : avx512_vcvtsi_common<0x7B, X86SuintToFpRnd, GR32,
+defm VCVTUSI2SSZ : avx512_vcvtsi_common<0x7B, X86UintToFpRnd, GR32,
v4f32x_info, i32mem, loadi32,
"cvtusi2ss{l}">, XS, EVEX_CD8<32, CD8VT1>;
-defm VCVTUSI642SSZ : avx512_vcvtsi_common<0x7B, X86SuintToFpRnd, GR64,
+defm VCVTUSI642SSZ : avx512_vcvtsi_common<0x7B, X86UintToFpRnd, GR64,
v4f32x_info, i64mem, loadi64, "cvtusi2ss{q}">,
XS, VEX_W, EVEX_CD8<64, CD8VT1>;
-defm VCVTUSI2SDZ : avx512_vcvtsi<0x7B, X86SuintToFpRnd, GR32, v2f64x_info,
+defm VCVTUSI2SDZ : avx512_vcvtsi<0x7B, X86UintToFpRnd, GR32, v2f64x_info,
i32mem, loadi32, "cvtusi2sd{l}">,
XD, VEX_LIG, EVEX_CD8<32, CD8VT1>;
-defm VCVTUSI642SDZ : avx512_vcvtsi_common<0x7B, X86SuintToFpRnd, GR64,
+defm VCVTUSI642SDZ : avx512_vcvtsi_common<0x7B, X86UintToFpRnd, GR64,
v2f64x_info, i64mem, loadi64, "cvtusi2sd{q}">,
XD, VEX_W, EVEX_CD8<64, CD8VT1>;
@@ -4604,117 +4610,389 @@ def : Pat<(extloadf32 addr:$src),
def : Pat<(f32 (fround FR64X:$src)), (VCVTSD2SSZrr FR64X:$src, FR64X:$src)>,
Requires<[HasAVX512]>;
-multiclass avx512_vcvt_fp_with_rc<bits<8> opc, string asm, RegisterClass SrcRC,
- RegisterClass DstRC, SDNode OpNode, PatFrag mem_frag,
- X86MemOperand x86memop, ValueType OpVT, ValueType InVT,
- Domain d> {
-let hasSideEffects = 0 in {
- def rr : AVX512PI<opc, MRMSrcReg, (outs DstRC:$dst), (ins SrcRC:$src),
- !strconcat(asm,"\t{$src, $dst|$dst, $src}"),
- [(set DstRC:$dst,
- (OpVT (OpNode (InVT SrcRC:$src))))], d>, EVEX;
- def rrb : AVX512PI<opc, MRMSrcReg, (outs DstRC:$dst), (ins SrcRC:$src, AVX512RC:$rc),
- !strconcat(asm,"\t{$rc, $src, $dst|$dst, $src, $rc}"),
- [], d>, EVEX, EVEX_B, EVEX_RC;
- let mayLoad = 1 in
- def rm : AVX512PI<opc, MRMSrcMem, (outs DstRC:$dst), (ins x86memop:$src),
- !strconcat(asm,"\t{$src, $dst|$dst, $src}"),
- [(set DstRC:$dst,
- (OpVT (OpNode (InVT (bitconvert (mem_frag addr:$src))))))], d>, EVEX;
-} // hasSideEffects = 0
+//===----------------------------------------------------------------------===//
+// AVX-512 Vector convert from signed/unsigned integer to float/double
+// and from float/double to signed/unsigned integer
+//===----------------------------------------------------------------------===//
+
+multiclass avx512_vcvt_fp<bits<8> opc, string OpcodeStr, X86VectorVTInfo _,
+ X86VectorVTInfo _Src, SDNode OpNode,
+ string Broadcast = _.BroadcastStr,
+ string Alias = ""> {
+
+ defm rr : AVX512_maskable<opc, MRMSrcReg, _, (outs _.RC:$dst),
+ (ins _Src.RC:$src), OpcodeStr, "$src", "$src",
+ (_.VT (OpNode (_Src.VT _Src.RC:$src)))>, EVEX;
+
+ defm rm : AVX512_maskable<opc, MRMSrcMem, _, (outs _.RC:$dst),
+ (ins _Src.MemOp:$src), OpcodeStr#Alias, "$src", "$src",
+ (_.VT (OpNode (_Src.VT
+ (bitconvert (_Src.LdFrag addr:$src)))))>, EVEX;
+
+ defm rmb : AVX512_maskable<opc, MRMSrcMem, _, (outs _.RC:$dst),
+ (ins _Src.MemOp:$src), OpcodeStr,
+ "${src}"##Broadcast, "${src}"##Broadcast,
+ (_.VT (OpNode (_Src.VT
+ (X86VBroadcast (_Src.ScalarLdFrag addr:$src)))
+ ))>, EVEX, EVEX_B;
+}
+// Conversion with SAE - suppress all exceptions
+multiclass avx512_vcvt_fp_sae<bits<8> opc, string OpcodeStr, X86VectorVTInfo _,
+ X86VectorVTInfo _Src, SDNode OpNodeRnd> {
+ defm rrb : AVX512_maskable<opc, MRMSrcReg, _, (outs _.RC:$dst),
+ (ins _Src.RC:$src), OpcodeStr,
+ "{sae}, $src", "$src, {sae}",
+ (_.VT (OpNodeRnd (_Src.VT _Src.RC:$src),
+ (i32 FROUND_NO_EXC)))>,
+ EVEX, EVEX_B;
}
-multiclass avx512_vcvt_fp<bits<8> opc, string asm, RegisterClass SrcRC,
- RegisterClass DstRC, SDNode OpNode, PatFrag mem_frag,
- X86MemOperand x86memop, ValueType OpVT, ValueType InVT,
- Domain d> {
-let hasSideEffects = 0 in {
- def rr : AVX512PI<opc, MRMSrcReg, (outs DstRC:$dst), (ins SrcRC:$src),
- !strconcat(asm,"\t{$src, $dst|$dst, $src}"),
- [(set DstRC:$dst,
- (OpVT (OpNode (InVT SrcRC:$src))))], d>, EVEX;
- let mayLoad = 1 in
- def rm : AVX512PI<opc, MRMSrcMem, (outs DstRC:$dst), (ins x86memop:$src),
- !strconcat(asm,"\t{$src, $dst|$dst, $src}"),
- [(set DstRC:$dst,
- (OpVT (OpNode (InVT (bitconvert (mem_frag addr:$src))))))], d>, EVEX;
-} // hasSideEffects = 0
+// Conversion with rounding control (RC)
+multiclass avx512_vcvt_fp_rc<bits<8> opc, string OpcodeStr, X86VectorVTInfo _,
+ X86VectorVTInfo _Src, SDNode OpNodeRnd> {
+ defm rrb : AVX512_maskable<opc, MRMSrcReg, _, (outs _.RC:$dst),
+ (ins _Src.RC:$src, AVX512RC:$rc), OpcodeStr,
+ "$rc, $src", "$src, $rc",
+ (_.VT (OpNodeRnd (_Src.VT _Src.RC:$src), (i32 imm:$rc)))>,
+ EVEX, EVEX_B, EVEX_RC;
}
-defm VCVTPD2PSZ : avx512_vcvt_fp_with_rc<0x5A, "vcvtpd2ps", VR512, VR256X, fround,
- loadv8f64, f512mem, v8f32, v8f64,
- SSEPackedSingle>, EVEX_V512, VEX_W, PD,
- EVEX_CD8<64, CD8VF>;
+// Extend Float to Double
+multiclass avx512_cvtps2pd<bits<8> opc, string OpcodeStr> {
+ let Predicates = [HasAVX512] in {
+ defm Z : avx512_vcvt_fp<opc, OpcodeStr, v8f64_info, v8f32x_info, fextend>,
+ avx512_vcvt_fp_sae<opc, OpcodeStr, v8f64_info, v8f32x_info,
+ X86vfpextRnd>, EVEX_V512;
+ }
+ let Predicates = [HasVLX] in {
+ defm Z128 : avx512_vcvt_fp<opc, OpcodeStr, v2f64x_info, v4f32x_info,
+ X86vfpext, "{1to2}">, EVEX_V128;
+ defm Z256 : avx512_vcvt_fp<opc, OpcodeStr, v4f64x_info, v4f32x_info, fextend>,
+ EVEX_V256;
+ }
+}
+
+// Truncate Double to Float
+multiclass avx512_cvtpd2ps<bits<8> opc, string OpcodeStr> {
+ let Predicates = [HasAVX512] in {
+ defm Z : avx512_vcvt_fp<opc, OpcodeStr, v8f32x_info, v8f64_info, fround>,
+ avx512_vcvt_fp_rc<opc, OpcodeStr, v8f32x_info, v8f64_info,
+ X86vfproundRnd>, EVEX_V512;
+ }
+ let Predicates = [HasVLX] in {
+ defm Z128 : avx512_vcvt_fp<opc, OpcodeStr, v4f32x_info, v2f64x_info,
+ X86vfpround, "{1to2}", "{x}">, EVEX_V128;
+ defm Z256 : avx512_vcvt_fp<opc, OpcodeStr, v4f32x_info, v4f64x_info, fround,
+ "{1to4}", "{y}">, EVEX_V256;
+ }
+}
+
+defm VCVTPD2PS : avx512_cvtpd2ps<0x5A, "vcvtpd2ps">,
+ VEX_W, PD, EVEX_CD8<64, CD8VF>;
+defm VCVTPS2PD : avx512_cvtps2pd<0x5A, "vcvtps2pd">,
+ PS, EVEX_CD8<32, CD8VH>;
-defm VCVTPS2PDZ : avx512_vcvt_fp<0x5A, "vcvtps2pd", VR256X, VR512, fextend,
- loadv4f64, f256mem, v8f64, v8f32,
- SSEPackedDouble>, EVEX_V512, PS,
- EVEX_CD8<32, CD8VH>;
def : Pat<(v8f64 (extloadv8f32 addr:$src)),
(VCVTPS2PDZrm addr:$src)>;
-def : Pat<(v8f32 (int_x86_avx512_mask_cvtpd2ps_512 (v8f64 VR512:$src),
- (bc_v8f32(v8i32 immAllZerosV)), (i8 -1), (i32 FROUND_CURRENT))),
- (VCVTPD2PSZrr VR512:$src)>;
+let Predicates = [HasVLX] in {
+ def : Pat<(v4f64 (extloadv4f32 addr:$src)),
+ (VCVTPS2PDZ256rm addr:$src)>;
+}
-def : Pat<(v8f32 (int_x86_avx512_mask_cvtpd2ps_512 (v8f64 VR512:$src),
- (bc_v8f32(v8i32 immAllZerosV)), (i8 -1), imm:$rc)),
- (VCVTPD2PSZrrb VR512:$src, imm:$rc)>;
+// Convert Signed/Unsigned Doubleword to Double
+multiclass avx512_cvtdq2pd<bits<8> opc, string OpcodeStr, SDNode OpNode,
+ SDNode OpNode128> {
+ // No rounding in this op
+ let Predicates = [HasAVX512] in
+ defm Z : avx512_vcvt_fp<opc, OpcodeStr, v8f64_info, v8i32x_info, OpNode>,
+ EVEX_V512;
-//===----------------------------------------------------------------------===//
-// AVX-512 Vector convert from sign integer to float/double
-//===----------------------------------------------------------------------===//
+ let Predicates = [HasVLX] in {
+ defm Z128 : avx512_vcvt_fp<opc, OpcodeStr, v2f64x_info, v4i32x_info,
+ OpNode128, "{1to2}">, EVEX_V128;
+ defm Z256 : avx512_vcvt_fp<opc, OpcodeStr, v4f64x_info, v4i32x_info, OpNode>,
+ EVEX_V256;
+ }
+}
-defm VCVTDQ2PSZ : avx512_vcvt_fp_with_rc<0x5B, "vcvtdq2ps", VR512, VR512, sint_to_fp,
- loadv8i64, i512mem, v16f32, v16i32,
- SSEPackedSingle>, EVEX_V512, PS,
- EVEX_CD8<32, CD8VF>;
+// Convert Signed/Unsigned Doubleword to Float
+multiclass avx512_cvtdq2ps<bits<8> opc, string OpcodeStr, SDNode OpNode,
+ SDNode OpNodeRnd> {
+ let Predicates = [HasAVX512] in
+ defm Z : avx512_vcvt_fp<opc, OpcodeStr, v16f32_info, v16i32_info, OpNode>,
+ avx512_vcvt_fp_rc<opc, OpcodeStr, v16f32_info, v16i32_info,
+ OpNodeRnd>, EVEX_V512;
+
+ let Predicates = [HasVLX] in {
+ defm Z128 : avx512_vcvt_fp<opc, OpcodeStr, v4f32x_info, v4i32x_info, OpNode>,
+ EVEX_V128;
+ defm Z256 : avx512_vcvt_fp<opc, OpcodeStr, v8f32x_info, v8i32x_info, OpNode>,
+ EVEX_V256;
+ }
+}
+
+// Convert Float to Signed/Unsigned Doubleword with truncation
+multiclass avx512_cvttps2dq<bits<8> opc, string OpcodeStr,
+ SDNode OpNode, SDNode OpNodeRnd> {
+ let Predicates = [HasAVX512] in {
+ defm Z : avx512_vcvt_fp<opc, OpcodeStr, v16i32_info, v16f32_info, OpNode>,
+ avx512_vcvt_fp_sae<opc, OpcodeStr, v16i32_info, v16f32_info,
+ OpNodeRnd>, EVEX_V512;
+ }
+ let Predicates = [HasVLX] in {
+ defm Z128 : avx512_vcvt_fp<opc, OpcodeStr, v4i32x_info, v4f32x_info, OpNode>,
+ EVEX_V128;
+ defm Z256 : avx512_vcvt_fp<opc, OpcodeStr, v8i32x_info, v8f32x_info, OpNode>,
+ EVEX_V256;
+ }
+}
+
+// Convert Float to Signed/Unsigned Doubleword
+multiclass avx512_cvtps2dq<bits<8> opc, string OpcodeStr,
+ SDNode OpNode, SDNode OpNodeRnd> {
+ let Predicates = [HasAVX512] in {
+ defm Z : avx512_vcvt_fp<opc, OpcodeStr, v16i32_info, v16f32_info, OpNode>,
+ avx512_vcvt_fp_rc<opc, OpcodeStr, v16i32_info, v16f32_info,
+ OpNodeRnd>, EVEX_V512;
+ }
+ let Predicates = [HasVLX] in {
+ defm Z128 : avx512_vcvt_fp<opc, OpcodeStr, v4i32x_info, v4f32x_info, OpNode>,
+ EVEX_V128;
+ defm Z256 : avx512_vcvt_fp<opc, OpcodeStr, v8i32x_info, v8f32x_info, OpNode>,
+ EVEX_V256;
+ }
+}
+
+// Convert Double to Signed/Unsigned Doubleword with truncation
+multiclass avx512_cvttpd2dq<bits<8> opc, string OpcodeStr,
+ SDNode OpNode, SDNode OpNodeRnd> {
+ let Predicates = [HasAVX512] in {
+ defm Z : avx512_vcvt_fp<opc, OpcodeStr, v8i32x_info, v8f64_info, OpNode>,
+ avx512_vcvt_fp_sae<opc, OpcodeStr, v8i32x_info, v8f64_info,
+ OpNodeRnd>, EVEX_V512;
+ }
+ let Predicates = [HasVLX] in {
+    // We need "x"/"y" suffixes in order to distinguish between the 128- and
+    // 256-bit memory forms of these instructions in the Asm Parser; they have
+    // the same dest type - 'v4i32x_info'. We also specify the broadcast
+    // string explicitly for the same reason.
+ defm Z128 : avx512_vcvt_fp<opc, OpcodeStr, v4i32x_info, v2f64x_info, OpNode,
+ "{1to2}", "{x}">, EVEX_V128;
+ defm Z256 : avx512_vcvt_fp<opc, OpcodeStr, v4i32x_info, v4f64x_info, OpNode,
+ "{1to4}", "{y}">, EVEX_V256;
+ }
+}
+
+// Convert Double to Signed/Unsigned Doubleword
+multiclass avx512_cvtpd2dq<bits<8> opc, string OpcodeStr,
+ SDNode OpNode, SDNode OpNodeRnd> {
+ let Predicates = [HasAVX512] in {
+ defm Z : avx512_vcvt_fp<opc, OpcodeStr, v8i32x_info, v8f64_info, OpNode>,
+ avx512_vcvt_fp_rc<opc, OpcodeStr, v8i32x_info, v8f64_info,
+ OpNodeRnd>, EVEX_V512;
+ }
+ let Predicates = [HasVLX] in {
+    // We need "x"/"y" suffixes in order to distinguish between the 128- and
+    // 256-bit memory forms of these instructions in the Asm Parser; they have
+    // the same dest type - 'v4i32x_info'. We also specify the broadcast
+    // string explicitly for the same reason.
+ defm Z128 : avx512_vcvt_fp<opc, OpcodeStr, v4i32x_info, v2f64x_info, OpNode,
+ "{1to2}", "{x}">, EVEX_V128;
+ defm Z256 : avx512_vcvt_fp<opc, OpcodeStr, v4i32x_info, v4f64x_info, OpNode,
+ "{1to4}", "{y}">, EVEX_V256;
+ }
+}
+
+// Convert Double to Signed/Unsigned Quadword
+multiclass avx512_cvtpd2qq<bits<8> opc, string OpcodeStr,
+ SDNode OpNode, SDNode OpNodeRnd> {
+ let Predicates = [HasDQI] in {
+ defm Z : avx512_vcvt_fp<opc, OpcodeStr, v8i64_info, v8f64_info, OpNode>,
+ avx512_vcvt_fp_rc<opc, OpcodeStr, v8i64_info, v8f64_info,
+ OpNodeRnd>, EVEX_V512;
+ }
+ let Predicates = [HasDQI, HasVLX] in {
+ defm Z128 : avx512_vcvt_fp<opc, OpcodeStr, v2i64x_info, v2f64x_info, OpNode>,
+ EVEX_V128;
+ defm Z256 : avx512_vcvt_fp<opc, OpcodeStr, v4i64x_info, v4f64x_info, OpNode>,
+ EVEX_V256;
+ }
+}
-defm VCVTDQ2PDZ : avx512_vcvt_fp<0xE6, "vcvtdq2pd", VR256X, VR512, sint_to_fp,
- loadv4i64, i256mem, v8f64, v8i32,
- SSEPackedDouble>, EVEX_V512, XS,
+// Convert Double to Signed/Unsigned Quadword with truncation
+multiclass avx512_cvttpd2qq<bits<8> opc, string OpcodeStr,
+ SDNode OpNode, SDNode OpNodeRnd> {
+ let Predicates = [HasDQI] in {
+ defm Z : avx512_vcvt_fp<opc, OpcodeStr, v8i64_info, v8f64_info, OpNode>,
+ avx512_vcvt_fp_sae<opc, OpcodeStr, v8i64_info, v8f64_info,
+ OpNodeRnd>, EVEX_V512;
+ }
+ let Predicates = [HasDQI, HasVLX] in {
+ defm Z128 : avx512_vcvt_fp<opc, OpcodeStr, v2i64x_info, v2f64x_info, OpNode>,
+ EVEX_V128;
+ defm Z256 : avx512_vcvt_fp<opc, OpcodeStr, v4i64x_info, v4f64x_info, OpNode>,
+ EVEX_V256;
+ }
+}
+
+// Convert Signed/Unsigned Quadword to Double
+multiclass avx512_cvtqq2pd<bits<8> opc, string OpcodeStr,
+ SDNode OpNode, SDNode OpNodeRnd> {
+ let Predicates = [HasDQI] in {
+ defm Z : avx512_vcvt_fp<opc, OpcodeStr, v8f64_info, v8i64_info, OpNode>,
+ avx512_vcvt_fp_rc<opc, OpcodeStr, v8f64_info, v8i64_info,
+ OpNodeRnd>, EVEX_V512;
+ }
+ let Predicates = [HasDQI, HasVLX] in {
+ defm Z128 : avx512_vcvt_fp<opc, OpcodeStr, v2f64x_info, v2i64x_info, OpNode>,
+ EVEX_V128;
+ defm Z256 : avx512_vcvt_fp<opc, OpcodeStr, v4f64x_info, v4i64x_info, OpNode>,
+ EVEX_V256;
+ }
+}
+
+// Convert Float to Signed/Unsigned Quadword
+multiclass avx512_cvtps2qq<bits<8> opc, string OpcodeStr,
+ SDNode OpNode, SDNode OpNodeRnd> {
+ let Predicates = [HasDQI] in {
+ defm Z : avx512_vcvt_fp<opc, OpcodeStr, v8i64_info, v8f32x_info, OpNode>,
+ avx512_vcvt_fp_rc<opc, OpcodeStr, v8i64_info, v8f32x_info,
+ OpNodeRnd>, EVEX_V512;
+ }
+ let Predicates = [HasDQI, HasVLX] in {
+ // Explicitly specified broadcast string, since we take only 2 elements
+ // from v4f32x_info source
+ defm Z128 : avx512_vcvt_fp<opc, OpcodeStr, v2i64x_info, v4f32x_info, OpNode,
+ "{1to2}">, EVEX_V128;
+ defm Z256 : avx512_vcvt_fp<opc, OpcodeStr, v4i64x_info, v4f32x_info, OpNode>,
+ EVEX_V256;
+ }
+}
+
+// Convert Float to Signed/Unsigned Quadword with truncation
+multiclass avx512_cvttps2qq<bits<8> opc, string OpcodeStr,
+ SDNode OpNode, SDNode OpNodeRnd> {
+ let Predicates = [HasDQI] in {
+ defm Z : avx512_vcvt_fp<opc, OpcodeStr, v8i64_info, v8f32x_info, OpNode>,
+ avx512_vcvt_fp_sae<opc, OpcodeStr, v8i64_info, v8f32x_info,
+ OpNodeRnd>, EVEX_V512;
+ }
+ let Predicates = [HasDQI, HasVLX] in {
+ // Explicitly specified broadcast string, since we take only 2 elements
+ // from v4f32x_info source
+ defm Z128 : avx512_vcvt_fp<opc, OpcodeStr, v2i64x_info, v4f32x_info, OpNode,
+ "{1to2}">, EVEX_V128;
+ defm Z256 : avx512_vcvt_fp<opc, OpcodeStr, v4i64x_info, v4f32x_info, OpNode>,
+ EVEX_V256;
+ }
+}
+
+// Convert Signed/Unsigned Quadword to Float
+multiclass avx512_cvtqq2ps<bits<8> opc, string OpcodeStr,
+ SDNode OpNode, SDNode OpNodeRnd> {
+ let Predicates = [HasDQI] in {
+ defm Z : avx512_vcvt_fp<opc, OpcodeStr, v8f32x_info, v8i64_info, OpNode>,
+ avx512_vcvt_fp_rc<opc, OpcodeStr, v8f32x_info, v8i64_info,
+ OpNodeRnd>, EVEX_V512;
+ }
+ let Predicates = [HasDQI, HasVLX] in {
+    // We need "x"/"y" suffixes in order to distinguish between the 128- and
+    // 256-bit memory forms of these instructions in the Asm Parser; they have
+    // the same dest type - 'v4f32x_info'. We also specify the broadcast
+    // string explicitly for the same reason.
+ defm Z128 : avx512_vcvt_fp<opc, OpcodeStr, v4f32x_info, v2i64x_info, OpNode,
+ "{1to2}", "{x}">, EVEX_V128;
+ defm Z256 : avx512_vcvt_fp<opc, OpcodeStr, v4f32x_info, v4i64x_info, OpNode,
+ "{1to4}", "{y}">, EVEX_V256;
+ }
+}
+
+defm VCVTDQ2PD : avx512_cvtdq2pd<0xE6, "vcvtdq2pd", sint_to_fp, X86cvtdq2pd>, XS,
EVEX_CD8<32, CD8VH>;
-defm VCVTTPS2DQZ : avx512_vcvt_fp<0x5B, "vcvttps2dq", VR512, VR512, fp_to_sint,
- loadv16f32, f512mem, v16i32, v16f32,
- SSEPackedSingle>, EVEX_V512, XS,
+defm VCVTDQ2PS : avx512_cvtdq2ps<0x5B, "vcvtdq2ps", sint_to_fp,
+ X86VSintToFpRnd>,
+ PS, EVEX_CD8<32, CD8VF>;
+
+defm VCVTTPS2DQ : avx512_cvttps2dq<0x5B, "vcvttps2dq", fp_to_sint,
+ X86VFpToSintRnd>,
+ XS, EVEX_CD8<32, CD8VF>;
+
+defm VCVTTPD2DQ : avx512_cvttpd2dq<0xE6, "vcvttpd2dq", fp_to_sint,
+ X86VFpToSintRnd>,
+ PD, VEX_W, EVEX_CD8<64, CD8VF>;
+
+defm VCVTTPS2UDQ : avx512_cvttps2dq<0x78, "vcvttps2udq", fp_to_uint,
+ X86VFpToUintRnd>, PS,
EVEX_CD8<32, CD8VF>;
-defm VCVTTPD2DQZ : avx512_vcvt_fp<0xE6, "vcvttpd2dq", VR512, VR256X, fp_to_sint,
- loadv8f64, f512mem, v8i32, v8f64,
- SSEPackedDouble>, EVEX_V512, PD, VEX_W,
+defm VCVTTPD2UDQ : avx512_cvttpd2dq<0x78, "vcvttpd2udq", fp_to_uint,
+ X86VFpToUintRnd>, PS, VEX_W,
EVEX_CD8<64, CD8VF>;
-defm VCVTTPS2UDQZ : avx512_vcvt_fp<0x78, "vcvttps2udq", VR512, VR512, fp_to_uint,
- loadv16f32, f512mem, v16i32, v16f32,
- SSEPackedSingle>, EVEX_V512, PS,
+defm VCVTUDQ2PD : avx512_cvtdq2pd<0x7A, "vcvtudq2pd", uint_to_fp, X86cvtudq2pd>,
+ XS, EVEX_CD8<32, CD8VH>;
+
+defm VCVTUDQ2PS : avx512_cvtdq2ps<0x7A, "vcvtudq2ps", uint_to_fp,
+ X86VUintToFpRnd>, XD,
EVEX_CD8<32, CD8VF>;
-// cvttps2udq (src, 0, mask-all-ones, sae-current)
-def : Pat<(v16i32 (int_x86_avx512_mask_cvttps2udq_512 (v16f32 VR512:$src),
- (v16i32 immAllZerosV), (i16 -1), FROUND_CURRENT)),
- (VCVTTPS2UDQZrr VR512:$src)>;
+defm VCVTPS2DQ : avx512_cvtps2dq<0x5B, "vcvtps2dq", X86cvtps2Int,
+ X86cvtps2IntRnd>, PD, EVEX_CD8<32, CD8VF>;
-defm VCVTTPD2UDQZ : avx512_vcvt_fp<0x78, "vcvttpd2udq", VR512, VR256X, fp_to_uint,
- loadv8f64, f512mem, v8i32, v8f64,
- SSEPackedDouble>, EVEX_V512, PS, VEX_W,
+defm VCVTPD2DQ : avx512_cvtpd2dq<0xE6, "vcvtpd2dq", X86cvtpd2Int,
+ X86cvtpd2IntRnd>, XD, VEX_W,
EVEX_CD8<64, CD8VF>;
-// cvttpd2udq (src, 0, mask-all-ones, sae-current)
-def : Pat<(v8i32 (int_x86_avx512_mask_cvttpd2udq_512 (v8f64 VR512:$src),
- (v8i32 immAllZerosV), (i8 -1), FROUND_CURRENT)),
- (VCVTTPD2UDQZrr VR512:$src)>;
+defm VCVTPS2UDQ : avx512_cvtps2dq<0x79, "vcvtps2udq", X86cvtps2UInt,
+ X86cvtps2UIntRnd>,
+ PS, EVEX_CD8<32, CD8VF>;
+defm VCVTPD2UDQ : avx512_cvtpd2dq<0x79, "vcvtpd2udq", X86cvtpd2UInt,
+ X86cvtpd2UIntRnd>, VEX_W,
+ PS, EVEX_CD8<64, CD8VF>;
-defm VCVTUDQ2PDZ : avx512_vcvt_fp<0x7A, "vcvtudq2pd", VR256X, VR512, uint_to_fp,
- loadv4i64, f256mem, v8f64, v8i32,
- SSEPackedDouble>, EVEX_V512, XS,
- EVEX_CD8<32, CD8VH>;
+defm VCVTPD2QQ : avx512_cvtpd2qq<0x7B, "vcvtpd2qq", X86cvtpd2Int,
+ X86cvtpd2IntRnd>, VEX_W,
+ PD, EVEX_CD8<64, CD8VF>;
-defm VCVTUDQ2PSZ : avx512_vcvt_fp_with_rc<0x7A, "vcvtudq2ps", VR512, VR512, uint_to_fp,
- loadv16i32, f512mem, v16f32, v16i32,
- SSEPackedSingle>, EVEX_V512, XD,
- EVEX_CD8<32, CD8VF>;
+defm VCVTPS2QQ : avx512_cvtps2qq<0x7B, "vcvtps2qq", X86cvtps2Int,
+ X86cvtps2IntRnd>, PD, EVEX_CD8<32, CD8VH>;
+
+defm VCVTPD2UQQ : avx512_cvtpd2qq<0x79, "vcvtpd2uqq", X86cvtpd2UInt,
+ X86cvtpd2UIntRnd>, VEX_W,
+ PD, EVEX_CD8<64, CD8VF>;
+
+defm VCVTPS2UQQ : avx512_cvtps2qq<0x79, "vcvtps2uqq", X86cvtps2UInt,
+ X86cvtps2UIntRnd>, PD, EVEX_CD8<32, CD8VH>;
+
+defm VCVTTPD2QQ : avx512_cvttpd2qq<0x7A, "vcvttpd2qq", fp_to_sint,
+ X86VFpToSlongRnd>, VEX_W,
+ PD, EVEX_CD8<64, CD8VF>;
+
+defm VCVTTPS2QQ : avx512_cvttps2qq<0x7A, "vcvttps2qq", fp_to_sint,
+ X86VFpToSlongRnd>, PD, EVEX_CD8<32, CD8VH>;
+
+defm VCVTTPD2UQQ : avx512_cvttpd2qq<0x78, "vcvttpd2uqq", fp_to_uint,
+ X86VFpToUlongRnd>, VEX_W,
+ PD, EVEX_CD8<64, CD8VF>;
+
+defm VCVTTPS2UQQ : avx512_cvttps2qq<0x78, "vcvttps2uqq", fp_to_uint,
+ X86VFpToUlongRnd>, PD, EVEX_CD8<32, CD8VH>;
+
+defm VCVTQQ2PD : avx512_cvtqq2pd<0xE6, "vcvtqq2pd", sint_to_fp,
+ X86VSlongToFpRnd>, VEX_W, XS, EVEX_CD8<64, CD8VF>;
+defm VCVTUQQ2PD : avx512_cvtqq2pd<0x7A, "vcvtuqq2pd", uint_to_fp,
+ X86VUlongToFpRnd>, VEX_W, XS, EVEX_CD8<64, CD8VF>;
+
+defm VCVTQQ2PS : avx512_cvtqq2ps<0x5B, "vcvtqq2ps", sint_to_fp,
+ X86VSlongToFpRnd>, VEX_W, PS, EVEX_CD8<64, CD8VF>;
+
+defm VCVTUQQ2PS : avx512_cvtqq2ps<0x7A, "vcvtuqq2ps", uint_to_fp,
+ X86VUlongToFpRnd>, VEX_W, XD, EVEX_CD8<64, CD8VF>;
+
+let Predicates = [NoVLX] in {
def : Pat<(v8i32 (fp_to_uint (v8f32 VR256X:$src1))),
(EXTRACT_SUBREG (v16i32 (VCVTTPS2UDQZrr
(v16f32 (SUBREG_TO_REG (i32 0), VR256X:$src1, sub_ymm)))), sub_ymm)>;
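The defm blocks above instantiate signed and unsigned variants of each conversion (for example VCVTDQ2PD alongside VCVTUDQ2PD). The distinction matters whenever the sign bit of the integer lane is set; a standalone C++ illustration (not TableGen or DAG code):

#include <cstdint>
#include <cstdio>

int main() {
  uint32_t Bits = 0xFFFFFFFFu;
  double AsSigned   = static_cast<double>(static_cast<int32_t>(Bits));  // CVTDQ2PD-style:  -1.0
  double AsUnsigned = static_cast<double>(Bits);                        // CVTUDQ2PD-style: 4294967295.0
  std::printf("%f %f\n", AsSigned, AsUnsigned);
  return 0;
}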
@@ -4734,67 +5012,8 @@ def : Pat<(v4f32 (uint_to_fp (v4i32 VR128X:$src1))),
def : Pat<(v4f64 (uint_to_fp (v4i32 VR128X:$src1))),
(EXTRACT_SUBREG (v8f64 (VCVTUDQ2PDZrr
(v8i32 (SUBREG_TO_REG (i32 0), VR128X:$src1, sub_xmm)))), sub_ymm)>;
-
-def : Pat<(v16f32 (int_x86_avx512_mask_cvtdq2ps_512 (v16i32 VR512:$src),
- (bc_v16f32 (v16i32 immAllZerosV)), (i16 -1), imm:$rc)),
- (VCVTDQ2PSZrrb VR512:$src, imm:$rc)>;
-def : Pat<(v8f64 (int_x86_avx512_mask_cvtdq2pd_512 (v8i32 VR256X:$src),
- (bc_v8f64 (v16i32 immAllZerosV)), (i8 -1))),
- (VCVTDQ2PDZrr VR256X:$src)>;
-def : Pat<(v16f32 (int_x86_avx512_mask_cvtudq2ps_512 (v16i32 VR512:$src),
- (bc_v16f32 (v16i32 immAllZerosV)), (i16 -1), imm:$rc)),
- (VCVTUDQ2PSZrrb VR512:$src, imm:$rc)>;
-def : Pat<(v8f64 (int_x86_avx512_mask_cvtudq2pd_512 (v8i32 VR256X:$src),
- (bc_v8f64 (v16i32 immAllZerosV)), (i8 -1))),
- (VCVTUDQ2PDZrr VR256X:$src)>;
-
-multiclass avx512_vcvt_fp2int<bits<8> opc, string asm, RegisterClass SrcRC,
- RegisterClass DstRC, PatFrag mem_frag,
- X86MemOperand x86memop, Domain d> {
-let hasSideEffects = 0 in {
- def rr : AVX512PI<opc, MRMSrcReg, (outs DstRC:$dst), (ins SrcRC:$src),
- !strconcat(asm,"\t{$src, $dst|$dst, $src}"),
- [], d>, EVEX;
- def rrb : AVX512PI<opc, MRMSrcReg, (outs DstRC:$dst), (ins SrcRC:$src, AVX512RC:$rc),
- !strconcat(asm,"\t{$rc, $src, $dst|$dst, $src, $rc}"),
- [], d>, EVEX, EVEX_B, EVEX_RC;
- let mayLoad = 1 in
- def rm : AVX512PI<opc, MRMSrcMem, (outs DstRC:$dst), (ins x86memop:$src),
- !strconcat(asm,"\t{$src, $dst|$dst, $src}"),
- [], d>, EVEX;
-} // hasSideEffects = 0
}
-defm VCVTPS2DQZ : avx512_vcvt_fp2int<0x5B, "vcvtps2dq", VR512, VR512,
- loadv16f32, f512mem, SSEPackedSingle>, PD,
- EVEX_V512, EVEX_CD8<32, CD8VF>;
-defm VCVTPD2DQZ : avx512_vcvt_fp2int<0xE6, "vcvtpd2dq", VR512, VR256X,
- loadv8f64, f512mem, SSEPackedDouble>, XD, VEX_W,
- EVEX_V512, EVEX_CD8<64, CD8VF>;
-
-def : Pat <(v16i32 (int_x86_avx512_mask_cvtps2dq_512 (v16f32 VR512:$src),
- (v16i32 immAllZerosV), (i16 -1), imm:$rc)),
- (VCVTPS2DQZrrb VR512:$src, imm:$rc)>;
-
-def : Pat <(v8i32 (int_x86_avx512_mask_cvtpd2dq_512 (v8f64 VR512:$src),
- (v8i32 immAllZerosV), (i8 -1), imm:$rc)),
- (VCVTPD2DQZrrb VR512:$src, imm:$rc)>;
-
-defm VCVTPS2UDQZ : avx512_vcvt_fp2int<0x79, "vcvtps2udq", VR512, VR512,
- loadv16f32, f512mem, SSEPackedSingle>,
- PS, EVEX_V512, EVEX_CD8<32, CD8VF>;
-defm VCVTPD2UDQZ : avx512_vcvt_fp2int<0x79, "vcvtpd2udq", VR512, VR256X,
- loadv8f64, f512mem, SSEPackedDouble>, VEX_W,
- PS, EVEX_V512, EVEX_CD8<64, CD8VF>;
-
-def : Pat <(v16i32 (int_x86_avx512_mask_cvtps2udq_512 (v16f32 VR512:$src),
- (v16i32 immAllZerosV), (i16 -1), imm:$rc)),
- (VCVTPS2UDQZrrb VR512:$src, imm:$rc)>;
-
-def : Pat <(v8i32 (int_x86_avx512_mask_cvtpd2udq_512 (v8f64 VR512:$src),
- (v8i32 immAllZerosV), (i8 -1), imm:$rc)),
- (VCVTPD2UDQZrrb VR512:$src, imm:$rc)>;
-
let Predicates = [HasAVX512] in {
def : Pat<(v8f32 (fround (loadv8f64 addr:$src))),
(VCVTPD2PSZrm addr:$src)>;
diff --git a/lib/Target/X86/X86InstrControl.td b/lib/Target/X86/X86InstrControl.td
index 6ab961f04ecf..4cd5563ce727 100644
--- a/lib/Target/X86/X86InstrControl.td
+++ b/lib/Target/X86/X86InstrControl.td
@@ -105,14 +105,16 @@ let isBranch = 1, isTerminator = 1, hasSideEffects = 0, SchedRW = [WriteJump] in
// jecxz.
let Uses = [CX] in
def JCXZ : Ii8PCRel<0xE3, RawFrm, (outs), (ins brtarget8:$dst),
- "jcxz\t$dst", [], IIC_JCXZ>, AdSize16;
+ "jcxz\t$dst", [], IIC_JCXZ>, AdSize16,
+ Requires<[Not64BitMode]>;
let Uses = [ECX] in
def JECXZ : Ii8PCRel<0xE3, RawFrm, (outs), (ins brtarget8:$dst),
"jecxz\t$dst", [], IIC_JCXZ>, AdSize32;
let Uses = [RCX] in
def JRCXZ : Ii8PCRel<0xE3, RawFrm, (outs), (ins brtarget8:$dst),
- "jrcxz\t$dst", [], IIC_JCXZ>, AdSize64;
+ "jrcxz\t$dst", [], IIC_JCXZ>, AdSize64,
+ Requires<[In64BitMode]>;
}
// Indirect branches
diff --git a/lib/Target/X86/X86InstrFragmentsSIMD.td b/lib/Target/X86/X86InstrFragmentsSIMD.td
index fe245c3a7e38..1f61ffa84e9a 100644
--- a/lib/Target/X86/X86InstrFragmentsSIMD.td
+++ b/lib/Target/X86/X86InstrFragmentsSIMD.td
@@ -39,11 +39,6 @@ def SDTX86VFCMP : SDTypeProfile<1, 3, [SDTCisInt<0>, SDTCisSameAs<1, 2>,
SDTCisFP<1>, SDTCisVT<3, i8>,
SDTCisVec<1>]>;
-def X86umin : SDNode<"X86ISD::UMIN", SDTIntBinOp>;
-def X86umax : SDNode<"X86ISD::UMAX", SDTIntBinOp>;
-def X86smin : SDNode<"X86ISD::SMIN", SDTIntBinOp>;
-def X86smax : SDNode<"X86ISD::SMAX", SDTIntBinOp>;
-
def X86fmin : SDNode<"X86ISD::FMIN", SDTFPBinOp>;
def X86fmax : SDNode<"X86ISD::FMAX", SDTFPBinOp>;
@@ -75,6 +70,9 @@ def X86cmps : SDNode<"X86ISD::FSETCC", SDTX86Cmps>;
def X86cvtdq2pd: SDNode<"X86ISD::CVTDQ2PD",
SDTypeProfile<1, 1, [SDTCisVT<0, v2f64>,
SDTCisVT<1, v4i32>]>>;
+def X86cvtudq2pd: SDNode<"X86ISD::CVTUDQ2PD",
+ SDTypeProfile<1, 1, [SDTCisVT<0, v2f64>,
+ SDTCisVT<1, v4i32>]>>;
def X86pshufb : SDNode<"X86ISD::PSHUFB",
SDTypeProfile<1, 2, [SDTCisVec<0>, SDTCisSameAs<0,1>,
SDTCisSameAs<0,2>]>>;
@@ -187,6 +185,7 @@ def X86addus : SDNode<"X86ISD::ADDUS", SDTIntBinOp>;
def X86subus : SDNode<"X86ISD::SUBUS", SDTIntBinOp>;
def X86adds : SDNode<"X86ISD::ADDS", SDTIntBinOp>;
def X86subs : SDNode<"X86ISD::SUBS", SDTIntBinOp>;
+def X86mulhrs  : SDNode<"X86ISD::MULHRS", SDTIntBinOp>;
def X86avg : SDNode<"X86ISD::AVG" , SDTIntBinOp>;
def X86ptest : SDNode<"X86ISD::PTEST", SDTX86CmpPTest>;
def X86testp : SDNode<"X86ISD::TESTP", SDTX86CmpPTest>;
@@ -208,6 +207,14 @@ def X86pmuldq : SDNode<"X86ISD::PMULDQ",
SDTypeProfile<1, 2, [SDTCisVec<0>, SDTCisVec<1>,
SDTCisSameAs<1,2>]>>;
+def X86extrqi : SDNode<"X86ISD::EXTRQI",
+ SDTypeProfile<1, 3, [SDTCisVT<0, v2i64>, SDTCisSameAs<0,1>,
+ SDTCisVT<2, i8>, SDTCisVT<3, i8>]>>;
+def X86insertqi : SDNode<"X86ISD::INSERTQI",
+ SDTypeProfile<1, 4, [SDTCisVT<0, v2i64>, SDTCisSameAs<0,1>,
+ SDTCisSameAs<1,2>, SDTCisVT<3, i8>,
+ SDTCisVT<4, i8>]>>;
+
// Specific shuffle nodes - At some point ISD::VECTOR_SHUFFLE will always get
// translated into one of the target nodes below during lowering.
// Note: this is a work in progress...
@@ -357,8 +364,70 @@ def X86expand : SDNode<"X86ISD::EXPAND", SDTypeProfile<1, 1,
def SDTintToFPRound: SDTypeProfile<1, 3, [SDTCisVec<0>, SDTCisFP<0>,
SDTCisSameAs<0,1>, SDTCisInt<2>, SDTCisInt<3>]>;
-def X86SintToFpRnd : SDNode<"X86ISD::SINT_TO_FP_RND", SDTintToFPRound>;
-def X86SuintToFpRnd : SDNode<"X86ISD::UINT_TO_FP_RND", SDTintToFPRound>;
+def SDTDoubleToInt: SDTypeProfile<1, 1, [SDTCisVec<0>, SDTCisVec<1>,
+ SDTCisInt<0>, SDTCVecEltisVT<1, f64>]>;
+def SDTFloatToInt: SDTypeProfile<1, 1, [SDTCisVec<0>, SDTCisVec<1>,
+ SDTCisInt<0>, SDTCVecEltisVT<1, f32>]>;
+
+def SDTDoubleToIntRnd: SDTypeProfile<1, 2, [SDTCisVec<0>, SDTCisVec<1>,
+ SDTCisInt<0>, SDTCVecEltisVT<1, f64>]>;
+def SDTFloatToIntRnd: SDTypeProfile<1, 2, [SDTCisVec<0>, SDTCisVec<1>,
+ SDTCisInt<0>, SDTCVecEltisVT<1, f32>]>;
+
+def SDTVintToFPRound: SDTypeProfile<1, 2, [SDTCisVec<0>, SDTCisVec<1>,
+ SDTCisFP<0>, SDTCVecEltisVT<1, i32>,
+ SDTCisInt<2>]>;
+def SDTVlongToFPRound: SDTypeProfile<1, 2, [SDTCisVec<0>, SDTCisVec<1>,
+ SDTCisFP<0>, SDTCVecEltisVT<1, i64>,
+ SDTCisInt<2>]>;
+
+def SDTVFPToIntRound: SDTypeProfile<1, 2, [SDTCisVec<0>, SDTCisVec<1>,
+ SDTCisFP<1>, SDTCVecEltisVT<0, i32>,
+ SDTCisInt<2>]>;
+def SDTVFPToLongRound: SDTypeProfile<1, 2, [SDTCisVec<0>, SDTCisVec<1>,
+ SDTCisFP<1>, SDTCVecEltisVT<0, i64>,
+ SDTCisInt<2>]>;
+
+// Scalar
+def X86SintToFpRnd : SDNode<"X86ISD::SINT_TO_FP_RND", SDTintToFPRound>;
+def X86UintToFpRnd : SDNode<"X86ISD::UINT_TO_FP_RND", SDTintToFPRound>;
+
+// Vector with rounding mode
+
+// cvtt fp-to-int nodes
+def X86VFpToSintRnd : SDNode<"ISD::FP_TO_SINT", SDTVFPToIntRound>;
+def X86VFpToUintRnd : SDNode<"ISD::FP_TO_UINT", SDTVFPToIntRound>;
+def X86VFpToSlongRnd : SDNode<"ISD::FP_TO_SINT", SDTVFPToLongRound>;
+def X86VFpToUlongRnd : SDNode<"ISD::FP_TO_UINT", SDTVFPToLongRound>;
+
+def X86VSintToFpRnd : SDNode<"ISD::SINT_TO_FP", SDTVintToFPRound>;
+def X86VUintToFpRnd : SDNode<"ISD::UINT_TO_FP", SDTVintToFPRound>;
+def X86VSlongToFpRnd : SDNode<"ISD::SINT_TO_FP", SDTVlongToFPRound>;
+def X86VUlongToFpRnd : SDNode<"ISD::UINT_TO_FP", SDTVlongToFPRound>;
+
+// cvt fp-to-int nodes
+def X86cvtps2IntRnd : SDNode<"X86ISD::FP_TO_SINT_RND", SDTFloatToIntRnd>;
+def X86cvtps2UIntRnd : SDNode<"X86ISD::FP_TO_UINT_RND", SDTFloatToIntRnd>;
+def X86cvtpd2IntRnd : SDNode<"X86ISD::FP_TO_SINT_RND", SDTDoubleToIntRnd>;
+def X86cvtpd2UIntRnd : SDNode<"X86ISD::FP_TO_UINT_RND", SDTDoubleToIntRnd>;
+
+// Vector without rounding mode
+def X86cvtps2Int : SDNode<"X86ISD::FP_TO_SINT_RND", SDTFloatToInt>;
+def X86cvtps2UInt : SDNode<"X86ISD::FP_TO_UINT_RND", SDTFloatToInt>;
+def X86cvtpd2Int : SDNode<"X86ISD::FP_TO_SINT_RND", SDTDoubleToInt>;
+def X86cvtpd2UInt : SDNode<"X86ISD::FP_TO_UINT_RND", SDTDoubleToInt>;
+
+def X86vfpextRnd : SDNode<"X86ISD::VFPEXT",
+ SDTypeProfile<1, 2, [SDTCisVec<0>, SDTCisVec<1>,
+ SDTCisFP<0>, SDTCisFP<1>,
+ SDTCisOpSmallerThanOp<1, 0>,
+ SDTCisInt<2>]>>;
+def X86vfproundRnd: SDNode<"X86ISD::VFPROUND",
+ SDTypeProfile<1, 2, [SDTCisVec<0>, SDTCisVec<1>,
+ SDTCisFP<0>, SDTCisFP<1>,
+ SDTCVecEltisVT<0, f32>,
+ SDTCVecEltisVT<1, f64>,
+ SDTCisInt<2>]>>;
//===----------------------------------------------------------------------===//
// SSE Complex Patterns
diff --git a/lib/Target/X86/X86InstrInfo.cpp b/lib/Target/X86/X86InstrInfo.cpp
index b92ba99fb100..786150760b93 100644
--- a/lib/Target/X86/X86InstrInfo.cpp
+++ b/lib/Target/X86/X86InstrInfo.cpp
@@ -269,14 +269,11 @@ X86InstrInfo::X86InstrInfo(X86Subtarget &STI)
{ X86::XOR8rr, X86::XOR8mr, 0 }
};
- for (unsigned i = 0, e = array_lengthof(MemoryFoldTable2Addr); i != e; ++i) {
- unsigned RegOp = MemoryFoldTable2Addr[i].RegOp;
- unsigned MemOp = MemoryFoldTable2Addr[i].MemOp;
- unsigned Flags = MemoryFoldTable2Addr[i].Flags;
+ for (X86MemoryFoldTableEntry Entry : MemoryFoldTable2Addr) {
AddTableEntry(RegOp2MemOpTable2Addr, MemOp2RegOpTable,
- RegOp, MemOp,
+ Entry.RegOp, Entry.MemOp,
// Index 0, folded load and store, no alignment requirement.
- Flags | TB_INDEX_0 | TB_FOLDED_LOAD | TB_FOLDED_STORE);
+ Entry.Flags | TB_INDEX_0 | TB_FOLDED_LOAD | TB_FOLDED_STORE);
}
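The remaining fold-table initializers below get the same treatment: the index/array_lengthof loops become range-based for loops over the static entry arrays. A minimal standalone sketch of the idiom (the struct and table are made up for illustration, not the real X86 fold-table data):

#include <cstdio>

struct FoldEntry { unsigned RegOp, MemOp, Flags; };

static const FoldEntry Table[] = {
  {1, 2, 0x10},
  {3, 4, 0x20},
};

int main() {
  for (const FoldEntry &Entry : Table)  // no manual index, no array_lengthof
    std::printf("%u -> %u (flags 0x%x)\n", Entry.RegOp, Entry.MemOp, Entry.Flags);
  return 0;
}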
static const X86MemoryFoldTableEntry MemoryFoldTable0[] = {
@@ -424,12 +421,9 @@ X86InstrInfo::X86InstrInfo(X86Subtarget &STI)
{ X86::VCVTPS2PHYrr, X86::VCVTPS2PHYmr, TB_FOLDED_STORE }
};
- for (unsigned i = 0, e = array_lengthof(MemoryFoldTable0); i != e; ++i) {
- unsigned RegOp = MemoryFoldTable0[i].RegOp;
- unsigned MemOp = MemoryFoldTable0[i].MemOp;
- unsigned Flags = MemoryFoldTable0[i].Flags;
+ for (X86MemoryFoldTableEntry Entry : MemoryFoldTable0) {
AddTableEntry(RegOp2MemOpTable0, MemOp2RegOpTable,
- RegOp, MemOp, TB_INDEX_0 | Flags);
+ Entry.RegOp, Entry.MemOp, TB_INDEX_0 | Entry.Flags);
}
static const X86MemoryFoldTableEntry MemoryFoldTable1[] = {
@@ -862,14 +856,11 @@ X86InstrInfo::X86InstrInfo(X86Subtarget &STI)
{ X86::VAESKEYGENASSIST128rr, X86::VAESKEYGENASSIST128rm, 0 }
};
- for (unsigned i = 0, e = array_lengthof(MemoryFoldTable1); i != e; ++i) {
- unsigned RegOp = MemoryFoldTable1[i].RegOp;
- unsigned MemOp = MemoryFoldTable1[i].MemOp;
- unsigned Flags = MemoryFoldTable1[i].Flags;
+ for (X86MemoryFoldTableEntry Entry : MemoryFoldTable1) {
AddTableEntry(RegOp2MemOpTable1, MemOp2RegOpTable,
- RegOp, MemOp,
+ Entry.RegOp, Entry.MemOp,
// Index 1, folded load
- Flags | TB_INDEX_1 | TB_FOLDED_LOAD);
+ Entry.Flags | TB_INDEX_1 | TB_FOLDED_LOAD);
}
static const X86MemoryFoldTableEntry MemoryFoldTable2[] = {
@@ -1116,6 +1107,8 @@ X86InstrInfo::X86InstrInfo(X86Subtarget &STI)
{ X86::PUNPCKLQDQrr, X86::PUNPCKLQDQrm, TB_ALIGN_16 },
{ X86::PUNPCKLWDrr, X86::PUNPCKLWDrm, TB_ALIGN_16 },
{ X86::PXORrr, X86::PXORrm, TB_ALIGN_16 },
+ { X86::ROUNDSDr, X86::ROUNDSDm, 0 },
+ { X86::ROUNDSSr, X86::ROUNDSSm, 0 },
{ X86::SBB32rr, X86::SBB32rm, 0 },
{ X86::SBB64rr, X86::SBB64rm, 0 },
{ X86::SHUFPDrri, X86::SHUFPDrmi, TB_ALIGN_16 },
@@ -1412,6 +1405,8 @@ X86InstrInfo::X86InstrInfo(X86Subtarget &STI)
{ X86::VPUNPCKLQDQrr, X86::VPUNPCKLQDQrm, 0 },
{ X86::VPUNPCKLWDrr, X86::VPUNPCKLWDrm, 0 },
{ X86::VPXORrr, X86::VPXORrm, 0 },
+ { X86::VROUNDSDr, X86::VROUNDSDm, 0 },
+ { X86::VROUNDSSr, X86::VROUNDSSm, 0 },
{ X86::VSHUFPDrri, X86::VSHUFPDrmi, 0 },
{ X86::VSHUFPSrri, X86::VSHUFPSrmi, 0 },
{ X86::VSUBPDrr, X86::VSUBPDrm, 0 },
@@ -1733,14 +1728,11 @@ X86InstrInfo::X86InstrInfo(X86Subtarget &STI)
{ X86::SHA256RNDS2rr, X86::SHA256RNDS2rm, TB_ALIGN_16 }
};
- for (unsigned i = 0, e = array_lengthof(MemoryFoldTable2); i != e; ++i) {
- unsigned RegOp = MemoryFoldTable2[i].RegOp;
- unsigned MemOp = MemoryFoldTable2[i].MemOp;
- unsigned Flags = MemoryFoldTable2[i].Flags;
+ for (X86MemoryFoldTableEntry Entry : MemoryFoldTable2) {
AddTableEntry(RegOp2MemOpTable2, MemOp2RegOpTable,
- RegOp, MemOp,
+ Entry.RegOp, Entry.MemOp,
// Index 2, folded load
- Flags | TB_INDEX_2 | TB_FOLDED_LOAD);
+ Entry.Flags | TB_INDEX_2 | TB_FOLDED_LOAD);
}
static const X86MemoryFoldTableEntry MemoryFoldTable3[] = {
@@ -1949,14 +1941,11 @@ X86InstrInfo::X86InstrInfo(X86Subtarget &STI)
{ X86::VMAXPDZ128rrkz, X86::VMAXPDZ128rmkz, 0 }
};
- for (unsigned i = 0, e = array_lengthof(MemoryFoldTable3); i != e; ++i) {
- unsigned RegOp = MemoryFoldTable3[i].RegOp;
- unsigned MemOp = MemoryFoldTable3[i].MemOp;
- unsigned Flags = MemoryFoldTable3[i].Flags;
+ for (X86MemoryFoldTableEntry Entry : MemoryFoldTable3) {
AddTableEntry(RegOp2MemOpTable3, MemOp2RegOpTable,
- RegOp, MemOp,
+ Entry.RegOp, Entry.MemOp,
// Index 3, folded load
- Flags | TB_INDEX_3 | TB_FOLDED_LOAD);
+ Entry.Flags | TB_INDEX_3 | TB_FOLDED_LOAD);
}
static const X86MemoryFoldTableEntry MemoryFoldTable4[] = {
@@ -2001,14 +1990,11 @@ X86InstrInfo::X86InstrInfo(X86Subtarget &STI)
{ X86::VMAXPDZ128rrk, X86::VMAXPDZ128rmk, 0 }
};
- for (unsigned i = 0, e = array_lengthof(MemoryFoldTable4); i != e; ++i) {
- unsigned RegOp = MemoryFoldTable4[i].RegOp;
- unsigned MemOp = MemoryFoldTable4[i].MemOp;
- unsigned Flags = MemoryFoldTable4[i].Flags;
+ for (X86MemoryFoldTableEntry Entry : MemoryFoldTable4) {
AddTableEntry(RegOp2MemOpTable4, MemOp2RegOpTable,
- RegOp, MemOp,
+ Entry.RegOp, Entry.MemOp,
// Index 4, folded load
- Flags | TB_INDEX_4 | TB_FOLDED_LOAD);
+ Entry.Flags | TB_INDEX_4 | TB_FOLDED_LOAD);
}
}
@@ -3820,7 +3806,7 @@ static unsigned CopyToFromAsymmetricReg(unsigned DestReg, unsigned SrcReg,
X86::MOVPQIto64rr);
if (X86::VR64RegClass.contains(SrcReg))
// Copy from a VR64 register to a GR64 register.
- return X86::MOVSDto64rr;
+ return X86::MMX_MOVD64from64rr;
} else if (X86::GR64RegClass.contains(SrcReg)) {
// Copy from a GR64 register to a VR128 register.
if (X86::VR128XRegClass.contains(DestReg))
@@ -3828,7 +3814,7 @@ static unsigned CopyToFromAsymmetricReg(unsigned DestReg, unsigned SrcReg,
X86::MOV64toPQIrr);
// Copy from a GR64 register to a VR64 register.
if (X86::VR64RegClass.contains(DestReg))
- return X86::MOV64toSDrr;
+ return X86::MMX_MOVD64to64rr;
}
// SrcReg(FR32) -> DestReg(GR32)
@@ -6413,22 +6399,40 @@ static bool hasReassocSibling(const MachineInstr &Inst, bool &Commuted) {
hasVirtualRegDefsInBasicBlock(*MI1, MBB) &&
MRI.hasOneNonDBGUse(MI1->getOperand(0).getReg()))
return true;
-
+
return false;
}
+// TODO: There are many more machine instruction opcodes to match:
+// 1. Other data types (integer, vectors)
+// 2. Other math / logic operations (and, or)
+static bool isAssociativeAndCommutative(unsigned Opcode) {
+ switch (Opcode) {
+ case X86::ADDSDrr:
+ case X86::ADDSSrr:
+ case X86::VADDSDrr:
+ case X86::VADDSSrr:
+ case X86::MULSDrr:
+ case X86::MULSSrr:
+ case X86::VMULSDrr:
+ case X86::VMULSSrr:
+ return true;
+ default:
+ return false;
+ }
+}
+
/// Return true if the input instruction is part of a chain of dependent ops
/// that are suitable for reassociation, otherwise return false.
/// If the instruction's operands must be commuted to have a previous
/// instruction of the same type define the first source operand, Commuted will
/// be set to true.
-static bool isReassocCandidate(const MachineInstr &Inst, unsigned AssocOpcode,
- bool &Commuted) {
- // 1. The instruction must have the correct type.
+static bool isReassocCandidate(const MachineInstr &Inst, bool &Commuted) {
+ // 1. The operation must be associative and commutative.
// 2. The instruction must have virtual register definitions for its
// operands in the same basic block.
- // 3. The instruction must have a reassociatable sibling.
- if (Inst.getOpcode() == AssocOpcode &&
+ // 3. The instruction must have a reassociable sibling.
+ if (isAssociativeAndCommutative(Inst.getOpcode()) &&
hasVirtualRegDefsInBasicBlock(Inst, Inst.getParent()) &&
hasReassocSibling(Inst, Commuted))
return true;
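isAssociativeAndCommutative now gates the machine-combiner reassociation for scalar SSE/AVX FP adds and multiplies. The point of the transform is to rewrite a serial chain (a op x) op y as a op (x op y), so the inner operation no longer waits on the value that defines a. A standalone C++ sketch of the algebraic rewrite (for floating point this is only legal when reassociation is permitted, e.g. under fast-math; the operands below are exact, so both orders agree):

#include <cassert>

int main() {
  double A = 1.0, X = 2.0, Y = 3.0;
  double Serial  = (A + X) + Y;  // original chain: the second add waits on the first
  double Reassoc = A + (X + Y);  // X + Y can execute before A is available
  assert(Serial == Reassoc);
  return 0;
}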
@@ -6455,14 +6459,8 @@ bool X86InstrInfo::getMachineCombinerPatterns(MachineInstr &Root,
// B = A op X (Prev)
// C = B op Y (Root)
- // TODO: There are many more associative instruction types to match:
- // 1. Other forms of scalar FP add (non-AVX)
- // 2. Other data types (double, integer, vectors)
- // 3. Other math / logic operations (mul, and, or)
- unsigned AssocOpcode = X86::VADDSSrr;
-
- bool Commute = false;
- if (isReassocCandidate(Root, AssocOpcode, Commute)) {
+ bool Commute;
+ if (isReassocCandidate(Root, Commute)) {
// We found a sequence of instructions that may be suitable for a
// reassociation of operands to increase ILP. Specify each commutation
// possibility for the Prev instruction in the sequence and let the
@@ -6512,7 +6510,7 @@ static void reassociateOps(MachineInstr &Root, MachineInstr &Prev,
MachineOperand &OpX = Prev.getOperand(OpIdx[Pattern][2]);
MachineOperand &OpY = Root.getOperand(OpIdx[Pattern][3]);
MachineOperand &OpC = Root.getOperand(0);
-
+
unsigned RegA = OpA.getReg();
unsigned RegB = OpB.getReg();
unsigned RegX = OpX.getReg();
@@ -6547,7 +6545,7 @@ static void reassociateOps(MachineInstr &Root, MachineInstr &Prev,
.addReg(RegX, getKillRegState(KillX))
.addReg(RegY, getKillRegState(KillY));
InsInstrs.push_back(MIB1);
-
+
MachineInstrBuilder MIB2 =
BuildMI(*MF, Root.getDebugLoc(), TII->get(Opcode), RegC)
.addReg(RegA, getKillRegState(KillA))
@@ -6579,7 +6577,7 @@ void X86InstrInfo::genAlternativeCodeSequence(
Prev = MRI.getUniqueVRegDef(Root.getOperand(2).getReg());
}
assert(Prev && "Unknown pattern for machine combiner");
-
+
reassociateOps(Root, *Prev, Pattern, InsInstrs, DelInstrs, InstIdxForVirtReg);
return;
}
diff --git a/lib/Target/X86/X86InstrInfo.td b/lib/Target/X86/X86InstrInfo.td
index 6f38cb8eaf33..52bab9c79b45 100644
--- a/lib/Target/X86/X86InstrInfo.td
+++ b/lib/Target/X86/X86InstrInfo.td
@@ -194,7 +194,7 @@ def X86rdpmc : SDNode<"X86ISD::RDPMC_DAG", SDTX86Void,
def X86Wrapper : SDNode<"X86ISD::Wrapper", SDTX86Wrapper>;
def X86WrapperRIP : SDNode<"X86ISD::WrapperRIP", SDTX86Wrapper>;
-def X86RecoverFrameAlloc : SDNode<"ISD::FRAME_ALLOC_RECOVER",
+def X86RecoverFrameAlloc : SDNode<"ISD::LOCAL_RECOVER",
SDTypeProfile<1, 1, [SDTCisSameAs<0, 1>,
SDTCisInt<1>]>>;
@@ -1028,14 +1028,13 @@ def PUSH32rmm: I<0xFF, MRM6m, (outs), (ins i32mem:$src), "push{l}\t$src",[],
IIC_PUSH_MEM>, OpSize32, Requires<[Not64BitMode]>;
def PUSH16i8 : Ii8<0x6a, RawFrm, (outs), (ins i16i8imm:$imm),
- "push{w}\t$imm", [], IIC_PUSH_IMM>, OpSize16,
- Requires<[Not64BitMode]>;
+ "push{w}\t$imm", [], IIC_PUSH_IMM>, OpSize16;
+def PUSHi16 : Ii16<0x68, RawFrm, (outs), (ins i16imm:$imm),
+ "push{w}\t$imm", [], IIC_PUSH_IMM>, OpSize16;
+
def PUSH32i8 : Ii8<0x6a, RawFrm, (outs), (ins i32i8imm:$imm),
"push{l}\t$imm", [], IIC_PUSH_IMM>, OpSize32,
Requires<[Not64BitMode]>;
-def PUSHi16 : Ii16<0x68, RawFrm, (outs), (ins i16imm:$imm),
- "push{w}\t$imm", [], IIC_PUSH_IMM>, OpSize16,
- Requires<[Not64BitMode]>;
def PUSHi32 : Ii32<0x68, RawFrm, (outs), (ins i32imm:$imm),
"push{l}\t$imm", [], IIC_PUSH_IMM>, OpSize32,
Requires<[Not64BitMode]>;
@@ -1081,9 +1080,6 @@ let Defs = [RSP], Uses = [RSP], hasSideEffects = 0, mayStore = 1,
SchedRW = [WriteStore] in {
def PUSH64i8 : Ii8<0x6a, RawFrm, (outs), (ins i64i8imm:$imm),
"push{q}\t$imm", [], IIC_PUSH_IMM>, Requires<[In64BitMode]>;
-def PUSH64i16 : Ii16<0x68, RawFrm, (outs), (ins i16imm:$imm),
- "push{w}\t$imm", [], IIC_PUSH_IMM>, OpSize16,
- Requires<[In64BitMode]>;
def PUSH64i32 : Ii32S<0x68, RawFrm, (outs), (ins i64i32imm:$imm),
"push{q}\t$imm", [], IIC_PUSH_IMM>, OpSize32,
Requires<[In64BitMode]>;
diff --git a/lib/Target/X86/X86InstrSSE.td b/lib/Target/X86/X86InstrSSE.td
index 2a896dfe8aa8..a5ff9edf05a3 100644
--- a/lib/Target/X86/X86InstrSSE.td
+++ b/lib/Target/X86/X86InstrSSE.td
@@ -4035,13 +4035,13 @@ defm PSUBUSB : PDI_binop_all<0xD8, "psubusb", X86subus, v16i8, v32i8,
SSE_INTALU_ITINS_P, 0>;
defm PSUBUSW : PDI_binop_all<0xD9, "psubusw", X86subus, v8i16, v16i16,
SSE_INTALU_ITINS_P, 0>;
-defm PMINUB : PDI_binop_all<0xDA, "pminub", X86umin, v16i8, v32i8,
+defm PMINUB : PDI_binop_all<0xDA, "pminub", umin, v16i8, v32i8,
SSE_INTALU_ITINS_P, 1>;
-defm PMINSW : PDI_binop_all<0xEA, "pminsw", X86smin, v8i16, v16i16,
+defm PMINSW : PDI_binop_all<0xEA, "pminsw", smin, v8i16, v16i16,
SSE_INTALU_ITINS_P, 1>;
-defm PMAXUB : PDI_binop_all<0xDE, "pmaxub", X86umax, v16i8, v32i8,
+defm PMAXUB : PDI_binop_all<0xDE, "pmaxub", umax, v16i8, v32i8,
SSE_INTALU_ITINS_P, 1>;
-defm PMAXSW : PDI_binop_all<0xEE, "pmaxsw", X86smax, v8i16, v16i16,
+defm PMAXSW : PDI_binop_all<0xEE, "pmaxsw", smax, v8i16, v16i16,
SSE_INTALU_ITINS_P, 1>;
// Intrinsic forms
@@ -6834,29 +6834,28 @@ multiclass SS48I_binop_rm2<bits<8> opc, string OpcodeStr, SDNode OpNode,
}
let Predicates = [HasAVX, NoVLX] in {
- let isCommutable = 0 in
- defm VPMINSB : SS48I_binop_rm<0x38, "vpminsb", X86smin, v16i8, VR128,
+ defm VPMINSB : SS48I_binop_rm<0x38, "vpminsb", smin, v16i8, VR128,
loadv2i64, i128mem, 0, SSE_INTALU_ITINS_P>,
VEX_4V;
- defm VPMINSD : SS48I_binop_rm<0x39, "vpminsd", X86smin, v4i32, VR128,
+ defm VPMINSD : SS48I_binop_rm<0x39, "vpminsd", smin, v4i32, VR128,
loadv2i64, i128mem, 0, SSE_INTALU_ITINS_P>,
VEX_4V;
- defm VPMINUD : SS48I_binop_rm<0x3B, "vpminud", X86umin, v4i32, VR128,
+ defm VPMINUD : SS48I_binop_rm<0x3B, "vpminud", umin, v4i32, VR128,
loadv2i64, i128mem, 0, SSE_INTALU_ITINS_P>,
VEX_4V;
- defm VPMINUW : SS48I_binop_rm<0x3A, "vpminuw", X86umin, v8i16, VR128,
+ defm VPMINUW : SS48I_binop_rm<0x3A, "vpminuw", umin, v8i16, VR128,
loadv2i64, i128mem, 0, SSE_INTALU_ITINS_P>,
VEX_4V;
- defm VPMAXSB : SS48I_binop_rm<0x3C, "vpmaxsb", X86smax, v16i8, VR128,
+ defm VPMAXSB : SS48I_binop_rm<0x3C, "vpmaxsb", smax, v16i8, VR128,
loadv2i64, i128mem, 0, SSE_INTALU_ITINS_P>,
VEX_4V;
- defm VPMAXSD : SS48I_binop_rm<0x3D, "vpmaxsd", X86smax, v4i32, VR128,
+ defm VPMAXSD : SS48I_binop_rm<0x3D, "vpmaxsd", smax, v4i32, VR128,
loadv2i64, i128mem, 0, SSE_INTALU_ITINS_P>,
VEX_4V;
- defm VPMAXUD : SS48I_binop_rm<0x3F, "vpmaxud", X86umax, v4i32, VR128,
+ defm VPMAXUD : SS48I_binop_rm<0x3F, "vpmaxud", umax, v4i32, VR128,
loadv2i64, i128mem, 0, SSE_INTALU_ITINS_P>,
VEX_4V;
- defm VPMAXUW : SS48I_binop_rm<0x3E, "vpmaxuw", X86umax, v8i16, VR128,
+ defm VPMAXUW : SS48I_binop_rm<0x3E, "vpmaxuw", umax, v8i16, VR128,
loadv2i64, i128mem, 0, SSE_INTALU_ITINS_P>,
VEX_4V;
defm VPMULDQ : SS48I_binop_rm2<0x28, "vpmuldq", X86pmuldq, v2i64, v4i32,
@@ -6865,29 +6864,28 @@ let Predicates = [HasAVX, NoVLX] in {
}
let Predicates = [HasAVX2, NoVLX] in {
- let isCommutable = 0 in
- defm VPMINSBY : SS48I_binop_rm<0x38, "vpminsb", X86smin, v32i8, VR256,
+ defm VPMINSBY : SS48I_binop_rm<0x38, "vpminsb", smin, v32i8, VR256,
loadv4i64, i256mem, 0, SSE_INTALU_ITINS_P>,
VEX_4V, VEX_L;
- defm VPMINSDY : SS48I_binop_rm<0x39, "vpminsd", X86smin, v8i32, VR256,
+ defm VPMINSDY : SS48I_binop_rm<0x39, "vpminsd", smin, v8i32, VR256,
loadv4i64, i256mem, 0, SSE_INTALU_ITINS_P>,
VEX_4V, VEX_L;
- defm VPMINUDY : SS48I_binop_rm<0x3B, "vpminud", X86umin, v8i32, VR256,
+ defm VPMINUDY : SS48I_binop_rm<0x3B, "vpminud", umin, v8i32, VR256,
loadv4i64, i256mem, 0, SSE_INTALU_ITINS_P>,
VEX_4V, VEX_L;
- defm VPMINUWY : SS48I_binop_rm<0x3A, "vpminuw", X86umin, v16i16, VR256,
+ defm VPMINUWY : SS48I_binop_rm<0x3A, "vpminuw", umin, v16i16, VR256,
loadv4i64, i256mem, 0, SSE_INTALU_ITINS_P>,
VEX_4V, VEX_L;
- defm VPMAXSBY : SS48I_binop_rm<0x3C, "vpmaxsb", X86smax, v32i8, VR256,
+ defm VPMAXSBY : SS48I_binop_rm<0x3C, "vpmaxsb", smax, v32i8, VR256,
loadv4i64, i256mem, 0, SSE_INTALU_ITINS_P>,
VEX_4V, VEX_L;
- defm VPMAXSDY : SS48I_binop_rm<0x3D, "vpmaxsd", X86smax, v8i32, VR256,
+ defm VPMAXSDY : SS48I_binop_rm<0x3D, "vpmaxsd", smax, v8i32, VR256,
loadv4i64, i256mem, 0, SSE_INTALU_ITINS_P>,
VEX_4V, VEX_L;
- defm VPMAXUDY : SS48I_binop_rm<0x3F, "vpmaxud", X86umax, v8i32, VR256,
+ defm VPMAXUDY : SS48I_binop_rm<0x3F, "vpmaxud", umax, v8i32, VR256,
loadv4i64, i256mem, 0, SSE_INTALU_ITINS_P>,
VEX_4V, VEX_L;
- defm VPMAXUWY : SS48I_binop_rm<0x3E, "vpmaxuw", X86umax, v16i16, VR256,
+ defm VPMAXUWY : SS48I_binop_rm<0x3E, "vpmaxuw", umax, v16i16, VR256,
loadv4i64, i256mem, 0, SSE_INTALU_ITINS_P>,
VEX_4V, VEX_L;
defm VPMULDQY : SS48I_binop_rm2<0x28, "vpmuldq", X86pmuldq, v4i64, v8i32,
@@ -6896,22 +6894,21 @@ let Predicates = [HasAVX2, NoVLX] in {
}
let Constraints = "$src1 = $dst" in {
- let isCommutable = 0 in
- defm PMINSB : SS48I_binop_rm<0x38, "pminsb", X86smin, v16i8, VR128,
+ defm PMINSB : SS48I_binop_rm<0x38, "pminsb", smin, v16i8, VR128,
memopv2i64, i128mem, 1, SSE_INTALU_ITINS_P>;
- defm PMINSD : SS48I_binop_rm<0x39, "pminsd", X86smin, v4i32, VR128,
+ defm PMINSD : SS48I_binop_rm<0x39, "pminsd", smin, v4i32, VR128,
memopv2i64, i128mem, 1, SSE_INTALU_ITINS_P>;
- defm PMINUD : SS48I_binop_rm<0x3B, "pminud", X86umin, v4i32, VR128,
+ defm PMINUD : SS48I_binop_rm<0x3B, "pminud", umin, v4i32, VR128,
memopv2i64, i128mem, 1, SSE_INTALU_ITINS_P>;
- defm PMINUW : SS48I_binop_rm<0x3A, "pminuw", X86umin, v8i16, VR128,
+ defm PMINUW : SS48I_binop_rm<0x3A, "pminuw", umin, v8i16, VR128,
memopv2i64, i128mem, 1, SSE_INTALU_ITINS_P>;
- defm PMAXSB : SS48I_binop_rm<0x3C, "pmaxsb", X86smax, v16i8, VR128,
+ defm PMAXSB : SS48I_binop_rm<0x3C, "pmaxsb", smax, v16i8, VR128,
memopv2i64, i128mem, 1, SSE_INTALU_ITINS_P>;
- defm PMAXSD : SS48I_binop_rm<0x3D, "pmaxsd", X86smax, v4i32, VR128,
+ defm PMAXSD : SS48I_binop_rm<0x3D, "pmaxsd", smax, v4i32, VR128,
memopv2i64, i128mem, 1, SSE_INTALU_ITINS_P>;
- defm PMAXUD : SS48I_binop_rm<0x3F, "pmaxud", X86umax, v4i32, VR128,
+ defm PMAXUD : SS48I_binop_rm<0x3F, "pmaxud", umax, v4i32, VR128,
memopv2i64, i128mem, 1, SSE_INTALU_ITINS_P>;
- defm PMAXUW : SS48I_binop_rm<0x3E, "pmaxuw", X86umax, v8i16, VR128,
+ defm PMAXUW : SS48I_binop_rm<0x3E, "pmaxuw", umax, v8i16, VR128,
memopv2i64, i128mem, 1, SSE_INTALU_ITINS_P>;
defm PMULDQ : SS48I_binop_rm2<0x28, "pmuldq", X86pmuldq, v2i64, v4i32,
VR128, memopv2i64, i128mem,
@@ -7773,7 +7770,7 @@ let Constraints = "$src = $dst" in {
def EXTRQI : Ii8<0x78, MRMXr, (outs VR128:$dst),
(ins VR128:$src, u8imm:$len, u8imm:$idx),
"extrq\t{$idx, $len, $src|$src, $len, $idx}",
- [(set VR128:$dst, (int_x86_sse4a_extrqi VR128:$src, imm:$len,
+ [(set VR128:$dst, (X86extrqi VR128:$src, imm:$len,
imm:$idx))]>, PD;
def EXTRQ : I<0x79, MRMSrcReg, (outs VR128:$dst),
(ins VR128:$src, VR128:$mask),
@@ -7784,8 +7781,8 @@ def EXTRQ : I<0x79, MRMSrcReg, (outs VR128:$dst),
def INSERTQI : Ii8<0x78, MRMSrcReg, (outs VR128:$dst),
(ins VR128:$src, VR128:$src2, u8imm:$len, u8imm:$idx),
"insertq\t{$idx, $len, $src2, $src|$src, $src2, $len, $idx}",
- [(set VR128:$dst, (int_x86_sse4a_insertqi VR128:$src,
- VR128:$src2, imm:$len, imm:$idx))]>, XD;
+ [(set VR128:$dst, (X86insertqi VR128:$src, VR128:$src2,
+ imm:$len, imm:$idx))]>, XD;
def INSERTQ : I<0x79, MRMSrcReg, (outs VR128:$dst),
(ins VR128:$src, VR128:$mask),
"insertq\t{$mask, $src|$src, $mask}",
diff --git a/lib/Target/X86/X86IntrinsicsInfo.h b/lib/Target/X86/X86IntrinsicsInfo.h
index 61a33484b8bf..2c8b95bcba22 100644
--- a/lib/Target/X86/X86IntrinsicsInfo.h
+++ b/lib/Target/X86/X86IntrinsicsInfo.h
@@ -19,7 +19,7 @@ namespace llvm {
enum IntrinsicType {
INTR_NO_TYPE,
GATHER, SCATTER, PREFETCH, RDSEED, RDRAND, RDPMC, RDTSC, XTEST, ADX,
- INTR_TYPE_1OP, INTR_TYPE_2OP, INTR_TYPE_3OP,
+ INTR_TYPE_1OP, INTR_TYPE_2OP, INTR_TYPE_3OP, INTR_TYPE_4OP,
CMP_MASK, CMP_MASK_CC, VSHIFT, VSHIFT_MASK, COMI,
INTR_TYPE_1OP_MASK, INTR_TYPE_1OP_MASK_RM, INTR_TYPE_2OP_MASK, INTR_TYPE_2OP_MASK_RM,
INTR_TYPE_3OP_MASK, FMA_OP_MASK, FMA_OP_MASKZ, FMA_OP_MASK3, VPERM_3OP_MASK,
@@ -213,18 +213,18 @@ static const IntrinsicData IntrinsicsWithoutChain[] = {
X86_INTRINSIC_DATA(avx2_phadd_w, INTR_TYPE_2OP, X86ISD::HADD, 0),
X86_INTRINSIC_DATA(avx2_phsub_d, INTR_TYPE_2OP, X86ISD::HSUB, 0),
X86_INTRINSIC_DATA(avx2_phsub_w, INTR_TYPE_2OP, X86ISD::HSUB, 0),
- X86_INTRINSIC_DATA(avx2_pmaxs_b, INTR_TYPE_2OP, X86ISD::SMAX, 0),
- X86_INTRINSIC_DATA(avx2_pmaxs_d, INTR_TYPE_2OP, X86ISD::SMAX, 0),
- X86_INTRINSIC_DATA(avx2_pmaxs_w, INTR_TYPE_2OP, X86ISD::SMAX, 0),
- X86_INTRINSIC_DATA(avx2_pmaxu_b, INTR_TYPE_2OP, X86ISD::UMAX, 0),
- X86_INTRINSIC_DATA(avx2_pmaxu_d, INTR_TYPE_2OP, X86ISD::UMAX, 0),
- X86_INTRINSIC_DATA(avx2_pmaxu_w, INTR_TYPE_2OP, X86ISD::UMAX, 0),
- X86_INTRINSIC_DATA(avx2_pmins_b, INTR_TYPE_2OP, X86ISD::SMIN, 0),
- X86_INTRINSIC_DATA(avx2_pmins_d, INTR_TYPE_2OP, X86ISD::SMIN, 0),
- X86_INTRINSIC_DATA(avx2_pmins_w, INTR_TYPE_2OP, X86ISD::SMIN, 0),
- X86_INTRINSIC_DATA(avx2_pminu_b, INTR_TYPE_2OP, X86ISD::UMIN, 0),
- X86_INTRINSIC_DATA(avx2_pminu_d, INTR_TYPE_2OP, X86ISD::UMIN, 0),
- X86_INTRINSIC_DATA(avx2_pminu_w, INTR_TYPE_2OP, X86ISD::UMIN, 0),
+ X86_INTRINSIC_DATA(avx2_pmaxs_b, INTR_TYPE_2OP, ISD::SMAX, 0),
+ X86_INTRINSIC_DATA(avx2_pmaxs_d, INTR_TYPE_2OP, ISD::SMAX, 0),
+ X86_INTRINSIC_DATA(avx2_pmaxs_w, INTR_TYPE_2OP, ISD::SMAX, 0),
+ X86_INTRINSIC_DATA(avx2_pmaxu_b, INTR_TYPE_2OP, ISD::UMAX, 0),
+ X86_INTRINSIC_DATA(avx2_pmaxu_d, INTR_TYPE_2OP, ISD::UMAX, 0),
+ X86_INTRINSIC_DATA(avx2_pmaxu_w, INTR_TYPE_2OP, ISD::UMAX, 0),
+ X86_INTRINSIC_DATA(avx2_pmins_b, INTR_TYPE_2OP, ISD::SMIN, 0),
+ X86_INTRINSIC_DATA(avx2_pmins_d, INTR_TYPE_2OP, ISD::SMIN, 0),
+ X86_INTRINSIC_DATA(avx2_pmins_w, INTR_TYPE_2OP, ISD::SMIN, 0),
+ X86_INTRINSIC_DATA(avx2_pminu_b, INTR_TYPE_2OP, ISD::UMIN, 0),
+ X86_INTRINSIC_DATA(avx2_pminu_d, INTR_TYPE_2OP, ISD::UMIN, 0),
+ X86_INTRINSIC_DATA(avx2_pminu_w, INTR_TYPE_2OP, ISD::UMIN, 0),
X86_INTRINSIC_DATA(avx2_pmovsxbd, INTR_TYPE_1OP, X86ISD::VSEXT, 0),
X86_INTRINSIC_DATA(avx2_pmovsxbq, INTR_TYPE_1OP, X86ISD::VSEXT, 0),
X86_INTRINSIC_DATA(avx2_pmovsxbw, INTR_TYPE_1OP, X86ISD::VSEXT, 0),
@@ -596,60 +596,69 @@ static const IntrinsicData IntrinsicsWithoutChain[] = {
X86_INTRINSIC_DATA(avx512_mask_pcmpgt_w_128, CMP_MASK, X86ISD::PCMPGTM, 0),
X86_INTRINSIC_DATA(avx512_mask_pcmpgt_w_256, CMP_MASK, X86ISD::PCMPGTM, 0),
X86_INTRINSIC_DATA(avx512_mask_pcmpgt_w_512, CMP_MASK, X86ISD::PCMPGTM, 0),
- X86_INTRINSIC_DATA(avx512_mask_pmaxs_b_128, INTR_TYPE_2OP_MASK, X86ISD::SMAX, 0),
- X86_INTRINSIC_DATA(avx512_mask_pmaxs_b_256, INTR_TYPE_2OP_MASK, X86ISD::SMAX, 0),
- X86_INTRINSIC_DATA(avx512_mask_pmaxs_b_512, INTR_TYPE_2OP_MASK, X86ISD::SMAX, 0),
- X86_INTRINSIC_DATA(avx512_mask_pmaxs_d_128, INTR_TYPE_2OP_MASK, X86ISD::SMAX, 0),
- X86_INTRINSIC_DATA(avx512_mask_pmaxs_d_256, INTR_TYPE_2OP_MASK, X86ISD::SMAX, 0),
- X86_INTRINSIC_DATA(avx512_mask_pmaxs_d_512, INTR_TYPE_2OP_MASK, X86ISD::SMAX, 0),
- X86_INTRINSIC_DATA(avx512_mask_pmaxs_q_128, INTR_TYPE_2OP_MASK, X86ISD::SMAX, 0),
- X86_INTRINSIC_DATA(avx512_mask_pmaxs_q_256, INTR_TYPE_2OP_MASK, X86ISD::SMAX, 0),
- X86_INTRINSIC_DATA(avx512_mask_pmaxs_q_512, INTR_TYPE_2OP_MASK, X86ISD::SMAX, 0),
- X86_INTRINSIC_DATA(avx512_mask_pmaxs_w_128, INTR_TYPE_2OP_MASK, X86ISD::SMAX, 0),
- X86_INTRINSIC_DATA(avx512_mask_pmaxs_w_256, INTR_TYPE_2OP_MASK, X86ISD::SMAX, 0),
- X86_INTRINSIC_DATA(avx512_mask_pmaxs_w_512, INTR_TYPE_2OP_MASK, X86ISD::SMAX, 0),
- X86_INTRINSIC_DATA(avx512_mask_pmaxu_b_128, INTR_TYPE_2OP_MASK, X86ISD::UMAX, 0),
- X86_INTRINSIC_DATA(avx512_mask_pmaxu_b_256, INTR_TYPE_2OP_MASK, X86ISD::UMAX, 0),
- X86_INTRINSIC_DATA(avx512_mask_pmaxu_b_512, INTR_TYPE_2OP_MASK, X86ISD::UMAX, 0),
- X86_INTRINSIC_DATA(avx512_mask_pmaxu_d_128, INTR_TYPE_2OP_MASK, X86ISD::UMAX, 0),
- X86_INTRINSIC_DATA(avx512_mask_pmaxu_d_256, INTR_TYPE_2OP_MASK, X86ISD::UMAX, 0),
- X86_INTRINSIC_DATA(avx512_mask_pmaxu_d_512, INTR_TYPE_2OP_MASK, X86ISD::UMAX, 0),
- X86_INTRINSIC_DATA(avx512_mask_pmaxu_q_128, INTR_TYPE_2OP_MASK, X86ISD::UMAX, 0),
- X86_INTRINSIC_DATA(avx512_mask_pmaxu_q_256, INTR_TYPE_2OP_MASK, X86ISD::UMAX, 0),
- X86_INTRINSIC_DATA(avx512_mask_pmaxu_q_512, INTR_TYPE_2OP_MASK, X86ISD::UMAX, 0),
- X86_INTRINSIC_DATA(avx512_mask_pmaxu_w_128, INTR_TYPE_2OP_MASK, X86ISD::UMAX, 0),
- X86_INTRINSIC_DATA(avx512_mask_pmaxu_w_256, INTR_TYPE_2OP_MASK, X86ISD::UMAX, 0),
- X86_INTRINSIC_DATA(avx512_mask_pmaxu_w_512, INTR_TYPE_2OP_MASK, X86ISD::UMAX, 0),
- X86_INTRINSIC_DATA(avx512_mask_pmins_b_128, INTR_TYPE_2OP_MASK, X86ISD::SMIN, 0),
- X86_INTRINSIC_DATA(avx512_mask_pmins_b_256, INTR_TYPE_2OP_MASK, X86ISD::SMIN, 0),
- X86_INTRINSIC_DATA(avx512_mask_pmins_b_512, INTR_TYPE_2OP_MASK, X86ISD::SMIN, 0),
- X86_INTRINSIC_DATA(avx512_mask_pmins_d_128, INTR_TYPE_2OP_MASK, X86ISD::SMIN, 0),
- X86_INTRINSIC_DATA(avx512_mask_pmins_d_256, INTR_TYPE_2OP_MASK, X86ISD::SMIN, 0),
- X86_INTRINSIC_DATA(avx512_mask_pmins_d_512, INTR_TYPE_2OP_MASK, X86ISD::SMIN, 0),
- X86_INTRINSIC_DATA(avx512_mask_pmins_q_128, INTR_TYPE_2OP_MASK, X86ISD::SMIN, 0),
- X86_INTRINSIC_DATA(avx512_mask_pmins_q_256, INTR_TYPE_2OP_MASK, X86ISD::SMIN, 0),
- X86_INTRINSIC_DATA(avx512_mask_pmins_q_512, INTR_TYPE_2OP_MASK, X86ISD::SMIN, 0),
- X86_INTRINSIC_DATA(avx512_mask_pmins_w_128, INTR_TYPE_2OP_MASK, X86ISD::SMIN, 0),
- X86_INTRINSIC_DATA(avx512_mask_pmins_w_256, INTR_TYPE_2OP_MASK, X86ISD::SMIN, 0),
- X86_INTRINSIC_DATA(avx512_mask_pmins_w_512, INTR_TYPE_2OP_MASK, X86ISD::SMIN, 0),
- X86_INTRINSIC_DATA(avx512_mask_pminu_b_128, INTR_TYPE_2OP_MASK, X86ISD::UMIN, 0),
- X86_INTRINSIC_DATA(avx512_mask_pminu_b_256, INTR_TYPE_2OP_MASK, X86ISD::UMIN, 0),
- X86_INTRINSIC_DATA(avx512_mask_pminu_b_512, INTR_TYPE_2OP_MASK, X86ISD::UMIN, 0),
- X86_INTRINSIC_DATA(avx512_mask_pminu_d_128, INTR_TYPE_2OP_MASK, X86ISD::UMIN, 0),
- X86_INTRINSIC_DATA(avx512_mask_pminu_d_256, INTR_TYPE_2OP_MASK, X86ISD::UMIN, 0),
- X86_INTRINSIC_DATA(avx512_mask_pminu_d_512, INTR_TYPE_2OP_MASK, X86ISD::UMIN, 0),
- X86_INTRINSIC_DATA(avx512_mask_pminu_q_128, INTR_TYPE_2OP_MASK, X86ISD::UMIN, 0),
- X86_INTRINSIC_DATA(avx512_mask_pminu_q_256, INTR_TYPE_2OP_MASK, X86ISD::UMIN, 0),
- X86_INTRINSIC_DATA(avx512_mask_pminu_q_512, INTR_TYPE_2OP_MASK, X86ISD::UMIN, 0),
- X86_INTRINSIC_DATA(avx512_mask_pminu_w_128, INTR_TYPE_2OP_MASK, X86ISD::UMIN, 0),
- X86_INTRINSIC_DATA(avx512_mask_pminu_w_256, INTR_TYPE_2OP_MASK, X86ISD::UMIN, 0),
- X86_INTRINSIC_DATA(avx512_mask_pminu_w_512, INTR_TYPE_2OP_MASK, X86ISD::UMIN, 0),
+ X86_INTRINSIC_DATA(avx512_mask_pmaxs_b_128, INTR_TYPE_2OP_MASK, ISD::SMAX, 0),
+ X86_INTRINSIC_DATA(avx512_mask_pmaxs_b_256, INTR_TYPE_2OP_MASK, ISD::SMAX, 0),
+ X86_INTRINSIC_DATA(avx512_mask_pmaxs_b_512, INTR_TYPE_2OP_MASK, ISD::SMAX, 0),
+ X86_INTRINSIC_DATA(avx512_mask_pmaxs_d_128, INTR_TYPE_2OP_MASK, ISD::SMAX, 0),
+ X86_INTRINSIC_DATA(avx512_mask_pmaxs_d_256, INTR_TYPE_2OP_MASK, ISD::SMAX, 0),
+ X86_INTRINSIC_DATA(avx512_mask_pmaxs_d_512, INTR_TYPE_2OP_MASK, ISD::SMAX, 0),
+ X86_INTRINSIC_DATA(avx512_mask_pmaxs_q_128, INTR_TYPE_2OP_MASK, ISD::SMAX, 0),
+ X86_INTRINSIC_DATA(avx512_mask_pmaxs_q_256, INTR_TYPE_2OP_MASK, ISD::SMAX, 0),
+ X86_INTRINSIC_DATA(avx512_mask_pmaxs_q_512, INTR_TYPE_2OP_MASK, ISD::SMAX, 0),
+ X86_INTRINSIC_DATA(avx512_mask_pmaxs_w_128, INTR_TYPE_2OP_MASK, ISD::SMAX, 0),
+ X86_INTRINSIC_DATA(avx512_mask_pmaxs_w_256, INTR_TYPE_2OP_MASK, ISD::SMAX, 0),
+ X86_INTRINSIC_DATA(avx512_mask_pmaxs_w_512, INTR_TYPE_2OP_MASK, ISD::SMAX, 0),
+ X86_INTRINSIC_DATA(avx512_mask_pmaxu_b_128, INTR_TYPE_2OP_MASK, ISD::UMAX, 0),
+ X86_INTRINSIC_DATA(avx512_mask_pmaxu_b_256, INTR_TYPE_2OP_MASK, ISD::UMAX, 0),
+ X86_INTRINSIC_DATA(avx512_mask_pmaxu_b_512, INTR_TYPE_2OP_MASK, ISD::UMAX, 0),
+ X86_INTRINSIC_DATA(avx512_mask_pmaxu_d_128, INTR_TYPE_2OP_MASK, ISD::UMAX, 0),
+ X86_INTRINSIC_DATA(avx512_mask_pmaxu_d_256, INTR_TYPE_2OP_MASK, ISD::UMAX, 0),
+ X86_INTRINSIC_DATA(avx512_mask_pmaxu_d_512, INTR_TYPE_2OP_MASK, ISD::UMAX, 0),
+ X86_INTRINSIC_DATA(avx512_mask_pmaxu_q_128, INTR_TYPE_2OP_MASK, ISD::UMAX, 0),
+ X86_INTRINSIC_DATA(avx512_mask_pmaxu_q_256, INTR_TYPE_2OP_MASK, ISD::UMAX, 0),
+ X86_INTRINSIC_DATA(avx512_mask_pmaxu_q_512, INTR_TYPE_2OP_MASK, ISD::UMAX, 0),
+ X86_INTRINSIC_DATA(avx512_mask_pmaxu_w_128, INTR_TYPE_2OP_MASK, ISD::UMAX, 0),
+ X86_INTRINSIC_DATA(avx512_mask_pmaxu_w_256, INTR_TYPE_2OP_MASK, ISD::UMAX, 0),
+ X86_INTRINSIC_DATA(avx512_mask_pmaxu_w_512, INTR_TYPE_2OP_MASK, ISD::UMAX, 0),
+ X86_INTRINSIC_DATA(avx512_mask_pmins_b_128, INTR_TYPE_2OP_MASK, ISD::SMIN, 0),
+ X86_INTRINSIC_DATA(avx512_mask_pmins_b_256, INTR_TYPE_2OP_MASK, ISD::SMIN, 0),
+ X86_INTRINSIC_DATA(avx512_mask_pmins_b_512, INTR_TYPE_2OP_MASK, ISD::SMIN, 0),
+ X86_INTRINSIC_DATA(avx512_mask_pmins_d_128, INTR_TYPE_2OP_MASK, ISD::SMIN, 0),
+ X86_INTRINSIC_DATA(avx512_mask_pmins_d_256, INTR_TYPE_2OP_MASK, ISD::SMIN, 0),
+ X86_INTRINSIC_DATA(avx512_mask_pmins_d_512, INTR_TYPE_2OP_MASK, ISD::SMIN, 0),
+ X86_INTRINSIC_DATA(avx512_mask_pmins_q_128, INTR_TYPE_2OP_MASK, ISD::SMIN, 0),
+ X86_INTRINSIC_DATA(avx512_mask_pmins_q_256, INTR_TYPE_2OP_MASK, ISD::SMIN, 0),
+ X86_INTRINSIC_DATA(avx512_mask_pmins_q_512, INTR_TYPE_2OP_MASK, ISD::SMIN, 0),
+ X86_INTRINSIC_DATA(avx512_mask_pmins_w_128, INTR_TYPE_2OP_MASK, ISD::SMIN, 0),
+ X86_INTRINSIC_DATA(avx512_mask_pmins_w_256, INTR_TYPE_2OP_MASK, ISD::SMIN, 0),
+ X86_INTRINSIC_DATA(avx512_mask_pmins_w_512, INTR_TYPE_2OP_MASK, ISD::SMIN, 0),
+ X86_INTRINSIC_DATA(avx512_mask_pminu_b_128, INTR_TYPE_2OP_MASK, ISD::UMIN, 0),
+ X86_INTRINSIC_DATA(avx512_mask_pminu_b_256, INTR_TYPE_2OP_MASK, ISD::UMIN, 0),
+ X86_INTRINSIC_DATA(avx512_mask_pminu_b_512, INTR_TYPE_2OP_MASK, ISD::UMIN, 0),
+ X86_INTRINSIC_DATA(avx512_mask_pminu_d_128, INTR_TYPE_2OP_MASK, ISD::UMIN, 0),
+ X86_INTRINSIC_DATA(avx512_mask_pminu_d_256, INTR_TYPE_2OP_MASK, ISD::UMIN, 0),
+ X86_INTRINSIC_DATA(avx512_mask_pminu_d_512, INTR_TYPE_2OP_MASK, ISD::UMIN, 0),
+ X86_INTRINSIC_DATA(avx512_mask_pminu_q_128, INTR_TYPE_2OP_MASK, ISD::UMIN, 0),
+ X86_INTRINSIC_DATA(avx512_mask_pminu_q_256, INTR_TYPE_2OP_MASK, ISD::UMIN, 0),
+ X86_INTRINSIC_DATA(avx512_mask_pminu_q_512, INTR_TYPE_2OP_MASK, ISD::UMIN, 0),
+ X86_INTRINSIC_DATA(avx512_mask_pminu_w_128, INTR_TYPE_2OP_MASK, ISD::UMIN, 0),
+ X86_INTRINSIC_DATA(avx512_mask_pminu_w_256, INTR_TYPE_2OP_MASK, ISD::UMIN, 0),
+ X86_INTRINSIC_DATA(avx512_mask_pminu_w_512, INTR_TYPE_2OP_MASK, ISD::UMIN, 0),
X86_INTRINSIC_DATA(avx512_mask_pmul_dq_128, INTR_TYPE_2OP_MASK,
X86ISD::PMULDQ, 0),
X86_INTRINSIC_DATA(avx512_mask_pmul_dq_256, INTR_TYPE_2OP_MASK,
X86ISD::PMULDQ, 0),
X86_INTRINSIC_DATA(avx512_mask_pmul_dq_512, INTR_TYPE_2OP_MASK,
X86ISD::PMULDQ, 0),
+ X86_INTRINSIC_DATA(avx512_mask_pmul_hr_sw_128, INTR_TYPE_2OP_MASK, X86ISD::MULHRS, 0),
+ X86_INTRINSIC_DATA(avx512_mask_pmul_hr_sw_256, INTR_TYPE_2OP_MASK, X86ISD::MULHRS, 0),
+ X86_INTRINSIC_DATA(avx512_mask_pmul_hr_sw_512, INTR_TYPE_2OP_MASK, X86ISD::MULHRS, 0),
+ X86_INTRINSIC_DATA(avx512_mask_pmulh_w_128, INTR_TYPE_2OP_MASK, ISD::MULHS, 0),
+ X86_INTRINSIC_DATA(avx512_mask_pmulh_w_256, INTR_TYPE_2OP_MASK, ISD::MULHS, 0),
+ X86_INTRINSIC_DATA(avx512_mask_pmulh_w_512, INTR_TYPE_2OP_MASK, ISD::MULHS, 0),
+ X86_INTRINSIC_DATA(avx512_mask_pmulhu_w_128, INTR_TYPE_2OP_MASK, ISD::MULHU, 0),
+ X86_INTRINSIC_DATA(avx512_mask_pmulhu_w_256, INTR_TYPE_2OP_MASK, ISD::MULHU, 0),
+ X86_INTRINSIC_DATA(avx512_mask_pmulhu_w_512, INTR_TYPE_2OP_MASK, ISD::MULHU, 0),
X86_INTRINSIC_DATA(avx512_mask_pmull_d_128, INTR_TYPE_2OP_MASK, ISD::MUL, 0),
X86_INTRINSIC_DATA(avx512_mask_pmull_d_256, INTR_TYPE_2OP_MASK, ISD::MUL, 0),
X86_INTRINSIC_DATA(avx512_mask_pmull_d_512, INTR_TYPE_2OP_MASK, ISD::MUL, 0),
@@ -1008,10 +1017,10 @@ static const IntrinsicData IntrinsicsWithoutChain[] = {
X86_INTRINSIC_DATA(sse2_packssdw_128, INTR_TYPE_2OP, X86ISD::PACKSS, 0),
X86_INTRINSIC_DATA(sse2_packsswb_128, INTR_TYPE_2OP, X86ISD::PACKSS, 0),
X86_INTRINSIC_DATA(sse2_packuswb_128, INTR_TYPE_2OP, X86ISD::PACKUS, 0),
- X86_INTRINSIC_DATA(sse2_pmaxs_w, INTR_TYPE_2OP, X86ISD::SMAX, 0),
- X86_INTRINSIC_DATA(sse2_pmaxu_b, INTR_TYPE_2OP, X86ISD::UMAX, 0),
- X86_INTRINSIC_DATA(sse2_pmins_w, INTR_TYPE_2OP, X86ISD::SMIN, 0),
- X86_INTRINSIC_DATA(sse2_pminu_b, INTR_TYPE_2OP, X86ISD::UMIN, 0),
+ X86_INTRINSIC_DATA(sse2_pmaxs_w, INTR_TYPE_2OP, ISD::SMAX, 0),
+ X86_INTRINSIC_DATA(sse2_pmaxu_b, INTR_TYPE_2OP, ISD::UMAX, 0),
+ X86_INTRINSIC_DATA(sse2_pmins_w, INTR_TYPE_2OP, ISD::SMIN, 0),
+ X86_INTRINSIC_DATA(sse2_pminu_b, INTR_TYPE_2OP, ISD::UMIN, 0),
X86_INTRINSIC_DATA(sse2_pmulh_w, INTR_TYPE_2OP, ISD::MULHS, 0),
X86_INTRINSIC_DATA(sse2_pmulhu_w, INTR_TYPE_2OP, ISD::MULHU, 0),
X86_INTRINSIC_DATA(sse2_pmulu_dq, INTR_TYPE_2OP, X86ISD::PMULUDQ, 0),
@@ -1049,14 +1058,14 @@ static const IntrinsicData IntrinsicsWithoutChain[] = {
X86_INTRINSIC_DATA(sse3_hsub_ps, INTR_TYPE_2OP, X86ISD::FHSUB, 0),
X86_INTRINSIC_DATA(sse41_insertps, INTR_TYPE_3OP, X86ISD::INSERTPS, 0),
X86_INTRINSIC_DATA(sse41_packusdw, INTR_TYPE_2OP, X86ISD::PACKUS, 0),
- X86_INTRINSIC_DATA(sse41_pmaxsb, INTR_TYPE_2OP, X86ISD::SMAX, 0),
- X86_INTRINSIC_DATA(sse41_pmaxsd, INTR_TYPE_2OP, X86ISD::SMAX, 0),
- X86_INTRINSIC_DATA(sse41_pmaxud, INTR_TYPE_2OP, X86ISD::UMAX, 0),
- X86_INTRINSIC_DATA(sse41_pmaxuw, INTR_TYPE_2OP, X86ISD::UMAX, 0),
- X86_INTRINSIC_DATA(sse41_pminsb, INTR_TYPE_2OP, X86ISD::SMIN, 0),
- X86_INTRINSIC_DATA(sse41_pminsd, INTR_TYPE_2OP, X86ISD::SMIN, 0),
- X86_INTRINSIC_DATA(sse41_pminud, INTR_TYPE_2OP, X86ISD::UMIN, 0),
- X86_INTRINSIC_DATA(sse41_pminuw, INTR_TYPE_2OP, X86ISD::UMIN, 0),
+ X86_INTRINSIC_DATA(sse41_pmaxsb, INTR_TYPE_2OP, ISD::SMAX, 0),
+ X86_INTRINSIC_DATA(sse41_pmaxsd, INTR_TYPE_2OP, ISD::SMAX, 0),
+ X86_INTRINSIC_DATA(sse41_pmaxud, INTR_TYPE_2OP, ISD::UMAX, 0),
+ X86_INTRINSIC_DATA(sse41_pmaxuw, INTR_TYPE_2OP, ISD::UMAX, 0),
+ X86_INTRINSIC_DATA(sse41_pminsb, INTR_TYPE_2OP, ISD::SMIN, 0),
+ X86_INTRINSIC_DATA(sse41_pminsd, INTR_TYPE_2OP, ISD::SMIN, 0),
+ X86_INTRINSIC_DATA(sse41_pminud, INTR_TYPE_2OP, ISD::UMIN, 0),
+ X86_INTRINSIC_DATA(sse41_pminuw, INTR_TYPE_2OP, ISD::UMIN, 0),
X86_INTRINSIC_DATA(sse41_pmovsxbd, INTR_TYPE_1OP, X86ISD::VSEXT, 0),
X86_INTRINSIC_DATA(sse41_pmovsxbq, INTR_TYPE_1OP, X86ISD::VSEXT, 0),
X86_INTRINSIC_DATA(sse41_pmovsxbw, INTR_TYPE_1OP, X86ISD::VSEXT, 0),
@@ -1070,6 +1079,8 @@ static const IntrinsicData IntrinsicsWithoutChain[] = {
X86_INTRINSIC_DATA(sse41_pmovzxwd, INTR_TYPE_1OP, X86ISD::VZEXT, 0),
X86_INTRINSIC_DATA(sse41_pmovzxwq, INTR_TYPE_1OP, X86ISD::VZEXT, 0),
X86_INTRINSIC_DATA(sse41_pmuldq, INTR_TYPE_2OP, X86ISD::PMULDQ, 0),
+ X86_INTRINSIC_DATA(sse4a_extrqi, INTR_TYPE_3OP, X86ISD::EXTRQI, 0),
+ X86_INTRINSIC_DATA(sse4a_insertqi, INTR_TYPE_4OP, X86ISD::INSERTQI, 0),
X86_INTRINSIC_DATA(sse_comieq_ss, COMI, X86ISD::COMI, ISD::SETEQ),
X86_INTRINSIC_DATA(sse_comige_ss, COMI, X86ISD::COMI, ISD::SETGE),
X86_INTRINSIC_DATA(sse_comigt_ss, COMI, X86ISD::COMI, ISD::SETGT),
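
The table above maps intrinsic ids to a lowering kind and a DAG opcode; the change points the min/max rows at the target-independent ISD opcodes and introduces INTR_TYPE_4OP for the four-operand sse4a insertqi form. A minimal standalone sketch of how such a sorted table is typically searched; the names, enums, and lookup helper below are made up for illustration, not the real X86 helpers:

#include <algorithm>
#include <cstdio>
#include <cstring>
#include <iterator>

// Hypothetical mirror of a table-driven intrinsic map: each row pairs an
// intrinsic name with a lowering kind and a DAG opcode.
enum Kind { Intr2Op, Intr3Op, Intr4Op };
enum Opcode { SMin, SMax, UMin, UMax, ExtrQI, InsertQI };

struct Row { const char *Name; Kind K; Opcode Op; };

static const Row Table[] = {        // kept sorted by Name
    {"sse41_pmaxsb", Intr2Op, SMax},
    {"sse41_pminud", Intr2Op, UMin},
    {"sse4a_extrqi", Intr3Op, ExtrQI},
    {"sse4a_insertqi", Intr4Op, InsertQI},
};

static const Row *lookup(const char *Name) {
  auto It = std::lower_bound(std::begin(Table), std::end(Table), Name,
                             [](const Row &R, const char *N) {
                               return std::strcmp(R.Name, N) < 0;
                             });
  if (It != std::end(Table) && std::strcmp(It->Name, Name) == 0)
    return It;
  return nullptr;   // unknown intrinsic: handled elsewhere
}

int main() {
  if (const Row *R = lookup("sse4a_insertqi"))
    std::printf("%s -> kind %d, opcode %d\n", R->Name, R->K, R->Op);
}
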
diff --git a/lib/Target/X86/X86MachineFunctionInfo.h b/lib/Target/X86/X86MachineFunctionInfo.h
index d598b55aae3e..e6db9708b677 100644
--- a/lib/Target/X86/X86MachineFunctionInfo.h
+++ b/lib/Target/X86/X86MachineFunctionInfo.h
@@ -30,59 +30,67 @@ class X86MachineFunctionInfo : public MachineFunctionInfo {
/// pointer for reasons other than it containing dynamic allocation or
/// that FP elimination is turned off. For example, Cygwin main function
/// that FP elimination is turned off. For example, Cygwin main function
/// contains stack pointer re-alignment code which requires FP.
- bool ForceFramePointer;
+ bool ForceFramePointer = false;
/// RestoreBasePointerOffset - Non-zero if the function has base pointer
/// and makes call to llvm.eh.sjlj.setjmp. When non-zero, the value is a
/// displacement from the frame pointer to a slot where the base pointer
/// is stashed.
- signed char RestoreBasePointerOffset;
+ signed char RestoreBasePointerOffset = 0;
/// CalleeSavedFrameSize - Size of the callee-saved register portion of the
/// stack frame in bytes.
- unsigned CalleeSavedFrameSize;
+ unsigned CalleeSavedFrameSize = 0;
/// BytesToPopOnReturn - Number of bytes function pops on return (in addition
/// to the space used by the return address).
/// Used on windows platform for stdcall & fastcall name decoration
- unsigned BytesToPopOnReturn;
+ unsigned BytesToPopOnReturn = 0;
/// ReturnAddrIndex - FrameIndex for return slot.
- int ReturnAddrIndex;
+ int ReturnAddrIndex = 0;
/// \brief FrameIndex for return slot.
- int FrameAddrIndex;
+ int FrameAddrIndex = 0;
/// TailCallReturnAddrDelta - The number of bytes by which return address
/// stack slot is moved as the result of tail call optimization.
- int TailCallReturnAddrDelta;
+ int TailCallReturnAddrDelta = 0;
/// SRetReturnReg - Some subtargets require that sret lowering includes
/// returning the value of the returned struct in a register. This field
/// holds the virtual register into which the sret argument is passed.
- unsigned SRetReturnReg;
+ unsigned SRetReturnReg = 0;
/// GlobalBaseReg - keeps track of the virtual register initialized for
/// use as the global base register. This is used for PIC in some PIC
/// relocation models.
- unsigned GlobalBaseReg;
+ unsigned GlobalBaseReg = 0;
/// VarArgsFrameIndex - FrameIndex for start of varargs area.
- int VarArgsFrameIndex;
+ int VarArgsFrameIndex = 0;
/// RegSaveFrameIndex - X86-64 vararg func register save area.
- int RegSaveFrameIndex;
+ int RegSaveFrameIndex = 0;
/// VarArgsGPOffset - X86-64 vararg func int reg offset.
- unsigned VarArgsGPOffset;
+ unsigned VarArgsGPOffset = 0;
/// VarArgsFPOffset - X86-64 vararg func fp reg offset.
- unsigned VarArgsFPOffset;
+ unsigned VarArgsFPOffset = 0;
/// ArgumentStackSize - The number of bytes on stack consumed by the arguments
/// being passed on the stack.
- unsigned ArgumentStackSize;
+ unsigned ArgumentStackSize = 0;
/// NumLocalDynamics - Number of local-dynamic TLS accesses.
- unsigned NumLocalDynamics;
+ unsigned NumLocalDynamics = 0;
/// HasPushSequences - Keeps track of whether this function uses sequences
/// of pushes to pass function parameters.
- bool HasPushSequences;
+ bool HasPushSequences = false;
+
+ /// True if the function uses llvm.x86.seh.restoreframe, and it needed a spill
+ /// slot for the frame pointer.
+ bool HasSEHFramePtrSave = false;
+
+ /// The frame index of a stack object containing the original frame pointer
+ /// used to address arguments in a function using a base pointer.
+ int SEHFramePtrSaveIndex = 0;
private:
/// ForwardedMustTailRegParms - A list of virtual and physical registers
@@ -90,40 +98,9 @@ private:
SmallVector<ForwardedRegister, 1> ForwardedMustTailRegParms;
public:
- X86MachineFunctionInfo() : ForceFramePointer(false),
- RestoreBasePointerOffset(0),
- CalleeSavedFrameSize(0),
- BytesToPopOnReturn(0),
- ReturnAddrIndex(0),
- FrameAddrIndex(0),
- TailCallReturnAddrDelta(0),
- SRetReturnReg(0),
- GlobalBaseReg(0),
- VarArgsFrameIndex(0),
- RegSaveFrameIndex(0),
- VarArgsGPOffset(0),
- VarArgsFPOffset(0),
- ArgumentStackSize(0),
- NumLocalDynamics(0),
- HasPushSequences(false) {}
-
- explicit X86MachineFunctionInfo(MachineFunction &MF)
- : ForceFramePointer(false),
- RestoreBasePointerOffset(0),
- CalleeSavedFrameSize(0),
- BytesToPopOnReturn(0),
- ReturnAddrIndex(0),
- FrameAddrIndex(0),
- TailCallReturnAddrDelta(0),
- SRetReturnReg(0),
- GlobalBaseReg(0),
- VarArgsFrameIndex(0),
- RegSaveFrameIndex(0),
- VarArgsGPOffset(0),
- VarArgsFPOffset(0),
- ArgumentStackSize(0),
- NumLocalDynamics(0),
- HasPushSequences(false) {}
+ X86MachineFunctionInfo() = default;
+
+ explicit X86MachineFunctionInfo(MachineFunction &MF) {};
bool getForceFramePointer() const { return ForceFramePointer;}
void setForceFramePointer(bool forceFP) { ForceFramePointer = forceFP; }
@@ -174,6 +151,12 @@ public:
unsigned getNumLocalDynamicTLSAccesses() const { return NumLocalDynamics; }
void incNumLocalDynamicTLSAccesses() { ++NumLocalDynamics; }
+ bool getHasSEHFramePtrSave() const { return HasSEHFramePtrSave; }
+ void setHasSEHFramePtrSave(bool V) { HasSEHFramePtrSave = V; }
+
+ int getSEHFramePtrSaveIndex() const { return SEHFramePtrSaveIndex; }
+ void setSEHFramePtrSaveIndex(int Index) { SEHFramePtrSaveIndex = Index; }
+
SmallVectorImpl<ForwardedRegister> &getForwardedMustTailRegParms() {
return ForwardedMustTailRegParms;
}
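
The header change above replaces two constructors that repeated the same long initializer list with in-class default member initializers. A small standalone before/after sketch of the idiom (mock fields, not the real class):

// Old shape: every constructor restates the defaults.
struct CountersOld {
  unsigned CalleeSavedFrameSize;
  bool HasPushSequences;
  CountersOld() : CalleeSavedFrameSize(0), HasPushSequences(false) {}
  explicit CountersOld(int /*MF*/)
      : CalleeSavedFrameSize(0), HasPushSequences(false) {}
};

// New shape: the defaults live next to the members; constructors stay empty.
struct CountersNew {
  unsigned CalleeSavedFrameSize = 0;   // picked up by every constructor
  bool HasPushSequences = false;
  CountersNew() = default;
  explicit CountersNew(int /*MF*/) {}
};

int main() {
  CountersNew C(0);
  return C.CalleeSavedFrameSize + (C.HasPushSequences ? 1 : 0);   // 0
}
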
diff --git a/lib/Target/X86/X86RegisterInfo.cpp b/lib/Target/X86/X86RegisterInfo.cpp
index 0033b5058187..d8495e53e0e3 100644
--- a/lib/Target/X86/X86RegisterInfo.cpp
+++ b/lib/Target/X86/X86RegisterInfo.cpp
@@ -202,7 +202,7 @@ X86RegisterInfo::getCrossCopyRegClass(const TargetRegisterClass *RC) const {
unsigned
X86RegisterInfo::getRegPressureLimit(const TargetRegisterClass *RC,
MachineFunction &MF) const {
- const TargetFrameLowering *TFI = MF.getSubtarget().getFrameLowering();
+ const X86FrameLowering *TFI = getFrameLowering(MF);
unsigned FPDiff = TFI->hasFP(MF) ? 1 : 0;
switch (RC->getID()) {
@@ -343,7 +343,7 @@ X86RegisterInfo::getNoPreservedMask() const {
BitVector X86RegisterInfo::getReservedRegs(const MachineFunction &MF) const {
BitVector Reserved(getNumRegs());
- const TargetFrameLowering *TFI = MF.getSubtarget().getFrameLowering();
+ const X86FrameLowering *TFI = getFrameLowering(MF);
// Set the stack-pointer register and its aliases as reserved.
for (MCSubRegIterator I(X86::RSP, this, /*IncludeSelf=*/true); I.isValid();
@@ -452,7 +452,7 @@ bool X86RegisterInfo::hasBasePointer(const MachineFunction &MF) const {
// use both the SP and the FP, we need a separate base pointer register.
bool CantUseFP = needsStackRealignment(MF);
bool CantUseSP =
- MFI->hasVarSizedObjects() || MFI->hasInlineAsmWithSPAdjust();
+ MFI->hasVarSizedObjects() || MFI->hasOpaqueSPAdjustment();
return CantUseFP && CantUseSP;
}
@@ -477,9 +477,9 @@ bool X86RegisterInfo::canRealignStack(const MachineFunction &MF) const {
bool X86RegisterInfo::needsStackRealignment(const MachineFunction &MF) const {
const MachineFrameInfo *MFI = MF.getFrameInfo();
+ const X86FrameLowering *TFI = getFrameLowering(MF);
const Function *F = MF.getFunction();
- unsigned StackAlign =
- MF.getSubtarget().getFrameLowering()->getStackAlignment();
+ unsigned StackAlign = TFI->getStackAlignment();
bool requiresRealignment = ((MFI->getMaxAlignment() > StackAlign) ||
F->hasFnAttribute(Attribute::StackAlignment));
@@ -503,7 +503,7 @@ X86RegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator II,
RegScavenger *RS) const {
MachineInstr &MI = *II;
MachineFunction &MF = *MI.getParent()->getParent();
- const TargetFrameLowering *TFI = MF.getSubtarget().getFrameLowering();
+ const X86FrameLowering *TFI = getFrameLowering(MF);
int FrameIndex = MI.getOperand(FIOperandNum).getIndex();
unsigned BasePtr;
@@ -519,18 +519,17 @@ X86RegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator II,
else
BasePtr = (TFI->hasFP(MF) ? FramePtr : StackPtr);
- // FRAME_ALLOC uses a single offset, with no register. It only works in the
+ // LOCAL_ESCAPE uses a single offset, with no register. It only works in the
// simple FP case, and doesn't work with stack realignment. On 32-bit, the
// offset is from the traditional base pointer location. On 64-bit, the
// offset is from the SP at the end of the prologue, not the FP location. This
// matches the behavior of llvm.frameaddress.
- if (Opc == TargetOpcode::FRAME_ALLOC) {
+ if (Opc == TargetOpcode::LOCAL_ESCAPE) {
MachineOperand &FI = MI.getOperand(FIOperandNum);
bool IsWinEH = MF.getTarget().getMCAsmInfo()->usesWindowsCFI();
int Offset;
if (IsWinEH)
- Offset = static_cast<const X86FrameLowering *>(TFI)
- ->getFrameIndexOffsetFromSP(MF, FrameIndex);
+ Offset = TFI->getFrameIndexOffsetFromSP(MF, FrameIndex);
else
Offset = TFI->getFrameIndexOffset(MF, FrameIndex);
FI.ChangeToImmediate(Offset);
@@ -584,7 +583,7 @@ X86RegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator II,
}
unsigned X86RegisterInfo::getFrameRegister(const MachineFunction &MF) const {
- const TargetFrameLowering *TFI = MF.getSubtarget().getFrameLowering();
+ const X86FrameLowering *TFI = getFrameLowering(MF);
return TFI->hasFP(MF) ? FramePtr : StackPtr;
}
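
Several call sites above switch from MF.getSubtarget().getFrameLowering() to a getFrameLowering(MF) helper that already returns the X86-specific frame lowering, so the static_cast in eliminateFrameIndex can go away. A standalone sketch of the pattern with mock classes (the free function below is an assumption about the helper's shape, not the real signature):

#include <cstdio>

struct TargetFrameLoweringMock {
  virtual ~TargetFrameLoweringMock() = default;
  virtual int getFrameIndexOffset(int FI) const { return FI * 8; }
};

struct X86FrameLoweringMock : TargetFrameLoweringMock {
  // Derived-only query, previously reachable only through a static_cast.
  int getFrameIndexOffsetFromSP(int FI) const { return FI * 8 + 16; }
};

struct MachineFunctionMock {
  X86FrameLoweringMock FL;
};

// The accessor hands back the derived type directly, so callers can use
// X86-only methods without casting.
static const X86FrameLoweringMock *getFrameLowering(const MachineFunctionMock &MF) {
  return &MF.FL;
}

int main() {
  MachineFunctionMock MF;
  const X86FrameLoweringMock *TFI = getFrameLowering(MF);
  std::printf("%d %d\n", TFI->getFrameIndexOffset(2),
              TFI->getFrameIndexOffsetFromSP(2));
}
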
diff --git a/lib/Target/X86/X86SelectionDAGInfo.cpp b/lib/Target/X86/X86SelectionDAGInfo.cpp
index 5ca40bc0091b..ce79fcf9ad81 100644
--- a/lib/Target/X86/X86SelectionDAGInfo.cpp
+++ b/lib/Target/X86/X86SelectionDAGInfo.cpp
@@ -24,11 +24,6 @@ using namespace llvm;
#define DEBUG_TYPE "x86-selectiondag-info"
-X86SelectionDAGInfo::X86SelectionDAGInfo(const DataLayout &DL)
- : TargetSelectionDAGInfo(&DL) {}
-
-X86SelectionDAGInfo::~X86SelectionDAGInfo() {}
-
bool X86SelectionDAGInfo::isBaseRegConflictPossible(
SelectionDAG &DAG, ArrayRef<unsigned> ClobberSet) const {
// We cannot use TRI->hasBasePointer() until *after* we select all basic
@@ -37,7 +32,7 @@ bool X86SelectionDAGInfo::isBaseRegConflictPossible(
// dynamic stack adjustments (hopefully rare) and the base pointer would
// conflict if we had to use it.
MachineFrameInfo *MFI = DAG.getMachineFunction().getFrameInfo();
- if (!MFI->hasVarSizedObjects() && !MFI->hasInlineAsmWithSPAdjust())
+ if (!MFI->hasVarSizedObjects() && !MFI->hasOpaqueSPAdjustment())
return false;
const X86RegisterInfo *TRI = static_cast<const X86RegisterInfo *>(
@@ -81,8 +76,9 @@ X86SelectionDAGInfo::EmitTargetCodeForMemset(SelectionDAG &DAG, SDLoc dl,
if (const char *bzeroEntry = V &&
V->isNullValue() ? Subtarget.getBZeroEntry() : nullptr) {
- EVT IntPtr = DAG.getTargetLoweringInfo().getPointerTy();
- Type *IntPtrTy = getDataLayout()->getIntPtrType(*DAG.getContext());
+ EVT IntPtr =
+ DAG.getTargetLoweringInfo().getPointerTy(DAG.getDataLayout());
+ Type *IntPtrTy = DAG.getDataLayout().getIntPtrType(*DAG.getContext());
TargetLowering::ArgListTy Args;
TargetLowering::ArgListEntry Entry;
Entry.Node = Dst;
diff --git a/lib/Target/X86/X86SelectionDAGInfo.h b/lib/Target/X86/X86SelectionDAGInfo.h
index eb7e0ed9de6c..961bd8c8d5ef 100644
--- a/lib/Target/X86/X86SelectionDAGInfo.h
+++ b/lib/Target/X86/X86SelectionDAGInfo.h
@@ -29,8 +29,7 @@ class X86SelectionDAGInfo : public TargetSelectionDAGInfo {
ArrayRef<unsigned> ClobberSet) const;
public:
- explicit X86SelectionDAGInfo(const DataLayout &DL);
- ~X86SelectionDAGInfo();
+ explicit X86SelectionDAGInfo() = default;
SDValue EmitTargetCodeForMemset(SelectionDAG &DAG, SDLoc dl,
SDValue Chain,
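
With the two hunks above, X86SelectionDAGInfo no longer caches a DataLayout at construction (the constructor is now defaulted) and instead asks the SelectionDAG for it where needed. A standalone sketch of that refactoring with mock types:

#include <cstdio>

struct DataLayoutMock {
  unsigned PointerSize = 8;
};

struct SelectionDAGMock {
  DataLayoutMock DL;
  const DataLayoutMock &getDataLayout() const { return DL; }
};

// Old shape: the info object stored a layout pointer handed in at
// construction. New shape: default-constructible, and the layout is
// fetched from the DAG at each point of use.
struct SelectionDAGInfoMock {
  SelectionDAGInfoMock() = default;
  unsigned intPtrBits(const SelectionDAGMock &DAG) const {
    return 8 * DAG.getDataLayout().PointerSize;   // queried on demand
  }
};

int main() {
  SelectionDAGMock DAG;
  SelectionDAGInfoMock Info;
  std::printf("%u-bit intptr\n", Info.intPtrBits(DAG));   // 64-bit intptr
}
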
diff --git a/lib/Target/X86/X86Subtarget.cpp b/lib/Target/X86/X86Subtarget.cpp
index 3b25d30dc221..dff3624b7efe 100644
--- a/lib/Target/X86/X86Subtarget.cpp
+++ b/lib/Target/X86/X86Subtarget.cpp
@@ -68,7 +68,7 @@ ClassifyGlobalReference(const GlobalValue *GV, const TargetMachine &TM) const {
if (GV->hasDLLImportStorageClass())
return X86II::MO_DLLIMPORT;
- bool isDecl = GV->isDeclarationForLinker();
+ bool isDef = GV->isStrongDefinitionForLinker();
// X86-64 in PIC mode.
if (isPICStyleRIPRel()) {
@@ -80,8 +80,7 @@ ClassifyGlobalReference(const GlobalValue *GV, const TargetMachine &TM) const {
// If symbol visibility is hidden, the extra load is not needed if
// target is x86-64 or the symbol is definitely defined in the current
// translation unit.
- if (GV->hasDefaultVisibility() &&
- (isDecl || GV->isWeakForLinker()))
+ if (GV->hasDefaultVisibility() && !isDef)
return X86II::MO_GOTPCREL;
} else if (!isTargetWin64()) {
assert(isTargetELF() && "Unknown rip-relative target");
@@ -107,7 +106,7 @@ ClassifyGlobalReference(const GlobalValue *GV, const TargetMachine &TM) const {
// If this is a strong reference to a definition, it is definitely not
// through a stub.
- if (!isDecl && !GV->isWeakForLinker())
+ if (isDef)
return X86II::MO_PIC_BASE_OFFSET;
// Unless we have a symbol with hidden visibility, we have to go through a
@@ -117,7 +116,7 @@ ClassifyGlobalReference(const GlobalValue *GV, const TargetMachine &TM) const {
// If symbol visibility is hidden, we have a stub for common symbol
// references and external declarations.
- if (isDecl || GV->hasCommonLinkage()) {
+ if (GV->isDeclarationForLinker() || GV->hasCommonLinkage()) {
// Hidden $non_lazy_ptr reference.
return X86II::MO_DARWIN_HIDDEN_NONLAZY_PIC_BASE;
}
@@ -131,7 +130,7 @@ ClassifyGlobalReference(const GlobalValue *GV, const TargetMachine &TM) const {
// If this is a strong reference to a definition, it is definitely not
// through a stub.
- if (!isDecl && !GV->isWeakForLinker())
+ if (isDef)
return X86II::MO_NO_FLAG;
// Unless we have a symbol with hidden visibility, we have to go through a
@@ -193,12 +192,9 @@ void X86Subtarget::initSubtargetFeatures(StringRef CPU, StringRef FS) {
FullFS = "+64bit,+sse2";
}
- // If feature string is not empty, parse features string.
+ // Parse features string and set the CPU.
ParseSubtargetFeatures(CPUName, FullFS);
- // Make sure the right MCSchedModel is used.
- InitCPUSchedModel(CPUName);
-
InstrItins = getInstrItineraryForCPU(CPUName);
// It's important to keep the MCSubtargetInfo feature bits in sync with
@@ -298,9 +294,8 @@ X86Subtarget::X86Subtarget(const Triple &TT, const std::string &CPU,
TargetTriple.getEnvironment() != Triple::CODE16),
In16BitMode(TargetTriple.getArch() == Triple::x86 &&
TargetTriple.getEnvironment() == Triple::CODE16),
- TSInfo(*TM.getDataLayout()),
- InstrInfo(initializeSubtargetDependencies(CPU, FS)), TLInfo(TM, *this),
- FrameLowering(*this, getStackAlignment()) {
+ TSInfo(), InstrInfo(initializeSubtargetDependencies(CPU, FS)),
+ TLInfo(TM, *this), FrameLowering(*this, getStackAlignment()) {
// Determine the PICStyle based on the target selected.
if (TM.getRelocationModel() == Reloc::Static) {
// Unless we're in PIC or DynamicNoPIC mode, set the PIC style to None.
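
The classification logic above folds the old "!isDecl && !GV->isWeakForLinker()" test into a single isStrongDefinitionForLinker() query. A standalone model of the predicate, under the assumption that it is simply the conjunction of those two checks:

#include <cassert>

struct GlobalValueMock {
  bool IsDeclarationForLinker = false;
  bool IsWeakForLinker = false;
  // Assumed meaning: defined in this module and not replaceable at link time.
  bool isStrongDefinitionForLinker() const {
    return !IsDeclarationForLinker && !IsWeakForLinker;
  }
};

int main() {
  GlobalValueMock StrongDef;                         // defined, non-weak
  GlobalValueMock WeakDef;  WeakDef.IsWeakForLinker = true;          // may be replaced
  GlobalValueMock Decl;     Decl.IsDeclarationForLinker = true;      // external declaration
  assert(StrongDef.isStrongDefinitionForLinker());
  assert(!WeakDef.isStrongDefinitionForLinker());
  assert(!Decl.isStrongDefinitionForLinker());
  return 0;
}
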
diff --git a/lib/Target/X86/X86Subtarget.h b/lib/Target/X86/X86Subtarget.h
index d420abbe1433..f026d4295f71 100644
--- a/lib/Target/X86/X86Subtarget.h
+++ b/lib/Target/X86/X86Subtarget.h
@@ -447,8 +447,26 @@ public:
}
bool isCallingConvWin64(CallingConv::ID CC) const {
- return (isTargetWin64() && CC != CallingConv::X86_64_SysV) ||
- CC == CallingConv::X86_64_Win64;
+ switch (CC) {
+ // On Win64, all these conventions just use the default convention.
+ case CallingConv::C:
+ case CallingConv::Fast:
+ case CallingConv::X86_FastCall:
+ case CallingConv::X86_StdCall:
+ case CallingConv::X86_ThisCall:
+ case CallingConv::X86_VectorCall:
+ case CallingConv::Intel_OCL_BI:
+ return isTargetWin64();
+ // This convention allows using the Win64 convention on other targets.
+ case CallingConv::X86_64_Win64:
+ return true;
+ // This convention allows using the SysV convention on Windows targets.
+ case CallingConv::X86_64_SysV:
+ return false;
+ // Otherwise, who knows what this is.
+ default:
+ return false;
+ }
}
/// ClassifyGlobalReference - Classify a global variable reference for the
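
isCallingConvWin64 above is rewritten as an explicit switch: the common conventions follow the target default, X86_64_Win64 always uses the Win64 convention, and X86_64_SysV never does. The same logic restated as a standalone function with a mock enum:

#include <cassert>

enum class CC { C, Fast, X86_FastCall, X86_StdCall, X86_ThisCall,
                X86_VectorCall, Intel_OCL_BI, X86_64_Win64, X86_64_SysV,
                SomethingElse };

static bool isCallingConvWin64(CC Conv, bool IsTargetWin64) {
  switch (Conv) {
  case CC::C:
  case CC::Fast:
  case CC::X86_FastCall:
  case CC::X86_StdCall:
  case CC::X86_ThisCall:
  case CC::X86_VectorCall:
  case CC::Intel_OCL_BI:
    return IsTargetWin64;       // default convention of the target
  case CC::X86_64_Win64:
    return true;                // explicit Win64 convention anywhere
  case CC::X86_64_SysV:
    return false;               // explicit SysV convention anywhere
  default:
    return false;               // unknown convention: assume not Win64
  }
}

int main() {
  assert(isCallingConvWin64(CC::C, true));
  assert(!isCallingConvWin64(CC::C, false));
  assert(isCallingConvWin64(CC::X86_64_Win64, false));
  assert(!isCallingConvWin64(CC::X86_64_SysV, true));
  return 0;
}
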
diff --git a/lib/Target/X86/X86TargetTransformInfo.cpp b/lib/Target/X86/X86TargetTransformInfo.cpp
index 0c82a700952b..7df726091843 100644
--- a/lib/Target/X86/X86TargetTransformInfo.cpp
+++ b/lib/Target/X86/X86TargetTransformInfo.cpp
@@ -89,7 +89,7 @@ unsigned X86TTIImpl::getArithmeticInstrCost(
TTI::OperandValueKind Op2Info, TTI::OperandValueProperties Opd1PropInfo,
TTI::OperandValueProperties Opd2PropInfo) {
// Legalize the type.
- std::pair<unsigned, MVT> LT = TLI->getTypeLegalizationCost(Ty);
+ std::pair<unsigned, MVT> LT = TLI->getTypeLegalizationCost(DL, Ty);
int ISD = TLI->InstructionOpcodeToISD(Opcode);
assert(ISD && "Invalid opcode");
@@ -117,6 +117,8 @@ unsigned X86TTIImpl::getArithmeticInstrCost(
static const CostTblEntry<MVT::SimpleValueType>
AVX2UniformConstCostTable[] = {
+ { ISD::SRA, MVT::v4i64, 4 }, // 2 x psrad + shuffle.
+
{ ISD::SDIV, MVT::v16i16, 6 }, // vpmulhw sequence
{ ISD::UDIV, MVT::v16i16, 6 }, // vpmulhuw sequence
{ ISD::SDIV, MVT::v8i32, 15 }, // vpmuldq sequence
@@ -211,6 +213,7 @@ unsigned X86TTIImpl::getArithmeticInstrCost(
{ ISD::SRA, MVT::v16i8, 4 }, // psrlw, pand, pxor, psubb.
{ ISD::SRA, MVT::v8i16, 1 }, // psraw.
{ ISD::SRA, MVT::v4i32, 1 }, // psrad.
+ { ISD::SRA, MVT::v2i64, 4 }, // 2 x psrad + shuffle.
{ ISD::SDIV, MVT::v8i16, 6 }, // pmulhw sequence
{ ISD::UDIV, MVT::v8i16, 6 }, // pmulhuw sequence
@@ -261,12 +264,12 @@ unsigned X86TTIImpl::getArithmeticInstrCost(
{ ISD::SRL, MVT::v16i8, 26 }, // cmpgtb sequence.
{ ISD::SRL, MVT::v8i16, 32 }, // cmpgtb sequence.
- { ISD::SRL, MVT::v4i32, 4*10 }, // Scalarized.
+ { ISD::SRL, MVT::v4i32, 16 }, // Shift each lane + blend.
{ ISD::SRL, MVT::v2i64, 2*10 }, // Scalarized.
{ ISD::SRA, MVT::v16i8, 54 }, // unpacked cmpgtb sequence.
{ ISD::SRA, MVT::v8i16, 32 }, // cmpgtb sequence.
- { ISD::SRA, MVT::v4i32, 4*10 }, // Scalarized.
+ { ISD::SRA, MVT::v4i32, 16 }, // Shift each lane + blend.
{ ISD::SRA, MVT::v2i64, 2*10 }, // Scalarized.
// It is not a good idea to vectorize division. We have to scalarize it and
@@ -352,7 +355,7 @@ unsigned X86TTIImpl::getShuffleCost(TTI::ShuffleKind Kind, Type *Tp, int Index,
return BaseT::getShuffleCost(Kind, Tp, Index, SubTp);
if (Kind == TTI::SK_Reverse) {
- std::pair<unsigned, MVT> LT = TLI->getTypeLegalizationCost(Tp);
+ std::pair<unsigned, MVT> LT = TLI->getTypeLegalizationCost(DL, Tp);
unsigned Cost = 1;
if (LT.second.getSizeInBits() > 128)
Cost = 3; // Extract + insert + copy.
@@ -364,7 +367,7 @@ unsigned X86TTIImpl::getShuffleCost(TTI::ShuffleKind Kind, Type *Tp, int Index,
if (Kind == TTI::SK_Alternate) {
// 64-bit packed float vectors (v2f32) are widened to type v4f32.
// 64-bit packed integer vectors (v2i32) are promoted to type v2i64.
- std::pair<unsigned, MVT> LT = TLI->getTypeLegalizationCost(Tp);
+ std::pair<unsigned, MVT> LT = TLI->getTypeLegalizationCost(DL, Tp);
// The backend knows how to generate a single VEX.256 version of
// instruction VPBLENDW if the target supports AVX2.
@@ -464,8 +467,8 @@ unsigned X86TTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src) {
int ISD = TLI->InstructionOpcodeToISD(Opcode);
assert(ISD && "Invalid opcode");
- std::pair<unsigned, MVT> LTSrc = TLI->getTypeLegalizationCost(Src);
- std::pair<unsigned, MVT> LTDest = TLI->getTypeLegalizationCost(Dst);
+ std::pair<unsigned, MVT> LTSrc = TLI->getTypeLegalizationCost(DL, Src);
+ std::pair<unsigned, MVT> LTDest = TLI->getTypeLegalizationCost(DL, Dst);
static const TypeConversionCostTblEntry<MVT::SimpleValueType>
SSE2ConvTbl[] = {
@@ -537,8 +540,8 @@ unsigned X86TTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src) {
if (Idx != -1)
return AVX512ConversionTbl[Idx].Cost;
}
- EVT SrcTy = TLI->getValueType(Src);
- EVT DstTy = TLI->getValueType(Dst);
+ EVT SrcTy = TLI->getValueType(DL, Src);
+ EVT DstTy = TLI->getValueType(DL, Dst);
// The function getSimpleVT only handles simple value types.
if (!SrcTy.isSimple() || !DstTy.isSimple())
@@ -667,7 +670,7 @@ unsigned X86TTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src) {
unsigned X86TTIImpl::getCmpSelInstrCost(unsigned Opcode, Type *ValTy,
Type *CondTy) {
// Legalize the type.
- std::pair<unsigned, MVT> LT = TLI->getTypeLegalizationCost(ValTy);
+ std::pair<unsigned, MVT> LT = TLI->getTypeLegalizationCost(DL, ValTy);
MVT MTy = LT.second;
@@ -740,7 +743,7 @@ unsigned X86TTIImpl::getVectorInstrCost(unsigned Opcode, Type *Val,
if (Index != -1U) {
// Legalize the type.
- std::pair<unsigned, MVT> LT = TLI->getTypeLegalizationCost(Val);
+ std::pair<unsigned, MVT> LT = TLI->getTypeLegalizationCost(DL, Val);
// This type is legalized to a scalar type.
if (!LT.second.isVector())
@@ -803,7 +806,7 @@ unsigned X86TTIImpl::getMemoryOpCost(unsigned Opcode, Type *Src,
}
// Legalize the type.
- std::pair<unsigned, MVT> LT = TLI->getTypeLegalizationCost(Src);
+ std::pair<unsigned, MVT> LT = TLI->getTypeLegalizationCost(DL, Src);
assert((Opcode == Instruction::Load || Opcode == Instruction::Store) &&
"Invalid Opcode");
@@ -850,9 +853,9 @@ unsigned X86TTIImpl::getMaskedMemoryOpCost(unsigned Opcode, Type *SrcTy,
}
// Legalize the type.
- std::pair<unsigned, MVT> LT = TLI->getTypeLegalizationCost(SrcVTy);
+ std::pair<unsigned, MVT> LT = TLI->getTypeLegalizationCost(DL, SrcVTy);
unsigned Cost = 0;
- if (LT.second != TLI->getValueType(SrcVTy).getSimpleVT() &&
+ if (LT.second != TLI->getValueType(DL, SrcVTy).getSimpleVT() &&
LT.second.getVectorNumElements() == NumElem)
// Promotion requires expand/truncate for data and a shuffle for mask.
Cost += getShuffleCost(TTI::SK_Alternate, SrcVTy, 0, 0) +
@@ -887,7 +890,7 @@ unsigned X86TTIImpl::getAddressComputationCost(Type *Ty, bool IsComplex) {
unsigned X86TTIImpl::getReductionCost(unsigned Opcode, Type *ValTy,
bool IsPairwise) {
- std::pair<unsigned, MVT> LT = TLI->getTypeLegalizationCost(ValTy);
+ std::pair<unsigned, MVT> LT = TLI->getTypeLegalizationCost(DL, ValTy);
MVT MTy = LT.second;
@@ -1117,11 +1120,11 @@ unsigned X86TTIImpl::getIntImmCost(Intrinsic::ID IID, unsigned Idx,
bool X86TTIImpl::isLegalMaskedLoad(Type *DataTy, int Consecutive) {
int DataWidth = DataTy->getPrimitiveSizeInBits();
-
+
// Todo: AVX512 allows gather/scatter, works with strided and random as well
if ((DataWidth < 32) || (Consecutive == 0))
return false;
- if (ST->hasAVX512() || ST->hasAVX2())
+ if (ST->hasAVX512() || ST->hasAVX2())
return true;
return false;
}
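
The cost-model changes above add table rows such as { ISD::SRA, MVT::v2i64, 4 } and thread the DataLayout into getTypeLegalizationCost. A standalone sketch of how such an (opcode, legalized type) cost table is consulted (rows, enums, and the fallback value are illustrative only):

#include <cstdio>

enum Op { SRA, SRL, SDIV };
enum VT { v2i64, v4i32, v4i64 };

struct CostRow { Op O; VT T; unsigned Cost; };

static const CostRow SSE2Table[] = {
    {SRA, v4i32, 1},   // psrad
    {SRA, v2i64, 4},   // 2 x psrad + shuffle (the new entry above)
    {SRL, v4i32, 16},  // shift each lane + blend
};

static int lookupCost(Op O, VT T) {
  for (const CostRow &R : SSE2Table)
    if (R.O == O && R.T == T)
      return static_cast<int>(R.Cost);
  return -1;   // not in the table; caller falls back to a generic estimate
}

int main() {
  unsigned NumLegalizedParts = 1;   // first element of the (cost, MVT) pair
  int RowCost = lookupCost(SRA, v2i64);
  if (RowCost >= 0)
    std::printf("estimated cost: %u\n", NumLegalizedParts * RowCost);   // 4
}
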
diff --git a/lib/Target/X86/X86TargetTransformInfo.h b/lib/Target/X86/X86TargetTransformInfo.h
index a83158440193..da3f36c2e27e 100644
--- a/lib/Target/X86/X86TargetTransformInfo.h
+++ b/lib/Target/X86/X86TargetTransformInfo.h
@@ -40,7 +40,8 @@ class X86TTIImpl : public BasicTTIImplBase<X86TTIImpl> {
public:
explicit X86TTIImpl(const X86TargetMachine *TM, Function &F)
- : BaseT(TM), ST(TM->getSubtargetImpl(F)), TLI(ST->getTargetLowering()) {}
+ : BaseT(TM, F.getParent()->getDataLayout()), ST(TM->getSubtargetImpl(F)),
+ TLI(ST->getTargetLowering()) {}
// Provide value semantics. MSVC requires that we spell all of these out.
X86TTIImpl(const X86TTIImpl &Arg)
@@ -48,18 +49,6 @@ public:
X86TTIImpl(X86TTIImpl &&Arg)
: BaseT(std::move(static_cast<BaseT &>(Arg))), ST(std::move(Arg.ST)),
TLI(std::move(Arg.TLI)) {}
- X86TTIImpl &operator=(const X86TTIImpl &RHS) {
- BaseT::operator=(static_cast<const BaseT &>(RHS));
- ST = RHS.ST;
- TLI = RHS.TLI;
- return *this;
- }
- X86TTIImpl &operator=(X86TTIImpl &&RHS) {
- BaseT::operator=(std::move(static_cast<BaseT &>(RHS)));
- ST = std::move(RHS.ST);
- TLI = std::move(RHS.TLI);
- return *this;
- }
/// \name Scalar TTI Implementations
/// @{
diff --git a/lib/Target/X86/X86WinEHState.cpp b/lib/Target/X86/X86WinEHState.cpp
index 90357257b9ef..9190d0be9e4d 100644
--- a/lib/Target/X86/X86WinEHState.cpp
+++ b/lib/Target/X86/X86WinEHState.cpp
@@ -113,8 +113,8 @@ char WinEHStatePass::ID = 0;
bool WinEHStatePass::doInitialization(Module &M) {
TheModule = &M;
- FrameEscape = Intrinsic::getDeclaration(TheModule, Intrinsic::frameescape);
- FrameRecover = Intrinsic::getDeclaration(TheModule, Intrinsic::framerecover);
+ FrameEscape = Intrinsic::getDeclaration(TheModule, Intrinsic::localescape);
+ FrameRecover = Intrinsic::getDeclaration(TheModule, Intrinsic::localrecover);
FrameAddress = Intrinsic::getDeclaration(TheModule, Intrinsic::frameaddress);
return false;
}
@@ -133,7 +133,7 @@ bool WinEHStatePass::doFinalization(Module &M) {
void WinEHStatePass::getAnalysisUsage(AnalysisUsage &AU) const {
// This pass should only insert a stack allocation, memory accesses, and
- // framerecovers.
+ // localrecovers.
AU.setPreservesCFG();
}
@@ -336,9 +336,11 @@ Function *WinEHStatePass::generateLSDAInEAXThunk(Function *ParentFunc) {
FunctionType *TargetFuncTy =
FunctionType::get(Int32Ty, makeArrayRef(&ArgTys[0], 5),
/*isVarArg=*/false);
- Function *Trampoline = Function::Create(
- TrampolineTy, GlobalValue::InternalLinkage,
- Twine("__ehhandler$") + ParentFunc->getName(), TheModule);
+ Function *Trampoline =
+ Function::Create(TrampolineTy, GlobalValue::InternalLinkage,
+ Twine("__ehhandler$") + GlobalValue::getRealLinkageName(
+ ParentFunc->getName()),
+ TheModule);
BasicBlock *EntryBB = BasicBlock::Create(Context, "entry", Trampoline);
IRBuilder<> Builder(EntryBB);
Value *LSDA = emitEHLSDA(Builder, ParentFunc);
@@ -419,14 +421,14 @@ void WinEHStatePass::addCXXStateStores(Function &F, MachineModuleInfo &MMI) {
}
/// Escape RegNode so that we can access it from child handlers. Find the call
-/// to frameescape, if any, in the entry block and append RegNode to the list
+/// to localescape, if any, in the entry block and append RegNode to the list
/// of arguments.
int WinEHStatePass::escapeRegNode(Function &F) {
- // Find the call to frameescape and extract its arguments.
+ // Find the call to localescape and extract its arguments.
IntrinsicInst *EscapeCall = nullptr;
for (Instruction &I : F.getEntryBlock()) {
IntrinsicInst *II = dyn_cast<IntrinsicInst>(&I);
- if (II && II->getIntrinsicID() == Intrinsic::frameescape) {
+ if (II && II->getIntrinsicID() == Intrinsic::localescape) {
EscapeCall = II;
break;
}
@@ -440,8 +442,10 @@ int WinEHStatePass::escapeRegNode(Function &F) {
// Replace the call (if it exists) with new one. Otherwise, insert at the end
// of the entry block.
- IRBuilder<> Builder(&F.getEntryBlock(),
- EscapeCall ? EscapeCall : F.getEntryBlock().end());
+ Instruction *InsertPt = EscapeCall;
+ if (!EscapeCall)
+ InsertPt = F.getEntryBlock().getTerminator();
+ IRBuilder<> Builder(&F.getEntryBlock(), InsertPt);
Builder.CreateCall(FrameEscape, Args);
if (EscapeCall)
EscapeCall->eraseFromParent();
@@ -520,6 +524,11 @@ void WinEHStatePass::addSEHStateStores(Function &F, MachineModuleInfo &MMI) {
for (auto &Handler : ActionList) {
if (auto *CH = dyn_cast<CatchHandler>(Handler.get())) {
auto *BA = cast<BlockAddress>(CH->getHandlerBlockOrFunc());
+#ifndef NDEBUG
+ for (BasicBlock *Pred : predecessors(BA->getBasicBlock()))
+ assert(Pred->isLandingPad() &&
+ "WinEHPrepare failed to split block");
+#endif
ExceptBlocks.insert(BA->getBasicBlock());
}
}
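
Besides the frameescape/framerecover to localescape/localrecover renames, the pass above grows an assertion-only loop that checks every predecessor of a handler block is a landing pad. A standalone sketch of that debug-only validation pattern:

#include <cassert>
#include <vector>

struct BlockMock {
  bool IsLandingPad = false;
  std::vector<const BlockMock *> Preds;
};

static void recordHandlerBlock(const BlockMock &Handler) {
#ifndef NDEBUG
  // With assertions enabled, verify the structural invariant; in release
  // builds this loop compiles away entirely.
  for (const BlockMock *Pred : Handler.Preds)
    assert(Pred->IsLandingPad && "handler block has a non-landing-pad pred");
#endif
  // ... the real work (inserting the block into a set) would go here ...
}

int main() {
  BlockMock Pad;      Pad.IsLandingPad = true;
  BlockMock Handler;  Handler.Preds.push_back(&Pad);
  recordHandlerBlock(Handler);
  return 0;
}
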
diff --git a/lib/Target/XCore/MCTargetDesc/XCoreMCTargetDesc.cpp b/lib/Target/XCore/MCTargetDesc/XCoreMCTargetDesc.cpp
index ac954d0a8fa4..b4085835f285 100644
--- a/lib/Target/XCore/MCTargetDesc/XCoreMCTargetDesc.cpp
+++ b/lib/Target/XCore/MCTargetDesc/XCoreMCTargetDesc.cpp
@@ -40,7 +40,7 @@ static MCInstrInfo *createXCoreMCInstrInfo() {
return X;
}
-static MCRegisterInfo *createXCoreMCRegisterInfo(StringRef TT) {
+static MCRegisterInfo *createXCoreMCRegisterInfo(const Triple &TT) {
MCRegisterInfo *X = new MCRegisterInfo();
InitXCoreMCRegisterInfo(X, XCore::LR);
return X;
@@ -48,9 +48,7 @@ static MCRegisterInfo *createXCoreMCRegisterInfo(StringRef TT) {
static MCSubtargetInfo *
createXCoreMCSubtargetInfo(const Triple &TT, StringRef CPU, StringRef FS) {
- MCSubtargetInfo *X = new MCSubtargetInfo();
- InitXCoreMCSubtargetInfo(X, TT, CPU, FS);
- return X;
+ return createXCoreMCSubtargetInfoImpl(TT, CPU, FS);
}
static MCAsmInfo *createXCoreMCAsmInfo(const MCRegisterInfo &MRI,
@@ -64,7 +62,8 @@ static MCAsmInfo *createXCoreMCAsmInfo(const MCRegisterInfo &MRI,
return MAI;
}
-static MCCodeGenInfo *createXCoreMCCodeGenInfo(StringRef TT, Reloc::Model RM,
+static MCCodeGenInfo *createXCoreMCCodeGenInfo(const Triple &TT,
+ Reloc::Model RM,
CodeModel::Model CM,
CodeGenOpt::Level OL) {
MCCodeGenInfo *X = new MCCodeGenInfo();
diff --git a/lib/Target/XCore/XCoreFrameLowering.cpp b/lib/Target/XCore/XCoreFrameLowering.cpp
index bd834cc5be4b..76c3d8130e75 100644
--- a/lib/Target/XCore/XCoreFrameLowering.cpp
+++ b/lib/Target/XCore/XCoreFrameLowering.cpp
@@ -525,12 +525,15 @@ eliminateCallFramePseudoInstr(MachineFunction &MF, MachineBasicBlock &MBB,
MBB.erase(I);
}
-void XCoreFrameLowering::
-processFunctionBeforeCalleeSavedScan(MachineFunction &MF,
- RegScavenger *RS) const {
+void XCoreFrameLowering::determineCalleeSaves(MachineFunction &MF,
+ BitVector &SavedRegs,
+ RegScavenger *RS) const {
+ TargetFrameLowering::determineCalleeSaves(MF, SavedRegs, RS);
+
XCoreFunctionInfo *XFI = MF.getInfo<XCoreFunctionInfo>();
- bool LRUsed = MF.getRegInfo().isPhysRegUsed(XCore::LR);
+ const MachineRegisterInfo &MRI = MF.getRegInfo();
+ bool LRUsed = MRI.isPhysRegModified(XCore::LR);
if (!LRUsed && !MF.getFunction()->isVarArg() &&
MF.getFrameInfo()->estimateStackSize(MF))
@@ -550,7 +553,7 @@ processFunctionBeforeCalleeSavedScan(MachineFunction &MF,
if (LRUsed) {
// We will handle the LR in the prologue/epilogue
// and allocate space on the stack ourselves.
- MF.getRegInfo().setPhysRegUnused(XCore::LR);
+ SavedRegs.reset(XCore::LR);
XFI->createLRSpillSlot(MF);
}
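
The XCore hook above moves from processFunctionBeforeCalleeSavedScan to determineCalleeSaves: it first defers to the generic implementation, then clears LR from the SavedRegs bit vector because the target spills the link register itself. A standalone model of that contract (register numbering and the 64-register universe are invented for the sketch):

#include <bitset>
#include <cstdio>

constexpr unsigned LR = 15;

static void genericDetermineCalleeSaves(std::bitset<64> &SavedRegs) {
  SavedRegs.set(LR);     // pretend generic code decided LR must be saved
  SavedRegs.set(4);
}

static void targetDetermineCalleeSaves(std::bitset<64> &SavedRegs) {
  genericDetermineCalleeSaves(SavedRegs);   // call the base implementation
  bool LRUsed = SavedRegs.test(LR);
  if (LRUsed) {
    SavedRegs.reset(LR);   // the prologue/epilogue will handle LR directly
    // ...createLRSpillSlot(MF) would be called here in the real pass...
  }
}

int main() {
  std::bitset<64> SavedRegs;
  targetDetermineCalleeSaves(SavedRegs);
  std::printf("LR saved by generic code: %d\n", (int)SavedRegs.test(LR));   // 0
}
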
diff --git a/lib/Target/XCore/XCoreFrameLowering.h b/lib/Target/XCore/XCoreFrameLowering.h
index 607c77248952..69c71adc8d3f 100644
--- a/lib/Target/XCore/XCoreFrameLowering.h
+++ b/lib/Target/XCore/XCoreFrameLowering.h
@@ -47,8 +47,8 @@ namespace llvm {
bool hasFP(const MachineFunction &MF) const override;
- void processFunctionBeforeCalleeSavedScan(MachineFunction &MF,
- RegScavenger *RS = nullptr) const override;
+ void determineCalleeSaves(MachineFunction &MF, BitVector &SavedRegs,
+ RegScavenger *RS = nullptr) const override;
void processFunctionBeforeFrameFinalized(MachineFunction &MF,
RegScavenger *RS = nullptr) const override;
diff --git a/lib/Target/XCore/XCoreISelDAGToDAG.cpp b/lib/Target/XCore/XCoreISelDAGToDAG.cpp
index f5b180b1ac0d..9d4a966dfba4 100644
--- a/lib/Target/XCore/XCoreISelDAGToDAG.cpp
+++ b/lib/Target/XCore/XCoreISelDAGToDAG.cpp
@@ -144,10 +144,9 @@ SDNode *XCoreDAGToDAGISel::Select(SDNode *N) {
MVT::i32, MskSize);
}
else if (!isUInt<16>(Val)) {
- SDValue CPIdx =
- CurDAG->getTargetConstantPool(ConstantInt::get(
- Type::getInt32Ty(*CurDAG->getContext()), Val),
- getTargetLowering()->getPointerTy());
+ SDValue CPIdx = CurDAG->getTargetConstantPool(
+ ConstantInt::get(Type::getInt32Ty(*CurDAG->getContext()), Val),
+ getTargetLowering()->getPointerTy(CurDAG->getDataLayout()));
SDNode *node = CurDAG->getMachineNode(XCore::LDWCP_lru6, dl, MVT::i32,
MVT::Other, CPIdx,
CurDAG->getEntryNode());
diff --git a/lib/Target/XCore/XCoreISelLowering.cpp b/lib/Target/XCore/XCoreISelLowering.cpp
index aa71241102ff..d62e7428299d 100644
--- a/lib/Target/XCore/XCoreISelLowering.cpp
+++ b/lib/Target/XCore/XCoreISelLowering.cpp
@@ -281,7 +281,8 @@ static bool IsSmallObject(const GlobalValue *GV, const XCoreTargetLowering &XTL)
if (!ObjType->isSized())
return false;
- unsigned ObjSize = XTL.getDataLayout()->getTypeAllocSize(ObjType);
+ auto &DL = GV->getParent()->getDataLayout();
+ unsigned ObjSize = DL.getTypeAllocSize(ObjType);
return ObjSize < CodeModelLargeSize && ObjSize != 0;
}
@@ -312,8 +313,9 @@ LowerGlobalAddress(SDValue Op, SelectionDAG &DAG) const
Constant *GAI = ConstantExpr::getGetElementPtr(
Type::getInt8Ty(*DAG.getContext()), GA, Idx);
SDValue CP = DAG.getConstantPool(GAI, MVT::i32);
- return DAG.getLoad(getPointerTy(), DL, DAG.getEntryNode(), CP,
- MachinePointerInfo(), false, false, false, 0);
+ return DAG.getLoad(getPointerTy(DAG.getDataLayout()), DL,
+ DAG.getEntryNode(), CP, MachinePointerInfo(), false,
+ false, false, 0);
}
}
@@ -321,11 +323,11 @@ SDValue XCoreTargetLowering::
LowerBlockAddress(SDValue Op, SelectionDAG &DAG) const
{
SDLoc DL(Op);
-
+ auto PtrVT = getPointerTy(DAG.getDataLayout());
const BlockAddress *BA = cast<BlockAddressSDNode>(Op)->getBlockAddress();
- SDValue Result = DAG.getTargetBlockAddress(BA, getPointerTy());
+ SDValue Result = DAG.getTargetBlockAddress(BA, PtrVT);
- return DAG.getNode(XCoreISD::PCRelativeWrapper, DL, getPointerTy(), Result);
+ return DAG.getNode(XCoreISD::PCRelativeWrapper, DL, PtrVT, Result);
}
SDValue XCoreTargetLowering::
@@ -378,9 +380,10 @@ SDValue XCoreTargetLowering::
lowerLoadWordFromAlignedBasePlusOffset(SDLoc DL, SDValue Chain, SDValue Base,
int64_t Offset, SelectionDAG &DAG) const
{
+ auto PtrVT = getPointerTy(DAG.getDataLayout());
if ((Offset & 0x3) == 0) {
- return DAG.getLoad(getPointerTy(), DL, Chain, Base, MachinePointerInfo(),
- false, false, false, 0);
+ return DAG.getLoad(PtrVT, DL, Chain, Base, MachinePointerInfo(), false,
+ false, false, 0);
}
// Lower to pair of consecutive word aligned loads plus some bit shifting.
int32_t HighOffset = RoundUpToAlignment(Offset, 4);
@@ -401,11 +404,9 @@ lowerLoadWordFromAlignedBasePlusOffset(SDLoc DL, SDValue Chain, SDValue Base,
SDValue LowShift = DAG.getConstant((Offset - LowOffset) * 8, DL, MVT::i32);
SDValue HighShift = DAG.getConstant((HighOffset - Offset) * 8, DL, MVT::i32);
- SDValue Low = DAG.getLoad(getPointerTy(), DL, Chain,
- LowAddr, MachinePointerInfo(),
+ SDValue Low = DAG.getLoad(PtrVT, DL, Chain, LowAddr, MachinePointerInfo(),
false, false, false, 0);
- SDValue High = DAG.getLoad(getPointerTy(), DL, Chain,
- HighAddr, MachinePointerInfo(),
+ SDValue High = DAG.getLoad(PtrVT, DL, Chain, HighAddr, MachinePointerInfo(),
false, false, false, 0);
SDValue LowShifted = DAG.getNode(ISD::SRL, DL, MVT::i32, Low, LowShift);
SDValue HighShifted = DAG.getNode(ISD::SHL, DL, MVT::i32, High, HighShift);
@@ -435,8 +436,9 @@ LowerLOAD(SDValue Op, SelectionDAG &DAG) const {
LD->getAlignment()))
return SDValue();
- unsigned ABIAlignment = getDataLayout()->
- getABITypeAlignment(LD->getMemoryVT().getTypeForEVT(*DAG.getContext()));
+ auto &TD = DAG.getDataLayout();
+ unsigned ABIAlignment = TD.getABITypeAlignment(
+ LD->getMemoryVT().getTypeForEVT(*DAG.getContext()));
// Leave aligned load alone.
if (LD->getAlignment() >= ABIAlignment)
return SDValue();
@@ -486,7 +488,7 @@ LowerLOAD(SDValue Op, SelectionDAG &DAG) const {
}
// Lower to a call to __misaligned_load(BasePtr).
- Type *IntPtrTy = getDataLayout()->getIntPtrType(*DAG.getContext());
+ Type *IntPtrTy = TD.getIntPtrType(*DAG.getContext());
TargetLowering::ArgListTy Args;
TargetLowering::ArgListEntry Entry;
@@ -495,10 +497,11 @@ LowerLOAD(SDValue Op, SelectionDAG &DAG) const {
Args.push_back(Entry);
TargetLowering::CallLoweringInfo CLI(DAG);
- CLI.setDebugLoc(DL).setChain(Chain)
- .setCallee(CallingConv::C, IntPtrTy,
- DAG.getExternalSymbol("__misaligned_load", getPointerTy()),
- std::move(Args), 0);
+ CLI.setDebugLoc(DL).setChain(Chain).setCallee(
+ CallingConv::C, IntPtrTy,
+ DAG.getExternalSymbol("__misaligned_load",
+ getPointerTy(DAG.getDataLayout())),
+ std::move(Args), 0);
std::pair<SDValue, SDValue> CallResult = LowerCallTo(CLI);
SDValue Ops[] = { CallResult.first, CallResult.second };
@@ -516,8 +519,8 @@ LowerSTORE(SDValue Op, SelectionDAG &DAG) const
ST->getAlignment())) {
return SDValue();
}
- unsigned ABIAlignment = getDataLayout()->
- getABITypeAlignment(ST->getMemoryVT().getTypeForEVT(*DAG.getContext()));
+ unsigned ABIAlignment = DAG.getDataLayout().getABITypeAlignment(
+ ST->getMemoryVT().getTypeForEVT(*DAG.getContext()));
// Leave aligned store alone.
if (ST->getAlignment() >= ABIAlignment) {
return SDValue();
@@ -545,7 +548,7 @@ LowerSTORE(SDValue Op, SelectionDAG &DAG) const
}
// Lower to a call to __misaligned_store(BasePtr, Value).
- Type *IntPtrTy = getDataLayout()->getIntPtrType(*DAG.getContext());
+ Type *IntPtrTy = DAG.getDataLayout().getIntPtrType(*DAG.getContext());
TargetLowering::ArgListTy Args;
TargetLowering::ArgListEntry Entry;
@@ -557,10 +560,11 @@ LowerSTORE(SDValue Op, SelectionDAG &DAG) const
Args.push_back(Entry);
TargetLowering::CallLoweringInfo CLI(DAG);
- CLI.setDebugLoc(dl).setChain(Chain)
- .setCallee(CallingConv::C, Type::getVoidTy(*DAG.getContext()),
- DAG.getExternalSymbol("__misaligned_store", getPointerTy()),
- std::move(Args), 0);
+ CLI.setDebugLoc(dl).setChain(Chain).setCallee(
+ CallingConv::C, Type::getVoidTy(*DAG.getContext()),
+ DAG.getExternalSymbol("__misaligned_store",
+ getPointerTy(DAG.getDataLayout())),
+ std::move(Args), 0);
std::pair<SDValue, SDValue> CallResult = LowerCallTo(CLI);
return CallResult.second;
@@ -833,9 +837,9 @@ LowerRETURNADDR(SDValue Op, SelectionDAG &DAG) const {
XCoreFunctionInfo *XFI = MF.getInfo<XCoreFunctionInfo>();
int FI = XFI->createLRSpillSlot(MF);
SDValue FIN = DAG.getFrameIndex(FI, MVT::i32);
- return DAG.getLoad(getPointerTy(), SDLoc(Op), DAG.getEntryNode(), FIN,
- MachinePointerInfo::getFixedStack(FI), false, false,
- false, 0);
+ return DAG.getLoad(
+ getPointerTy(DAG.getDataLayout()), SDLoc(Op), DAG.getEntryNode(), FIN,
+ MachinePointerInfo::getFixedStack(FI), false, false, false, 0);
}
SDValue XCoreTargetLowering::
@@ -979,11 +983,10 @@ LowerATOMIC_LOAD(SDValue Op, SelectionDAG &DAG) const {
if (N->getMemoryVT() == MVT::i32) {
if (N->getAlignment() < 4)
report_fatal_error("atomic load must be aligned");
- return DAG.getLoad(getPointerTy(), SDLoc(Op), N->getChain(),
- N->getBasePtr(), N->getPointerInfo(),
- N->isVolatile(), N->isNonTemporal(),
- N->isInvariant(), N->getAlignment(),
- N->getAAInfo(), N->getRanges());
+ return DAG.getLoad(getPointerTy(DAG.getDataLayout()), SDLoc(Op),
+ N->getChain(), N->getBasePtr(), N->getPointerInfo(),
+ N->isVolatile(), N->isNonTemporal(), N->isInvariant(),
+ N->getAlignment(), N->getAAInfo(), N->getRanges());
}
if (N->getMemoryVT() == MVT::i16) {
if (N->getAlignment() < 2)
@@ -1150,9 +1153,10 @@ XCoreTargetLowering::LowerCCCCallTo(SDValue Chain, SDValue Callee,
// Get a count of how many bytes are to be pushed on the stack.
unsigned NumBytes = RetCCInfo.getNextStackOffset();
+ auto PtrVT = getPointerTy(DAG.getDataLayout());
- Chain = DAG.getCALLSEQ_START(Chain,DAG.getConstant(NumBytes, dl,
- getPointerTy(), true), dl);
+ Chain = DAG.getCALLSEQ_START(Chain,
+ DAG.getConstant(NumBytes, dl, PtrVT, true), dl);
SmallVector<std::pair<unsigned, SDValue>, 4> RegsToPass;
SmallVector<SDValue, 12> MemOpChains;
@@ -1239,11 +1243,8 @@ XCoreTargetLowering::LowerCCCCallTo(SDValue Chain, SDValue Callee,
InFlag = Chain.getValue(1);
// Create the CALLSEQ_END node.
- Chain = DAG.getCALLSEQ_END(Chain,
- DAG.getConstant(NumBytes, dl, getPointerTy(),
- true),
- DAG.getConstant(0, dl, getPointerTy(), true),
- InFlag, dl);
+ Chain = DAG.getCALLSEQ_END(Chain, DAG.getConstant(NumBytes, dl, PtrVT, true),
+ DAG.getConstant(0, dl, PtrVT, true), InFlag, dl);
InFlag = Chain.getValue(1);
// Handle result values, copying them out of physregs into vregs that we
@@ -1830,7 +1831,7 @@ SDValue XCoreTargetLowering::PerformDAGCombine(SDNode *N,
if (StoreBits % 8) {
break;
}
- unsigned ABIAlignment = getDataLayout()->getABITypeAlignment(
+ unsigned ABIAlignment = DAG.getDataLayout().getABITypeAlignment(
ST->getMemoryVT().getTypeForEVT(*DCI.DAG.getContext()));
unsigned Alignment = ST->getAlignment();
if (Alignment >= ABIAlignment) {
@@ -1924,15 +1925,13 @@ static inline bool isImmUs4(int64_t val)
/// isLegalAddressingMode - Return true if the addressing mode represented
/// by AM is legal for this target, for a load/store of the specified type.
-bool
-XCoreTargetLowering::isLegalAddressingMode(const AddrMode &AM,
- Type *Ty,
- unsigned AS) const {
+bool XCoreTargetLowering::isLegalAddressingMode(const DataLayout &DL,
+ const AddrMode &AM, Type *Ty,
+ unsigned AS) const {
if (Ty->getTypeID() == Type::VoidTyID)
return AM.Scale == 0 && isImmUs(AM.BaseOffs) && isImmUs4(AM.BaseOffs);
- const DataLayout *TD = TM.getDataLayout();
- unsigned Size = TD->getTypeAllocSize(Ty);
+ unsigned Size = DL.getTypeAllocSize(Ty);
if (AM.BaseGV) {
return Size >= 4 && !AM.HasBaseReg && AM.Scale == 0 &&
AM.BaseOffs%4 == 0;
@@ -1970,7 +1969,7 @@ XCoreTargetLowering::isLegalAddressingMode(const AddrMode &AM,
std::pair<unsigned, const TargetRegisterClass *>
XCoreTargetLowering::getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI,
- const std::string &Constraint,
+ StringRef Constraint,
MVT VT) const {
if (Constraint.size() == 1) {
switch (Constraint[0]) {
diff --git a/lib/Target/XCore/XCoreISelLowering.h b/lib/Target/XCore/XCoreISelLowering.h
index 97f0494b6fe3..ddd675c5164d 100644
--- a/lib/Target/XCore/XCoreISelLowering.h
+++ b/lib/Target/XCore/XCoreISelLowering.h
@@ -101,7 +101,9 @@ namespace llvm {
unsigned getJumpTableEncoding() const override;
- MVT getScalarShiftAmountTy(EVT LHSTy) const override { return MVT::i32; }
+ MVT getScalarShiftAmountTy(const DataLayout &DL, EVT) const override {
+ return MVT::i32;
+ }
/// LowerOperation - Provide custom lowering hooks for some operations.
SDValue LowerOperation(SDValue Op, SelectionDAG &DAG) const override;
@@ -120,8 +122,8 @@ namespace llvm {
EmitInstrWithCustomInserter(MachineInstr *MI,
MachineBasicBlock *MBB) const override;
- bool isLegalAddressingMode(const AddrMode &AM, Type *Ty,
- unsigned AS) const override;
+ bool isLegalAddressingMode(const DataLayout &DL, const AddrMode &AM,
+ Type *Ty, unsigned AS) const override;
private:
const TargetMachine &TM;
@@ -175,8 +177,7 @@ namespace llvm {
// Inline asm support
std::pair<unsigned, const TargetRegisterClass *>
getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI,
- const std::string &Constraint,
- MVT VT) const override;
+ StringRef Constraint, MVT VT) const override;
// Expand specifics
SDValue TryExpandADDWithMul(SDNode *Op, SelectionDAG &DAG) const;
diff --git a/lib/Target/XCore/XCoreRegisterInfo.cpp b/lib/Target/XCore/XCoreRegisterInfo.cpp
index 1d569e8936df..1cfb57dc3af3 100644
--- a/lib/Target/XCore/XCoreRegisterInfo.cpp
+++ b/lib/Target/XCore/XCoreRegisterInfo.cpp
@@ -222,7 +222,7 @@ XCoreRegisterInfo::getCalleeSavedRegs(const MachineFunction *MF) const {
XCore::R8, XCore::R9,
0
};
- const TargetFrameLowering *TFI = MF->getSubtarget().getFrameLowering();
+ const XCoreFrameLowering *TFI = getFrameLowering(*MF);
if (TFI->hasFP(*MF))
return CalleeSavedRegsFP;
return CalleeSavedRegs;
@@ -230,7 +230,7 @@ XCoreRegisterInfo::getCalleeSavedRegs(const MachineFunction *MF) const {
BitVector XCoreRegisterInfo::getReservedRegs(const MachineFunction &MF) const {
BitVector Reserved(getNumRegs());
- const TargetFrameLowering *TFI = MF.getSubtarget().getFrameLowering();
+ const XCoreFrameLowering *TFI = getFrameLowering(MF);
Reserved.set(XCore::CP);
Reserved.set(XCore::DP);
@@ -270,7 +270,7 @@ XCoreRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator II,
const XCoreInstrInfo &TII =
*static_cast<const XCoreInstrInfo *>(MF.getSubtarget().getInstrInfo());
- const TargetFrameLowering *TFI = MF.getSubtarget().getFrameLowering();
+ const XCoreFrameLowering *TFI = getFrameLowering(MF);
int Offset = MF.getFrameInfo()->getObjectOffset(FrameIndex);
int StackSize = MF.getFrameInfo()->getStackSize();
@@ -324,7 +324,7 @@ XCoreRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator II,
unsigned XCoreRegisterInfo::getFrameRegister(const MachineFunction &MF) const {
- const TargetFrameLowering *TFI = MF.getSubtarget().getFrameLowering();
+ const XCoreFrameLowering *TFI = getFrameLowering(MF);
return TFI->hasFP(MF) ? XCore::R10 : XCore::SP;
}
diff --git a/lib/Target/XCore/XCoreSelectionDAGInfo.cpp b/lib/Target/XCore/XCoreSelectionDAGInfo.cpp
index a34884480cea..40568d124de0 100644
--- a/lib/Target/XCore/XCoreSelectionDAGInfo.cpp
+++ b/lib/Target/XCore/XCoreSelectionDAGInfo.cpp
@@ -16,12 +16,6 @@ using namespace llvm;
#define DEBUG_TYPE "xcore-selectiondag-info"
-XCoreSelectionDAGInfo::XCoreSelectionDAGInfo(const DataLayout &DL)
- : TargetSelectionDAGInfo(&DL) {}
-
-XCoreSelectionDAGInfo::~XCoreSelectionDAGInfo() {
-}
-
SDValue XCoreSelectionDAGInfo::
EmitTargetCodeForMemcpy(SelectionDAG &DAG, SDLoc dl, SDValue Chain,
SDValue Dst, SDValue Src, SDValue Size, unsigned Align,
@@ -36,18 +30,20 @@ EmitTargetCodeForMemcpy(SelectionDAG &DAG, SDLoc dl, SDValue Chain,
const TargetLowering &TLI = *DAG.getSubtarget().getTargetLowering();
TargetLowering::ArgListTy Args;
TargetLowering::ArgListEntry Entry;
- Entry.Ty = TLI.getDataLayout()->getIntPtrType(*DAG.getContext());
+ Entry.Ty = DAG.getDataLayout().getIntPtrType(*DAG.getContext());
Entry.Node = Dst; Args.push_back(Entry);
Entry.Node = Src; Args.push_back(Entry);
Entry.Node = Size; Args.push_back(Entry);
TargetLowering::CallLoweringInfo CLI(DAG);
- CLI.setDebugLoc(dl).setChain(Chain)
- .setCallee(TLI.getLibcallCallingConv(RTLIB::MEMCPY),
- Type::getVoidTy(*DAG.getContext()),
- DAG.getExternalSymbol("__memcpy_4", TLI.getPointerTy()),
- std::move(Args), 0)
- .setDiscardResult();
+ CLI.setDebugLoc(dl)
+ .setChain(Chain)
+ .setCallee(TLI.getLibcallCallingConv(RTLIB::MEMCPY),
+ Type::getVoidTy(*DAG.getContext()),
+ DAG.getExternalSymbol("__memcpy_4",
+ TLI.getPointerTy(DAG.getDataLayout())),
+ std::move(Args), 0)
+ .setDiscardResult();
std::pair<SDValue,SDValue> CallResult = TLI.LowerCallTo(CLI);
return CallResult.second;
diff --git a/lib/Target/XCore/XCoreSelectionDAGInfo.h b/lib/Target/XCore/XCoreSelectionDAGInfo.h
index cfd80b3f3172..77b3527d77e3 100644
--- a/lib/Target/XCore/XCoreSelectionDAGInfo.h
+++ b/lib/Target/XCore/XCoreSelectionDAGInfo.h
@@ -22,8 +22,6 @@ class XCoreTargetMachine;
class XCoreSelectionDAGInfo : public TargetSelectionDAGInfo {
public:
- explicit XCoreSelectionDAGInfo(const DataLayout &DL);
- ~XCoreSelectionDAGInfo();
SDValue
EmitTargetCodeForMemcpy(SelectionDAG &DAG, SDLoc dl,
diff --git a/lib/Target/XCore/XCoreSubtarget.cpp b/lib/Target/XCore/XCoreSubtarget.cpp
index c98518b60225..99ad2c88504f 100644
--- a/lib/Target/XCore/XCoreSubtarget.cpp
+++ b/lib/Target/XCore/XCoreSubtarget.cpp
@@ -28,4 +28,4 @@ void XCoreSubtarget::anchor() { }
XCoreSubtarget::XCoreSubtarget(const Triple &TT, const std::string &CPU,
const std::string &FS, const TargetMachine &TM)
: XCoreGenSubtargetInfo(TT, CPU, FS), InstrInfo(), FrameLowering(*this),
- TLInfo(TM, *this), TSInfo(*TM.getDataLayout()) {}
+ TLInfo(TM, *this), TSInfo() {}
diff --git a/lib/Target/XCore/XCoreTargetMachine.cpp b/lib/Target/XCore/XCoreTargetMachine.cpp
index 370b64b26688..f420081868f9 100644
--- a/lib/Target/XCore/XCoreTargetMachine.cpp
+++ b/lib/Target/XCore/XCoreTargetMachine.cpp
@@ -85,6 +85,7 @@ extern "C" void LLVMInitializeXCoreTarget() {
}
TargetIRAnalysis XCoreTargetMachine::getTargetIRAnalysis() {
- return TargetIRAnalysis(
- [this](Function &) { return TargetTransformInfo(XCoreTTIImpl(this)); });
+ return TargetIRAnalysis([this](Function &F) {
+ return TargetTransformInfo(XCoreTTIImpl(this, F));
+ });
}
diff --git a/lib/Target/XCore/XCoreTargetTransformInfo.h b/lib/Target/XCore/XCoreTargetTransformInfo.h
index 70b47dfa1156..e23aef3e3b4a 100644
--- a/lib/Target/XCore/XCoreTargetTransformInfo.h
+++ b/lib/Target/XCore/XCoreTargetTransformInfo.h
@@ -37,8 +37,9 @@ class XCoreTTIImpl : public BasicTTIImplBase<XCoreTTIImpl> {
const XCoreTargetLowering *getTLI() const { return TLI; }
public:
- explicit XCoreTTIImpl(const XCoreTargetMachine *TM)
- : BaseT(TM), ST(TM->getSubtargetImpl()), TLI(ST->getTargetLowering()) {}
+ explicit XCoreTTIImpl(const XCoreTargetMachine *TM, Function &F)
+ : BaseT(TM, F.getParent()->getDataLayout()), ST(TM->getSubtargetImpl()),
+ TLI(ST->getTargetLowering()) {}
// Provide value semantics. MSVC requires that we spell all of these out.
XCoreTTIImpl(const XCoreTTIImpl &Arg)
@@ -46,18 +47,6 @@ public:
XCoreTTIImpl(XCoreTTIImpl &&Arg)
: BaseT(std::move(static_cast<BaseT &>(Arg))), ST(std::move(Arg.ST)),
TLI(std::move(Arg.TLI)) {}
- XCoreTTIImpl &operator=(const XCoreTTIImpl &RHS) {
- BaseT::operator=(static_cast<const BaseT &>(RHS));
- ST = RHS.ST;
- TLI = RHS.TLI;
- return *this;
- }
- XCoreTTIImpl &operator=(XCoreTTIImpl &&RHS) {
- BaseT::operator=(std::move(static_cast<BaseT &>(RHS)));
- ST = std::move(RHS.ST);
- TLI = std::move(RHS.TLI);
- return *this;
- }
unsigned getNumberOfRegisters(bool Vector) {
if (Vector) {
diff --git a/lib/Transforms/IPO/ArgumentPromotion.cpp b/lib/Transforms/IPO/ArgumentPromotion.cpp
index f75436328252..4762011d63d8 100644
--- a/lib/Transforms/IPO/ArgumentPromotion.cpp
+++ b/lib/Transforms/IPO/ArgumentPromotion.cpp
@@ -825,7 +825,6 @@ CallGraphNode *ArgPromotion::DoPromotion(Function *F,
V = GetElementPtrInst::Create(SI->first, V, Ops,
V->getName() + ".idx", Call);
Ops.clear();
- AA.copyValue(OrigLoad->getOperand(0), V);
}
// Since we're replacing a load make sure we take the alignment
// of the previous load.
@@ -837,7 +836,6 @@ CallGraphNode *ArgPromotion::DoPromotion(Function *F,
newLoad->setAAMetadata(AAInfo);
Args.push_back(newLoad);
- AA.copyValue(OrigLoad, Args.back());
}
}
diff --git a/lib/Transforms/IPO/CMakeLists.txt b/lib/Transforms/IPO/CMakeLists.txt
index 3df17b920a95..336dac45e13a 100644
--- a/lib/Transforms/IPO/CMakeLists.txt
+++ b/lib/Transforms/IPO/CMakeLists.txt
@@ -3,6 +3,7 @@ add_llvm_library(LLVMipo
BarrierNoopPass.cpp
ConstantMerge.cpp
DeadArgumentElimination.cpp
+ ElimAvailExtern.cpp
ExtractGV.cpp
FunctionAttrs.cpp
GlobalDCE.cpp
diff --git a/lib/Transforms/IPO/DeadArgumentElimination.cpp b/lib/Transforms/IPO/DeadArgumentElimination.cpp
index 76898f275058..d0447640259e 100644
--- a/lib/Transforms/IPO/DeadArgumentElimination.cpp
+++ b/lib/Transforms/IPO/DeadArgumentElimination.cpp
@@ -326,7 +326,18 @@ bool DAE::DeleteDeadVarargs(Function &Fn) {
/// instead.
bool DAE::RemoveDeadArgumentsFromCallers(Function &Fn)
{
- if (Fn.isDeclaration() || Fn.mayBeOverridden())
+ // We cannot change the arguments if this TU does not define the function or
+ // if the linker may choose a function body from another TU, even if the
+ // nominal linkage indicates that other copies of the function have the same
+ // semantics. In the below example, the dead load from %p may not have been
+ // eliminated from the linker-chosen copy of f, so replacing %p with undef
+ // in callers may introduce undefined behavior.
+ //
+ // define linkonce_odr void @f(i32* %p) {
+ // %v = load i32, i32* %p
+ // ret void
+ // }
+ if (!Fn.isStrongDefinitionForLinker())
return false;
// Functions with local linkage should already have been handled, except the
@@ -334,19 +345,6 @@ bool DAE::RemoveDeadArgumentsFromCallers(Function &Fn)
if (Fn.hasLocalLinkage() && !Fn.getFunctionType()->isVarArg())
return false;
- // If a function seen at compile time is not necessarily the one linked to
- // the binary being built, it is illegal to change the actual arguments
- // passed to it. These functions can be captured by isWeakForLinker().
- // *NOTE* that mayBeOverridden() is insufficient for this purpose as it
- // doesn't include linkage types like AvailableExternallyLinkage and
- // LinkOnceODRLinkage. Take link_odr* as an example, it indicates a set of
- // *EQUIVALENT* globals that can be merged at link-time. However, the
- // semantic of *EQUIVALENT*-functions includes parameters. Changing
- // parameters breaks this assumption.
- //
- if (Fn.isWeakForLinker())
- return false;
-
if (Fn.use_empty())
return false;
diff --git a/lib/Transforms/IPO/ElimAvailExtern.cpp b/lib/Transforms/IPO/ElimAvailExtern.cpp
new file mode 100644
index 000000000000..67ba72d6a360
--- /dev/null
+++ b/lib/Transforms/IPO/ElimAvailExtern.cpp
@@ -0,0 +1,86 @@
+//===-- ElimAvailExtern.cpp - Eliminate available external globals -------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This transform is designed to eliminate available external global
+// definitions from the program, turning them into declarations.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Transforms/IPO.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/Module.h"
+#include "llvm/Transforms/Utils/CtorUtils.h"
+#include "llvm/Transforms/Utils/GlobalStatus.h"
+#include "llvm/Pass.h"
+using namespace llvm;
+
+#define DEBUG_TYPE "elim-avail-extern"
+
+STATISTIC(NumFunctions, "Number of functions removed");
+STATISTIC(NumVariables, "Number of global variables removed");
+
+namespace {
+ struct EliminateAvailableExternally : public ModulePass {
+ static char ID; // Pass identification, replacement for typeid
+ EliminateAvailableExternally() : ModulePass(ID) {
+ initializeEliminateAvailableExternallyPass(
+ *PassRegistry::getPassRegistry());
+ }
+
+ // runOnModule - Convert every available_externally global definition in
+ // the module into a plain declaration.
+ //
+ bool runOnModule(Module &M) override;
+ };
+}
+
+char EliminateAvailableExternally::ID = 0;
+INITIALIZE_PASS(EliminateAvailableExternally, "elim-avail-extern",
+ "Eliminate Available Externally Globals", false, false)
+
+ModulePass *llvm::createEliminateAvailableExternallyPass() {
+ return new EliminateAvailableExternally();
+}
+
+bool EliminateAvailableExternally::runOnModule(Module &M) {
+ bool Changed = false;
+
+ // Drop initializers of available externally global variables.
+ for (Module::global_iterator I = M.global_begin(), E = M.global_end();
+ I != E; ++I) {
+ if (!I->hasAvailableExternallyLinkage())
+ continue;
+ if (I->hasInitializer()) {
+ Constant *Init = I->getInitializer();
+ I->setInitializer(nullptr);
+ if (isSafeToDestroyConstant(Init))
+ Init->destroyConstant();
+ }
+ I->removeDeadConstantUsers();
+ I->setLinkage(GlobalValue::ExternalLinkage);
+ NumVariables++;
+ Changed = true;
+ }
+
+ // Drop the bodies of available externally functions.
+ for (Module::iterator I = M.begin(), E = M.end(); I != E; ++I) {
+ if (!I->hasAvailableExternallyLinkage())
+ continue;
+ if (!I->isDeclaration())
+ // This will set the linkage to external
+ I->deleteBody();
+ I->removeDeadConstantUsers();
+ NumFunctions++;
+ Changed = true;
+ }
+
+ return Changed;
+}
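
The new pass is registered under the opt flag -elim-avail-extern and exposed through createEliminateAvailableExternallyPass(). Below is a minimal, hypothetical sketch of driving it programmatically with the legacy pass manager; it assumes the factory function is declared in llvm/Transforms/IPO.h alongside the other IPO pass creators, and the surrounding driver code is illustrative only.

    #include "llvm/IR/LegacyPassManager.h"
    #include "llvm/IR/Module.h"
    #include "llvm/Transforms/IPO.h"

    // Turn every available_externally definition in M into a declaration.
    static bool stripAvailableExternally(llvm::Module &M) {
      llvm::legacy::PassManager PM;
      PM.add(llvm::createEliminateAvailableExternallyPass());
      return PM.run(M);   // true if the module was modified
    }

Equivalently, "opt -elim-avail-extern in.ll -S" exercises the pass standalone.
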
diff --git a/lib/Transforms/IPO/ExtractGV.cpp b/lib/Transforms/IPO/ExtractGV.cpp
index 2f8c7d9349b9..b9462f2ffc72 100644
--- a/lib/Transforms/IPO/ExtractGV.cpp
+++ b/lib/Transforms/IPO/ExtractGV.cpp
@@ -93,8 +93,11 @@ namespace {
makeVisible(*I, Delete);
- if (Delete)
+ if (Delete) {
+ // Make this a declaration and drop its comdat.
I->setInitializer(nullptr);
+ I->setComdat(nullptr);
+ }
}
// Visit the Functions.
@@ -108,8 +111,11 @@ namespace {
makeVisible(*I, Delete);
- if (Delete)
+ if (Delete) {
+ // Make this a declaration and drop its comdat.
I->deleteBody();
+ I->setComdat(nullptr);
+ }
}
// Visit the Aliases.
diff --git a/lib/Transforms/IPO/IPO.cpp b/lib/Transforms/IPO/IPO.cpp
index fcacec3286fa..50f56b0f2afe 100644
--- a/lib/Transforms/IPO/IPO.cpp
+++ b/lib/Transforms/IPO/IPO.cpp
@@ -46,6 +46,7 @@ void llvm::initializeIPO(PassRegistry &Registry) {
initializeStripDeadDebugInfoPass(Registry);
initializeStripNonDebugSymbolsPass(Registry);
initializeBarrierNoopPass(Registry);
+ initializeEliminateAvailableExternallyPass(Registry);
}
void LLVMInitializeIPO(LLVMPassRegistryRef R) {
diff --git a/lib/Transforms/IPO/PassManagerBuilder.cpp b/lib/Transforms/IPO/PassManagerBuilder.cpp
index 963f1bb13aaf..88e5e479136f 100644
--- a/lib/Transforms/IPO/PassManagerBuilder.cpp
+++ b/lib/Transforms/IPO/PassManagerBuilder.cpp
@@ -105,6 +105,7 @@ PassManagerBuilder::PassManagerBuilder() {
VerifyInput = false;
VerifyOutput = false;
MergeFunctions = false;
+ PrepareForLTO = false;
}
PassManagerBuilder::~PassManagerBuilder() {
@@ -319,8 +320,8 @@ void PassManagerBuilder::populateModulePassManager(
// Re-rotate loops in all our loop nests. These may have fallen out of
// rotated form due to GVN or other transformations, and the vectorizer relies
- // on the rotated form.
- MPM.add(createLoopRotatePass());
+ // on the rotated form. Disable header duplication at -Oz.
+ MPM.add(createLoopRotatePass(SizeLevel == 2 ? 0 : -1));
// Distribute loops to allow partial vectorization. I.e. isolate dependences
// into separate loop that would otherwise inhibit vectorization.
@@ -401,6 +402,17 @@ void PassManagerBuilder::populateModulePassManager(
// GlobalOpt already deletes dead functions and globals, at -O2 try a
// late pass of GlobalDCE. It is capable of deleting dead cycles.
if (OptLevel > 1) {
+ if (!PrepareForLTO) {
+ // Remove avail extern fns and global definitions if we aren't
+ // compiling an object file for later LTO. For LTO we want to preserve
+ // these so they are eligible for inlining at link-time. Note if they
+ // are unreferenced they will be removed by GlobalDCE below, so
+ // this only impacts referenced available externally globals.
+ // Eventually they will be suppressed during codegen, but eliminating them
+ // here enables more opportunity for GlobalDCE as it may make
+ // globals referenced by available external functions dead.
+ MPM.add(createEliminateAvailableExternallyPass());
+ }
MPM.add(createGlobalDCEPass()); // Remove dead fns and globals.
MPM.add(createConstantMergePass()); // Merge dup global constants
}
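
The comment in the hunk above explains why available_externally definitions are kept when the output object is destined for LTO: dropping them early would remove inlining candidates at link time. A hedged sketch of how a frontend-style driver might set the new PrepareForLTO flag follows; the OptLevel value and the EmitForLTO parameter are illustrative assumptions, not part of this patch.

    #include "llvm/IR/LegacyPassManager.h"
    #include "llvm/Transforms/IPO/PassManagerBuilder.h"

    static void buildModulePipeline(llvm::legacy::PassManager &MPM,
                                    bool EmitForLTO) {
      llvm::PassManagerBuilder PMB;
      PMB.OptLevel = 2;
      // Keep available_externally bodies when preparing for LTO; otherwise
      // populateModulePassManager schedules -elim-avail-extern before
      // GlobalDCE, as added above.
      PMB.PrepareForLTO = EmitForLTO;
      PMB.populateModulePassManager(MPM);
    }
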
diff --git a/lib/Transforms/InstCombine/InstCombineCompares.cpp b/lib/Transforms/InstCombine/InstCombineCompares.cpp
index 010b7b57c3e7..0bd6fd2f226d 100644
--- a/lib/Transforms/InstCombine/InstCombineCompares.cpp
+++ b/lib/Transforms/InstCombine/InstCombineCompares.cpp
@@ -3928,8 +3928,8 @@ Instruction *InstCombiner::visitFCmpInst(FCmpInst &I) {
Value *Op0 = I.getOperand(0), *Op1 = I.getOperand(1);
- if (Value *V =
- SimplifyFCmpInst(I.getPredicate(), Op0, Op1, DL, TLI, DT, AC, &I))
+ if (Value *V = SimplifyFCmpInst(I.getPredicate(), Op0, Op1,
+ I.getFastMathFlags(), DL, TLI, DT, AC, &I))
return ReplaceInstUsesWith(I, V);
// Simplify 'fcmp pred X, X'
diff --git a/lib/Transforms/InstCombine/InstCombineInternal.h b/lib/Transforms/InstCombine/InstCombineInternal.h
index 97ea8df757f8..ac934f1bd85c 100644
--- a/lib/Transforms/InstCombine/InstCombineInternal.h
+++ b/lib/Transforms/InstCombine/InstCombineInternal.h
@@ -15,6 +15,7 @@
#ifndef LLVM_LIB_TRANSFORMS_INSTCOMBINE_INSTCOMBINEINTERNAL_H
#define LLVM_LIB_TRANSFORMS_INSTCOMBINE_INSTCOMBINEINTERNAL_H
+#include "llvm/Analysis/AliasAnalysis.h"
#include "llvm/Analysis/AssumptionCache.h"
#include "llvm/Analysis/LoopInfo.h"
#include "llvm/Analysis/TargetFolder.h"
@@ -177,6 +178,8 @@ private:
// Mode in which we are running the combiner.
const bool MinimizeSize;
+ AliasAnalysis *AA;
+
// Required analyses.
// FIXME: These can never be null and should be references.
AssumptionCache *AC;
@@ -192,10 +195,11 @@ private:
public:
InstCombiner(InstCombineWorklist &Worklist, BuilderTy *Builder,
- bool MinimizeSize, AssumptionCache *AC, TargetLibraryInfo *TLI,
+ bool MinimizeSize, AliasAnalysis *AA,
+ AssumptionCache *AC, TargetLibraryInfo *TLI,
DominatorTree *DT, const DataLayout &DL, LoopInfo *LI)
: Worklist(Worklist), Builder(Builder), MinimizeSize(MinimizeSize),
- AC(AC), TLI(TLI), DT(DT), DL(DL), LI(LI), MadeIRChange(false) {}
+ AA(AA), AC(AC), TLI(TLI), DT(DT), DL(DL), LI(LI), MadeIRChange(false) {}
/// \brief Run the combiner over the entire worklist until it is empty.
///
diff --git a/lib/Transforms/InstCombine/InstCombineLoadStoreAlloca.cpp b/lib/Transforms/InstCombine/InstCombineLoadStoreAlloca.cpp
index e7a45330d955..e3179dbeece8 100644
--- a/lib/Transforms/InstCombine/InstCombineLoadStoreAlloca.cpp
+++ b/lib/Transforms/InstCombine/InstCombineLoadStoreAlloca.cpp
@@ -749,10 +749,25 @@ Instruction *InstCombiner::visitLoadInst(LoadInst &LI) {
// where there are several consecutive memory accesses to the same location,
// separated by a few arithmetic operations.
BasicBlock::iterator BBI = &LI;
- if (Value *AvailableVal = FindAvailableLoadedValue(Op, LI.getParent(), BBI,6))
+ AAMDNodes AATags;
+ if (Value *AvailableVal = FindAvailableLoadedValue(Op, LI.getParent(), BBI,
+ 6, AA, &AATags)) {
+ if (LoadInst *NLI = dyn_cast<LoadInst>(AvailableVal)) {
+ unsigned KnownIDs[] = {
+ LLVMContext::MD_tbaa,
+ LLVMContext::MD_alias_scope,
+ LLVMContext::MD_noalias,
+ LLVMContext::MD_range,
+ LLVMContext::MD_invariant_load,
+ LLVMContext::MD_nonnull,
+ };
+ combineMetadata(NLI, &LI, KnownIDs);
+ }
+
return ReplaceInstUsesWith(
LI, Builder->CreateBitOrPointerCast(AvailableVal, LI.getType(),
LI.getName() + ".cast"));
+ }
// load(gep null, ...) -> unreachable
if (GetElementPtrInst *GEPI = dyn_cast<GetElementPtrInst>(Op)) {
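
The hunk above lets instcombine forward an earlier loaded value to a later load of the same location and, when the available value is itself a load, merges the listed metadata kinds via combineMetadata so the surviving load does not claim guarantees (TBAA, range, nonnull, ...) that only one of the two loads carried. A hypothetical C++ fragment showing the source-level shape this targets; the names are made up.

    int reuse_load(int *p) {
      int a = *p;      // first load
      int b = a + 1;   // a little arithmetic in between
      int c = *p;      // second load of the same location: its value can be
                       // forwarded from 'a' when nothing in between may write *p
      return b + c;
    }
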
diff --git a/lib/Transforms/InstCombine/InstCombineVectorOps.cpp b/lib/Transforms/InstCombine/InstCombineVectorOps.cpp
index 24446c8578e0..273047279e90 100644
--- a/lib/Transforms/InstCombine/InstCombineVectorOps.cpp
+++ b/lib/Transforms/InstCombine/InstCombineVectorOps.cpp
@@ -14,6 +14,8 @@
#include "InstCombineInternal.h"
#include "llvm/ADT/DenseMap.h"
+#include "llvm/Analysis/InstructionSimplify.h"
+#include "llvm/Analysis/VectorUtils.h"
#include "llvm/IR/PatternMatch.h"
using namespace llvm;
using namespace PatternMatch;
@@ -60,56 +62,6 @@ static bool CheapToScalarize(Value *V, bool isConstant) {
return false;
}
-/// FindScalarElement - Given a vector and an element number, see if the scalar
-/// value is already around as a register, for example if it were inserted then
-/// extracted from the vector.
-static Value *FindScalarElement(Value *V, unsigned EltNo) {
- assert(V->getType()->isVectorTy() && "Not looking at a vector?");
- VectorType *VTy = cast<VectorType>(V->getType());
- unsigned Width = VTy->getNumElements();
- if (EltNo >= Width) // Out of range access.
- return UndefValue::get(VTy->getElementType());
-
- if (Constant *C = dyn_cast<Constant>(V))
- return C->getAggregateElement(EltNo);
-
- if (InsertElementInst *III = dyn_cast<InsertElementInst>(V)) {
- // If this is an insert to a variable element, we don't know what it is.
- if (!isa<ConstantInt>(III->getOperand(2)))
- return nullptr;
- unsigned IIElt = cast<ConstantInt>(III->getOperand(2))->getZExtValue();
-
- // If this is an insert to the element we are looking for, return the
- // inserted value.
- if (EltNo == IIElt)
- return III->getOperand(1);
-
- // Otherwise, the insertelement doesn't modify the value, recurse on its
- // vector input.
- return FindScalarElement(III->getOperand(0), EltNo);
- }
-
- if (ShuffleVectorInst *SVI = dyn_cast<ShuffleVectorInst>(V)) {
- unsigned LHSWidth = SVI->getOperand(0)->getType()->getVectorNumElements();
- int InEl = SVI->getMaskValue(EltNo);
- if (InEl < 0)
- return UndefValue::get(VTy->getElementType());
- if (InEl < (int)LHSWidth)
- return FindScalarElement(SVI->getOperand(0), InEl);
- return FindScalarElement(SVI->getOperand(1), InEl - LHSWidth);
- }
-
- // Extract a value from a vector add operation with a constant zero.
- Value *Val = nullptr; Constant *Con = nullptr;
- if (match(V, m_Add(m_Value(Val), m_Constant(Con)))) {
- if (Con->getAggregateElement(EltNo)->isNullValue())
- return FindScalarElement(Val, EltNo);
- }
-
- // Otherwise, we don't know.
- return nullptr;
-}
-
// If we have a PHI node with a vector type that has only 2 uses: feed
// itself and be an operand of extractelement at a constant location,
// try to replace the PHI of the vector type with a PHI of a scalar type.
@@ -178,6 +130,10 @@ Instruction *InstCombiner::scalarizePHI(ExtractElementInst &EI, PHINode *PN) {
}
Instruction *InstCombiner::visitExtractElementInst(ExtractElementInst &EI) {
+ if (Value *V = SimplifyExtractElementInst(
+ EI.getVectorOperand(), EI.getIndexOperand(), DL, TLI, DT, AC))
+ return ReplaceInstUsesWith(EI, V);
+
// If vector val is constant with all elements the same, replace EI with
// that element. We handle a known element # below.
if (Constant *C = dyn_cast<Constant>(EI.getOperand(0)))
@@ -190,10 +146,8 @@ Instruction *InstCombiner::visitExtractElementInst(ExtractElementInst &EI) {
unsigned IndexVal = IdxC->getZExtValue();
unsigned VectorWidth = EI.getVectorOperandType()->getNumElements();
- // If this is extracting an invalid index, turn this into undef, to avoid
- // crashing the code below.
- if (IndexVal >= VectorWidth)
- return ReplaceInstUsesWith(EI, UndefValue::get(EI.getType()));
+ // InstSimplify handles cases where the index is invalid.
+ assert(IndexVal < VectorWidth);
// This instruction only demands the single element from the input vector.
// If the input vector has a single use, simplify it based on this use
@@ -209,16 +163,13 @@ Instruction *InstCombiner::visitExtractElementInst(ExtractElementInst &EI) {
}
}
- if (Value *Elt = FindScalarElement(EI.getOperand(0), IndexVal))
- return ReplaceInstUsesWith(EI, Elt);
-
// If this extractelement is directly using a bitcast from a vector of
// the same number of elements, see if we can find the source element from
// it. In this case, we will end up needing to bitcast the scalars.
if (BitCastInst *BCI = dyn_cast<BitCastInst>(EI.getOperand(0))) {
if (VectorType *VT = dyn_cast<VectorType>(BCI->getOperand(0)->getType()))
if (VT->getNumElements() == VectorWidth)
- if (Value *Elt = FindScalarElement(BCI->getOperand(0), IndexVal))
+ if (Value *Elt = findScalarElement(BCI->getOperand(0), IndexVal))
return new BitCastInst(Elt, EI.getType());
}
diff --git a/lib/Transforms/InstCombine/InstructionCombining.cpp b/lib/Transforms/InstCombine/InstructionCombining.cpp
index 2a81689f7449..fd34a244f271 100644
--- a/lib/Transforms/InstCombine/InstructionCombining.cpp
+++ b/lib/Transforms/InstCombine/InstructionCombining.cpp
@@ -2174,16 +2174,9 @@ Instruction *InstCombiner::visitExtractValueInst(ExtractValueInst &EV) {
if (!EV.hasIndices())
return ReplaceInstUsesWith(EV, Agg);
- if (Constant *C = dyn_cast<Constant>(Agg)) {
- if (Constant *C2 = C->getAggregateElement(*EV.idx_begin())) {
- if (EV.getNumIndices() == 0)
- return ReplaceInstUsesWith(EV, C2);
- // Extract the remaining indices out of the constant indexed by the
- // first index
- return ExtractValueInst::Create(C2, EV.getIndices().slice(1));
- }
- return nullptr; // Can't handle other constants
- }
+ if (Value *V =
+ SimplifyExtractValueInst(Agg, EV.getIndices(), DL, TLI, DT, AC))
+ return ReplaceInstUsesWith(EV, V);
if (InsertValueInst *IV = dyn_cast<InsertValueInst>(Agg)) {
// We're extracting from an insertvalue instruction, compare the indices
@@ -2972,8 +2965,9 @@ static bool prepareICWorklistFromFunction(Function &F, const DataLayout &DL,
static bool
combineInstructionsOverFunction(Function &F, InstCombineWorklist &Worklist,
- AssumptionCache &AC, TargetLibraryInfo &TLI,
- DominatorTree &DT, LoopInfo *LI = nullptr) {
+ AliasAnalysis *AA, AssumptionCache &AC,
+ TargetLibraryInfo &TLI, DominatorTree &DT,
+ LoopInfo *LI = nullptr) {
// Minimizing size?
bool MinimizeSize = F.hasFnAttribute(Attribute::MinSize);
auto &DL = F.getParent()->getDataLayout();
@@ -2998,7 +2992,8 @@ combineInstructionsOverFunction(Function &F, InstCombineWorklist &Worklist,
if (prepareICWorklistFromFunction(F, DL, &TLI, Worklist))
Changed = true;
- InstCombiner IC(Worklist, &Builder, MinimizeSize, &AC, &TLI, &DT, DL, LI);
+ InstCombiner IC(Worklist, &Builder, MinimizeSize,
+ AA, &AC, &TLI, &DT, DL, LI);
if (IC.run())
Changed = true;
@@ -3017,7 +3012,8 @@ PreservedAnalyses InstCombinePass::run(Function &F,
auto *LI = AM->getCachedResult<LoopAnalysis>(F);
- if (!combineInstructionsOverFunction(F, Worklist, AC, TLI, DT, LI))
+ // FIXME: The AliasAnalysis is not yet supported in the new pass manager
+ if (!combineInstructionsOverFunction(F, Worklist, nullptr, AC, TLI, DT, LI))
// No changes, all analyses are preserved.
return PreservedAnalyses::all();
@@ -3050,6 +3046,7 @@ public:
void InstructionCombiningPass::getAnalysisUsage(AnalysisUsage &AU) const {
AU.setPreservesCFG();
+ AU.addRequired<AliasAnalysis>();
AU.addRequired<AssumptionCacheTracker>();
AU.addRequired<TargetLibraryInfoWrapperPass>();
AU.addRequired<DominatorTreeWrapperPass>();
@@ -3061,6 +3058,7 @@ bool InstructionCombiningPass::runOnFunction(Function &F) {
return false;
// Required analyses.
+ auto AA = &getAnalysis<AliasAnalysis>();
auto &AC = getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F);
auto &TLI = getAnalysis<TargetLibraryInfoWrapperPass>().getTLI();
auto &DT = getAnalysis<DominatorTreeWrapperPass>().getDomTree();
@@ -3069,7 +3067,7 @@ bool InstructionCombiningPass::runOnFunction(Function &F) {
auto *LIWP = getAnalysisIfAvailable<LoopInfoWrapperPass>();
auto *LI = LIWP ? &LIWP->getLoopInfo() : nullptr;
- return combineInstructionsOverFunction(F, Worklist, AC, TLI, DT, LI);
+ return combineInstructionsOverFunction(F, Worklist, AA, AC, TLI, DT, LI);
}
char InstructionCombiningPass::ID = 0;
@@ -3078,6 +3076,7 @@ INITIALIZE_PASS_BEGIN(InstructionCombiningPass, "instcombine",
INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker)
INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfoWrapperPass)
INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
+INITIALIZE_AG_DEPENDENCY(AliasAnalysis)
INITIALIZE_PASS_END(InstructionCombiningPass, "instcombine",
"Combine redundant instructions", false, false)
diff --git a/lib/Transforms/Scalar/GVN.cpp b/lib/Transforms/Scalar/GVN.cpp
index 60903c8b4aaf..d1eba6e70e57 100644
--- a/lib/Transforms/Scalar/GVN.cpp
+++ b/lib/Transforms/Scalar/GVN.cpp
@@ -656,11 +656,14 @@ namespace {
LeaderTableEntry* Prev = nullptr;
LeaderTableEntry* Curr = &LeaderTable[N];
- while (Curr->Val != I || Curr->BB != BB) {
+ while (Curr && (Curr->Val != I || Curr->BB != BB)) {
Prev = Curr;
Curr = Curr->Next;
}
+ if (!Curr)
+ return;
+
if (Prev) {
Prev->Next = Curr->Next;
} else {
@@ -1304,11 +1307,7 @@ static Value *ConstructSSAForLoadSet(LoadInst *LI,
if (V->getType()->getScalarType()->isPointerTy()) {
AliasAnalysis *AA = gvn.getAliasAnalysis();
- for (unsigned i = 0, e = NewPHIs.size(); i != e; ++i)
- AA->copyValue(LI, NewPHIs[i]);
-
- // Now that we've copied information to the new PHIs, scan through
- // them again and inform alias analysis that we've added potentially
+ // Scan the new PHIs and inform alias analysis that we've added potentially
// escaping uses to any values that are operands to these PHIs.
for (unsigned i = 0, e = NewPHIs.size(); i != e; ++i) {
PHINode *P = NewPHIs[i];
@@ -1796,7 +1795,7 @@ static void patchReplacementInstruction(Instruction *I, Value *Repl) {
// In general, GVN unifies expressions over different control-flow
// regions, and so we need a conservative combination of the noalias
// scopes.
- unsigned KnownIDs[] = {
+ static const unsigned KnownIDs[] = {
LLVMContext::MD_tbaa,
LLVMContext::MD_alias_scope,
LLVMContext::MD_noalias,
diff --git a/lib/Transforms/Scalar/IndVarSimplify.cpp b/lib/Transforms/Scalar/IndVarSimplify.cpp
index 6f0375487af6..2a954d9961f2 100644
--- a/lib/Transforms/Scalar/IndVarSimplify.cpp
+++ b/lib/Transforms/Scalar/IndVarSimplify.cpp
@@ -41,6 +41,7 @@
#include "llvm/IR/Instructions.h"
#include "llvm/IR/IntrinsicInst.h"
#include "llvm/IR/LLVMContext.h"
+#include "llvm/IR/PatternMatch.h"
#include "llvm/IR/Type.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Debug.h"
@@ -135,6 +136,10 @@ namespace {
PHINode *IndVar, SCEVExpander &Rewriter);
void SinkUnusedInvariants(Loop *L);
+
+ Value *ExpandSCEVIfNeeded(SCEVExpander &Rewriter, const SCEV *S, Loop *L,
+ Instruction *InsertPt, Type *Ty,
+ bool &IsHighCostExpansion);
};
}
@@ -496,6 +501,52 @@ struct RewritePhi {
};
}
+Value *IndVarSimplify::ExpandSCEVIfNeeded(SCEVExpander &Rewriter, const SCEV *S,
+ Loop *L, Instruction *InsertPt,
+ Type *ResultTy,
+ bool &IsHighCostExpansion) {
+ using namespace llvm::PatternMatch;
+
+ if (!Rewriter.isHighCostExpansion(S, L)) {
+ IsHighCostExpansion = false;
+ return Rewriter.expandCodeFor(S, ResultTy, InsertPt);
+ }
+
+ // Before expanding S into an expensive LLVM expression, see if we can use an
+ // already existing value as the expansion for S. There is potential to make
+ // this significantly smarter, but this simple heuristic already gets some
+ // interesting cases.
+
+ SmallVector<BasicBlock *, 4> Latches;
+ L->getLoopLatches(Latches);
+
+ for (BasicBlock *BB : Latches) {
+ ICmpInst::Predicate Pred;
+ Instruction *LHS, *RHS;
+ BasicBlock *TrueBB, *FalseBB;
+
+ if (!match(BB->getTerminator(),
+ m_Br(m_ICmp(Pred, m_Instruction(LHS), m_Instruction(RHS)),
+ TrueBB, FalseBB)))
+ continue;
+
+ if (SE->getSCEV(LHS) == S && DT->dominates(LHS, InsertPt)) {
+ IsHighCostExpansion = false;
+ return LHS;
+ }
+
+ if (SE->getSCEV(RHS) == S && DT->dominates(RHS, InsertPt)) {
+ IsHighCostExpansion = false;
+ return RHS;
+ }
+ }
+
+ // We didn't find anything, fall back to using SCEVExpander.
+ assert(Rewriter.isHighCostExpansion(S, L) && "this should not have changed!");
+ IsHighCostExpansion = true;
+ return Rewriter.expandCodeFor(S, ResultTy, InsertPt);
+}
+
//===----------------------------------------------------------------------===//
// RewriteLoopExitValues - Optimize IV users outside the loop.
// As a side effect, reduces the amount of IV processing within the loop.
@@ -628,7 +679,9 @@ void IndVarSimplify::RewriteLoopExitValues(Loop *L, SCEVExpander &Rewriter) {
continue;
}
- Value *ExitVal = Rewriter.expandCodeFor(ExitValue, PN->getType(), Inst);
+ bool HighCost = false;
+ Value *ExitVal = ExpandSCEVIfNeeded(Rewriter, ExitValue, L, Inst,
+ PN->getType(), HighCost);
DEBUG(dbgs() << "INDVARS: RLEV: AfterLoopVal = " << *ExitVal << '\n'
<< " LoopVal = " << *Inst << "\n");
@@ -637,7 +690,6 @@ void IndVarSimplify::RewriteLoopExitValues(Loop *L, SCEVExpander &Rewriter) {
DeadInsts.push_back(ExitVal);
continue;
}
- bool HighCost = Rewriter.isHighCostExpansion(ExitValue, L);
// Collect all the candidate PHINodes to be rewritten.
RewritePhiSet.push_back(
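
ExpandSCEVIfNeeded above prefers reusing an existing instruction whose SCEV equals the exit value over materializing a potentially expensive expansion; the latch comparison operands are the natural candidates. A hypothetical C++ loop where the heuristic could apply, assuming it reaches IndVarSimplify in roughly this form:

    unsigned count_up(unsigned a, unsigned b) {
      unsigned end = a + b;  // an existing instruction computing (a + b)
      unsigned i = 0;
      while (i != end)       // latch compare already uses 'end'
        ++i;
      // The exit value of 'i' is also (a + b); rather than expanding that SCEV
      // again, the new code reuses 'end' from the latch comparison.
      return i;
    }
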
diff --git a/lib/Transforms/Scalar/LICM.cpp b/lib/Transforms/Scalar/LICM.cpp
index f0e6d641b180..43fc50e588f8 100644
--- a/lib/Transforms/Scalar/LICM.cpp
+++ b/lib/Transforms/Scalar/LICM.cpp
@@ -602,7 +602,8 @@ static bool sink(Instruction &I, const LoopInfo *LI, const DominatorTree *DT,
// PHI nodes in exit blocks due to LCSSA form. Just RAUW them with clones of
// the instruction.
while (!I.use_empty()) {
- Instruction *User = I.user_back();
+ Value::user_iterator UI = I.user_begin();
+ auto *User = cast<Instruction>(*UI);
if (!DT->isReachableFromEntry(User->getParent())) {
User->replaceUsesOfWith(&I, UndefValue::get(I.getType()));
continue;
@@ -610,6 +611,16 @@ static bool sink(Instruction &I, const LoopInfo *LI, const DominatorTree *DT,
// The user must be a PHI node.
PHINode *PN = cast<PHINode>(User);
+ // Surprisingly, instructions can be used outside of loops without any
+ // exits. This can only happen in PHI nodes if the incoming block is
+ // unreachable.
+ Use &U = UI.getUse();
+ BasicBlock *BB = PN->getIncomingBlock(U);
+ if (!DT->isReachableFromEntry(BB)) {
+ U = UndefValue::get(I.getType());
+ continue;
+ }
+
BasicBlock *ExitBlock = PN->getParent();
assert(ExitBlockSet.count(ExitBlock) &&
"The LCSSA PHI is not in an exit block!");
diff --git a/lib/Transforms/Scalar/LoopDistribute.cpp b/lib/Transforms/Scalar/LoopDistribute.cpp
index 0325d268c325..1b9859b57790 100644
--- a/lib/Transforms/Scalar/LoopDistribute.cpp
+++ b/lib/Transforms/Scalar/LoopDistribute.cpp
@@ -34,6 +34,7 @@
#include "llvm/Support/Debug.h"
#include "llvm/Transforms/Utils/BasicBlockUtils.h"
#include "llvm/Transforms/Utils/Cloning.h"
+#include "llvm/Transforms/Utils/LoopVersioning.h"
#include <list>
#define LDIST_NAME "loop-distribute"
@@ -55,70 +56,6 @@ static cl::opt<bool> DistributeNonIfConvertible(
STATISTIC(NumLoopsDistributed, "Number of loops distributed");
-/// \brief Remaps instructions in a loop including the preheader.
-static void remapInstructionsInLoop(const SmallVectorImpl<BasicBlock *> &Blocks,
- ValueToValueMapTy &VMap) {
- // Rewrite the code to refer to itself.
- for (auto *BB : Blocks)
- for (auto &Inst : *BB)
- RemapInstruction(&Inst, VMap,
- RF_NoModuleLevelChanges | RF_IgnoreMissingEntries);
-}
-
-/// \brief Clones a loop \p OrigLoop. Returns the loop and the blocks in \p
-/// Blocks.
-///
-/// Updates LoopInfo and DominatorTree assuming the loop is dominated by block
-/// \p LoopDomBB. Insert the new blocks before block specified in \p Before.
-static Loop *cloneLoopWithPreheader(BasicBlock *Before, BasicBlock *LoopDomBB,
- Loop *OrigLoop, ValueToValueMapTy &VMap,
- const Twine &NameSuffix, LoopInfo *LI,
- DominatorTree *DT,
- SmallVectorImpl<BasicBlock *> &Blocks) {
- Function *F = OrigLoop->getHeader()->getParent();
- Loop *ParentLoop = OrigLoop->getParentLoop();
-
- Loop *NewLoop = new Loop();
- if (ParentLoop)
- ParentLoop->addChildLoop(NewLoop);
- else
- LI->addTopLevelLoop(NewLoop);
-
- BasicBlock *OrigPH = OrigLoop->getLoopPreheader();
- BasicBlock *NewPH = CloneBasicBlock(OrigPH, VMap, NameSuffix, F);
- // To rename the loop PHIs.
- VMap[OrigPH] = NewPH;
- Blocks.push_back(NewPH);
-
- // Update LoopInfo.
- if (ParentLoop)
- ParentLoop->addBasicBlockToLoop(NewPH, *LI);
-
- // Update DominatorTree.
- DT->addNewBlock(NewPH, LoopDomBB);
-
- for (BasicBlock *BB : OrigLoop->getBlocks()) {
- BasicBlock *NewBB = CloneBasicBlock(BB, VMap, NameSuffix, F);
- VMap[BB] = NewBB;
-
- // Update LoopInfo.
- NewLoop->addBasicBlockToLoop(NewBB, *LI);
-
- // Update DominatorTree.
- BasicBlock *IDomBB = DT->getNode(BB)->getIDom()->getBlock();
- DT->addNewBlock(NewBB, cast<BasicBlock>(VMap[IDomBB]));
-
- Blocks.push_back(NewBB);
- }
-
- // Move them physically from the end of the block list.
- F->getBasicBlockList().splice(Before, F->getBasicBlockList(), NewPH);
- F->getBasicBlockList().splice(Before, F->getBasicBlockList(),
- NewLoop->getHeader(), F->end());
-
- return NewLoop;
-}
-
namespace {
/// \brief Maintains the set of instructions of the loop for a partition before
/// cloning. After cloning, it hosts the new loop.
@@ -204,7 +141,9 @@ public:
ValueToValueMapTy &getVMap() { return VMap; }
/// \brief Remaps the cloned instructions using VMap.
- void remapInstructions() { remapInstructionsInLoop(ClonedLoopBlocks, VMap); }
+ void remapInstructions() {
+ remapInstructionsInBlocks(ClonedLoopBlocks, VMap);
+ }
/// \brief Based on the set of instructions selected for this partition,
/// removes the unnecessary ones.
@@ -493,15 +432,14 @@ public:
/// partitions its entry is set to -1.
SmallVector<int, 8>
computePartitionSetForPointers(const LoopAccessInfo &LAI) {
- const LoopAccessInfo::RuntimePointerCheck *RtPtrCheck =
- LAI.getRuntimePointerCheck();
+ const RuntimePointerChecking *RtPtrCheck = LAI.getRuntimePointerChecking();
unsigned N = RtPtrCheck->Pointers.size();
SmallVector<int, 8> PtrToPartitions(N);
for (unsigned I = 0; I < N; ++I) {
- Value *Ptr = RtPtrCheck->Pointers[I];
+ Value *Ptr = RtPtrCheck->Pointers[I].PointerValue;
auto Instructions =
- LAI.getInstructionsForAccess(Ptr, RtPtrCheck->IsWritePtr[I]);
+ LAI.getInstructionsForAccess(Ptr, RtPtrCheck->Pointers[I].IsWritePtr);
int &Partition = PtrToPartitions[I];
// First set it to uninitialized.
@@ -629,121 +567,6 @@ private:
AccessesType Accesses;
};
-/// \brief Handles the loop versioning based on memchecks.
-class LoopVersioning {
-public:
- LoopVersioning(const LoopAccessInfo &LAI, Loop *L, LoopInfo *LI,
- DominatorTree *DT,
- const SmallVector<int, 8> *PtrToPartition = nullptr)
- : VersionedLoop(L), NonVersionedLoop(nullptr),
- PtrToPartition(PtrToPartition), LAI(LAI), LI(LI), DT(DT) {}
-
- /// \brief Returns true if we need memchecks to disambiguate may-aliasing
- /// accesses.
- bool needsRuntimeChecks() const {
- return LAI.getRuntimePointerCheck()->needsAnyChecking(PtrToPartition);
- }
-
- /// \brief Performs the CFG manipulation part of versioning the loop including
- /// the DominatorTree and LoopInfo updates.
- void versionLoop(Pass *P) {
- Instruction *FirstCheckInst;
- Instruction *MemRuntimeCheck;
- // Add the memcheck in the original preheader (this is empty initially).
- BasicBlock *MemCheckBB = VersionedLoop->getLoopPreheader();
- std::tie(FirstCheckInst, MemRuntimeCheck) =
- LAI.addRuntimeCheck(MemCheckBB->getTerminator(), PtrToPartition);
- assert(MemRuntimeCheck && "called even though needsAnyChecking = false");
-
- // Rename the block to make the IR more readable.
- MemCheckBB->setName(VersionedLoop->getHeader()->getName() +
- ".lver.memcheck");
-
- // Create empty preheader for the loop (and after cloning for the
- // non-versioned loop).
- BasicBlock *PH =
- SplitBlock(MemCheckBB, MemCheckBB->getTerminator(), DT, LI);
- PH->setName(VersionedLoop->getHeader()->getName() + ".ph");
-
- // Clone the loop including the preheader.
- //
- // FIXME: This does not currently preserve SimplifyLoop because the exit
- // block is a join between the two loops.
- SmallVector<BasicBlock *, 8> NonVersionedLoopBlocks;
- NonVersionedLoop =
- cloneLoopWithPreheader(PH, MemCheckBB, VersionedLoop, VMap,
- ".lver.orig", LI, DT, NonVersionedLoopBlocks);
- remapInstructionsInLoop(NonVersionedLoopBlocks, VMap);
-
- // Insert the conditional branch based on the result of the memchecks.
- Instruction *OrigTerm = MemCheckBB->getTerminator();
- BranchInst::Create(NonVersionedLoop->getLoopPreheader(),
- VersionedLoop->getLoopPreheader(), MemRuntimeCheck,
- OrigTerm);
- OrigTerm->eraseFromParent();
-
- // The loops merge in the original exit block. This is now dominated by the
- // memchecking block.
- DT->changeImmediateDominator(VersionedLoop->getExitBlock(), MemCheckBB);
- }
-
- /// \brief Adds the necessary PHI nodes for the versioned loops based on the
- /// loop-defined values used outside of the loop.
- void addPHINodes(const SmallVectorImpl<Instruction *> &DefsUsedOutside) {
- BasicBlock *PHIBlock = VersionedLoop->getExitBlock();
- assert(PHIBlock && "No single successor to loop exit block");
-
- for (auto *Inst : DefsUsedOutside) {
- auto *NonVersionedLoopInst = cast<Instruction>(VMap[Inst]);
- PHINode *PN;
-
- // First see if we have a single-operand PHI with the value defined by the
- // original loop.
- for (auto I = PHIBlock->begin(); (PN = dyn_cast<PHINode>(I)); ++I) {
- assert(PN->getNumOperands() == 1 &&
- "Exit block should only have on predecessor");
- if (PN->getIncomingValue(0) == Inst)
- break;
- }
- // If not create it.
- if (!PN) {
- PN = PHINode::Create(Inst->getType(), 2, Inst->getName() + ".lver",
- PHIBlock->begin());
- for (auto *User : Inst->users())
- if (!VersionedLoop->contains(cast<Instruction>(User)->getParent()))
- User->replaceUsesOfWith(Inst, PN);
- PN->addIncoming(Inst, VersionedLoop->getExitingBlock());
- }
- // Add the new incoming value from the non-versioned loop.
- PN->addIncoming(NonVersionedLoopInst,
- NonVersionedLoop->getExitingBlock());
- }
- }
-
-private:
- /// \brief The original loop. This becomes the "versioned" one, i.e. control
- /// goes if the memchecks all pass.
- Loop *VersionedLoop;
- /// \brief The fall-back loop, i.e. if any of the memchecks fail.
- Loop *NonVersionedLoop;
-
- /// \brief For each memory pointer it contains the partitionId it is used in.
- /// If nullptr, no partitioning is used.
- ///
- /// The I-th entry corresponds to I-th entry in LAI.getRuntimePointerCheck().
- /// If the pointer is used in multiple partitions the entry is set to -1.
- const SmallVector<int, 8> *PtrToPartition;
-
- /// \brief This maps the instructions from VersionedLoop to their counterpart
- /// in NonVersionedLoop.
- ValueToValueMapTy VMap;
-
- /// \brief Analyses used.
- const LoopAccessInfo &LAI;
- LoopInfo *LI;
- DominatorTree *DT;
-};
-
/// \brief Returns the instructions that use values defined in the loop.
static SmallVector<Instruction *, 8> findDefsUsedOutsideOfLoop(Loop *L) {
SmallVector<Instruction *, 8> UsedOutside;
@@ -929,7 +752,7 @@ private:
LoopVersioning LVer(LAI, L, LI, DT, &PtrToPartition);
if (LVer.needsRuntimeChecks()) {
DEBUG(dbgs() << "\nPointers:\n");
- DEBUG(LAI.getRuntimePointerCheck()->print(dbgs(), 0, &PtrToPartition));
+ DEBUG(LAI.getRuntimePointerChecking()->print(dbgs(), 0, &PtrToPartition));
LVer.versionLoop(this);
LVer.addPHINodes(DefsUsedOutside);
}
diff --git a/lib/Transforms/Scalar/LoopIdiomRecognize.cpp b/lib/Transforms/Scalar/LoopIdiomRecognize.cpp
index 714ce914a8b3..a21ca2417ca1 100644
--- a/lib/Transforms/Scalar/LoopIdiomRecognize.cpp
+++ b/lib/Transforms/Scalar/LoopIdiomRecognize.cpp
@@ -508,7 +508,7 @@ void NclPopcountRecognize::transform(Instruction *CntInst,
ICmpInst *NewPreCond =
cast<ICmpInst>(Builder.CreateICmp(PreCond->getPredicate(), Opnd0, Opnd1));
- PreCond->replaceAllUsesWith(NewPreCond);
+ PreCondBr->setCondition(NewPreCond);
RecursivelyDeleteTriviallyDeadInstructions(PreCond, TLI);
}
diff --git a/lib/Transforms/Scalar/LoopInterchange.cpp b/lib/Transforms/Scalar/LoopInterchange.cpp
index 25546553fd4d..9d7e57ffebac 100644
--- a/lib/Transforms/Scalar/LoopInterchange.cpp
+++ b/lib/Transforms/Scalar/LoopInterchange.cpp
@@ -282,21 +282,21 @@ static void populateWorklist(Loop &L, SmallVector<LoopVector, 8> &V) {
DEBUG(dbgs() << "Calling populateWorklist called\n");
LoopVector LoopList;
Loop *CurrentLoop = &L;
- std::vector<Loop *> vec = CurrentLoop->getSubLoopsVector();
- while (vec.size() != 0) {
+ const std::vector<Loop *> *Vec = &CurrentLoop->getSubLoops();
+ while (!Vec->empty()) {
// The current loop has multiple subloops in it hence it is not tightly
// nested.
// Discard all loops above it added into Worklist.
- if (vec.size() != 1) {
+ if (Vec->size() != 1) {
LoopList.clear();
return;
}
LoopList.push_back(CurrentLoop);
- CurrentLoop = *(vec.begin());
- vec = CurrentLoop->getSubLoopsVector();
+ CurrentLoop = Vec->front();
+ Vec = &CurrentLoop->getSubLoops();
}
LoopList.push_back(CurrentLoop);
- V.push_back(LoopList);
+ V.push_back(std::move(LoopList));
}
static PHINode *getInductionVariable(Loop *L, ScalarEvolution *SE) {
diff --git a/lib/Transforms/Scalar/LoopUnrollPass.cpp b/lib/Transforms/Scalar/LoopUnrollPass.cpp
index 9e7558d9c45f..d78db6c369b3 100644
--- a/lib/Transforms/Scalar/LoopUnrollPass.cpp
+++ b/lib/Transforms/Scalar/LoopUnrollPass.cpp
@@ -840,8 +840,10 @@ bool LoopUnroll::runOnLoop(Loop *L, LPPassManager &LPM) {
// Reduce count based on the type of unrolling and the threshold values.
unsigned OriginalCount = Count;
- bool AllowRuntime = UserRuntime ? CurrentRuntime : UP.Runtime;
- if (HasRuntimeUnrollDisablePragma(L)) {
+ bool AllowRuntime =
+ (PragmaCount > 0) || (UserRuntime ? CurrentRuntime : UP.Runtime);
+ // Don't unroll a runtime trip count loop with unroll full pragma.
+ if (HasRuntimeUnrollDisablePragma(L) || PragmaFullUnroll) {
AllowRuntime = false;
}
if (Unrolling == Partial) {
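
The hunk above makes an explicit unroll count pragma imply runtime unrolling, while unroll(full), which cannot be honored for an unknown trip count, now also suppresses it. Two hypothetical C++ loops showing the pragma spellings involved (Clang loop-hint syntax; the function bodies are illustrative only):

    void scale_by_two(float *x, int n) {
      // An explicit count now permits runtime unrolling even if the target or
      // command line leaves it off by default.
    #pragma clang loop unroll_count(4)
      for (int i = 0; i < n; ++i)
        x[i] *= 2.0f;
    }

    void scale_by_two_full(float *x, int n) {
      // unroll(full) with a runtime trip count: full unrolling is impossible,
      // and the new check keeps runtime unrolling from kicking in as well.
    #pragma clang loop unroll(full)
      for (int i = 0; i < n; ++i)
        x[i] *= 2.0f;
    }
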
diff --git a/lib/Transforms/Scalar/MergedLoadStoreMotion.cpp b/lib/Transforms/Scalar/MergedLoadStoreMotion.cpp
index 243db8d70ca2..643f3740eedd 100644
--- a/lib/Transforms/Scalar/MergedLoadStoreMotion.cpp
+++ b/lib/Transforms/Scalar/MergedLoadStoreMotion.cpp
@@ -301,10 +301,6 @@ void MergedLoadStoreMotion::hoistInstruction(BasicBlock *BB,
// Merged instruction
Instruction *HoistedInst = HoistCand->clone();
- // Notify AA of the new value.
- if (isa<LoadInst>(HoistCand))
- AA->copyValue(HoistCand, HoistedInst);
-
// Hoist instruction.
HoistedInst->insertBefore(HoistPt);
@@ -451,9 +447,6 @@ PHINode *MergedLoadStoreMotion::getPHIOperand(BasicBlock *BB, StoreInst *S0,
NewPN->addIncoming(Opd1, S0->getParent());
NewPN->addIncoming(Opd2, S1->getParent());
if (NewPN->getType()->getScalarType()->isPointerTy()) {
- // Notify AA of the new value.
- AA->copyValue(Opd1, NewPN);
- AA->copyValue(Opd2, NewPN);
// AA needs to be informed when a PHI-use of the pointer value is added
for (unsigned I = 0, E = NewPN->getNumIncomingValues(); I != E; ++I) {
unsigned J = PHINode::getOperandNumForIncomingValue(I);
@@ -491,7 +484,6 @@ bool MergedLoadStoreMotion::sinkStore(BasicBlock *BB, StoreInst *S0,
// Create the new store to be inserted at the join point.
StoreInst *SNew = (StoreInst *)(S0->clone());
Instruction *ANew = A0->clone();
- AA->copyValue(S0, SNew);
SNew->insertBefore(InsertPt);
ANew->insertBefore(SNew);
diff --git a/lib/Transforms/Scalar/PlaceSafepoints.cpp b/lib/Transforms/Scalar/PlaceSafepoints.cpp
index 9ecaf102574a..366301ad731a 100644
--- a/lib/Transforms/Scalar/PlaceSafepoints.cpp
+++ b/lib/Transforms/Scalar/PlaceSafepoints.cpp
@@ -399,8 +399,8 @@ static bool doesNotRequireEntrySafepointBefore(const CallSite &CS) {
// at least if they do, are leaf functions that cause only finite stack
// growth. In particular, the optimizer likes to form things like memsets
// out of stores in the original IR. Another important example is
- // llvm.frameescape which must occur in the entry block. Inserting a
- // safepoint before it is not legal since it could push the frameescape
+ // llvm.localescape which must occur in the entry block. Inserting a
+ // safepoint before it is not legal since it could push the localescape
// out of the entry block.
return true;
}
diff --git a/lib/Transforms/Scalar/SCCP.cpp b/lib/Transforms/Scalar/SCCP.cpp
index 305175ff8f73..4d3a708fa20e 100644
--- a/lib/Transforms/Scalar/SCCP.cpp
+++ b/lib/Transforms/Scalar/SCCP.cpp
@@ -1799,11 +1799,10 @@ bool IPSCCP::runOnModule(Module &M) {
if (!TI->use_empty())
TI->replaceAllUsesWith(UndefValue::get(TI->getType()));
TI->eraseFromParent();
+ new UnreachableInst(M.getContext(), BB);
if (&*BB != &F->front())
BlocksToErase.push_back(BB);
- else
- new UnreachableInst(M.getContext(), BB);
continue;
}
diff --git a/lib/Transforms/Scalar/SROA.cpp b/lib/Transforms/Scalar/SROA.cpp
index 056dd11b5ab3..d1a0a82b9b08 100644
--- a/lib/Transforms/Scalar/SROA.cpp
+++ b/lib/Transforms/Scalar/SROA.cpp
@@ -2593,13 +2593,21 @@ private:
V = rewriteIntegerLoad(LI);
} else if (NewBeginOffset == NewAllocaBeginOffset &&
canConvertValue(DL, NewAllocaTy, LI.getType())) {
- V = IRB.CreateAlignedLoad(&NewAI, NewAI.getAlignment(), LI.isVolatile(),
- LI.getName());
+ LoadInst *NewLI = IRB.CreateAlignedLoad(&NewAI, NewAI.getAlignment(),
+ LI.isVolatile(), LI.getName());
+ if (LI.isVolatile())
+ NewLI->setAtomic(LI.getOrdering(), LI.getSynchScope());
+
+ V = NewLI;
} else {
Type *LTy = TargetTy->getPointerTo();
- V = IRB.CreateAlignedLoad(getNewAllocaSlicePtr(IRB, LTy),
- getSliceAlign(TargetTy), LI.isVolatile(),
- LI.getName());
+ LoadInst *NewLI = IRB.CreateAlignedLoad(getNewAllocaSlicePtr(IRB, LTy),
+ getSliceAlign(TargetTy),
+ LI.isVolatile(), LI.getName());
+ if (LI.isVolatile())
+ NewLI->setAtomic(LI.getOrdering(), LI.getSynchScope());
+
+ V = NewLI;
IsPtrAdjusted = true;
}
V = convertValue(DL, IRB, V, TargetTy);
@@ -2722,7 +2730,8 @@ private:
NewSI = IRB.CreateAlignedStore(V, NewPtr, getSliceAlign(V->getType()),
SI.isVolatile());
}
- (void)NewSI;
+ if (SI.isVolatile())
+ NewSI->setAtomic(SI.getOrdering(), SI.getSynchScope());
Pass.DeadInsts.insert(&SI);
deleteIfTriviallyDead(OldOp);
diff --git a/lib/Transforms/Utils/BasicBlockUtils.cpp b/lib/Transforms/Utils/BasicBlockUtils.cpp
index 53471de6154c..ef7dacac79cb 100644
--- a/lib/Transforms/Utils/BasicBlockUtils.cpp
+++ b/lib/Transforms/Utils/BasicBlockUtils.cpp
@@ -440,8 +440,6 @@ static void UpdatePHINodes(BasicBlock *OrigBB, BasicBlock *NewBB,
// Create the new PHI node, insert it into NewBB at the end of the block
PHINode *NewPHI =
PHINode::Create(PN->getType(), Preds.size(), PN->getName() + ".ph", BI);
- if (AA)
- AA->copyValue(PN, NewPHI);
// NOTE! This loop walks backwards for a reason! First off, this minimizes
// the cost of removal if we end up removing a large number of values, and
diff --git a/lib/Transforms/Utils/CMakeLists.txt b/lib/Transforms/Utils/CMakeLists.txt
index 470e2d09132e..716e655affb9 100644
--- a/lib/Transforms/Utils/CMakeLists.txt
+++ b/lib/Transforms/Utils/CMakeLists.txt
@@ -22,6 +22,7 @@ add_llvm_library(LLVMTransformUtils
LoopUnroll.cpp
LoopUnrollRuntime.cpp
LoopUtils.cpp
+ LoopVersioning.cpp
LowerInvoke.cpp
LowerSwitch.cpp
Mem2Reg.cpp
diff --git a/lib/Transforms/Utils/CloneFunction.cpp b/lib/Transforms/Utils/CloneFunction.cpp
index 4f8d1dfbe5df..cc4d6c6fb192 100644
--- a/lib/Transforms/Utils/CloneFunction.cpp
+++ b/lib/Transforms/Utils/CloneFunction.cpp
@@ -17,6 +17,7 @@
#include "llvm/ADT/SmallVector.h"
#include "llvm/Analysis/ConstantFolding.h"
#include "llvm/Analysis/InstructionSimplify.h"
+#include "llvm/Analysis/LoopInfo.h"
#include "llvm/IR/CFG.h"
#include "llvm/IR/Constants.h"
#include "llvm/IR/DebugInfo.h"
@@ -720,3 +721,68 @@ void llvm::CloneAndPruneFunctionInto(Function *NewFunc, const Function *OldFunc,
ModuleLevelChanges, Returns, NameSuffix, CodeInfo,
nullptr);
}
+
+/// \brief Remaps instructions in \p Blocks using the mapping in \p VMap.
+void llvm::remapInstructionsInBlocks(
+ const SmallVectorImpl<BasicBlock *> &Blocks, ValueToValueMapTy &VMap) {
+ // Rewrite the code to refer to itself.
+ for (auto *BB : Blocks)
+ for (auto &Inst : *BB)
+ RemapInstruction(&Inst, VMap,
+ RF_NoModuleLevelChanges | RF_IgnoreMissingEntries);
+}
+
+/// \brief Clones a loop \p OrigLoop. Returns the loop and the blocks in \p
+/// Blocks.
+///
+/// Updates LoopInfo and DominatorTree assuming the loop is dominated by block
+/// \p LoopDomBB. Insert the new blocks before block specified in \p Before.
+Loop *llvm::cloneLoopWithPreheader(BasicBlock *Before, BasicBlock *LoopDomBB,
+ Loop *OrigLoop, ValueToValueMapTy &VMap,
+ const Twine &NameSuffix, LoopInfo *LI,
+ DominatorTree *DT,
+ SmallVectorImpl<BasicBlock *> &Blocks) {
+ Function *F = OrigLoop->getHeader()->getParent();
+ Loop *ParentLoop = OrigLoop->getParentLoop();
+
+ Loop *NewLoop = new Loop();
+ if (ParentLoop)
+ ParentLoop->addChildLoop(NewLoop);
+ else
+ LI->addTopLevelLoop(NewLoop);
+
+ BasicBlock *OrigPH = OrigLoop->getLoopPreheader();
+ assert(OrigPH && "No preheader");
+ BasicBlock *NewPH = CloneBasicBlock(OrigPH, VMap, NameSuffix, F);
+ // To rename the loop PHIs.
+ VMap[OrigPH] = NewPH;
+ Blocks.push_back(NewPH);
+
+ // Update LoopInfo.
+ if (ParentLoop)
+ ParentLoop->addBasicBlockToLoop(NewPH, *LI);
+
+ // Update DominatorTree.
+ DT->addNewBlock(NewPH, LoopDomBB);
+
+ for (BasicBlock *BB : OrigLoop->getBlocks()) {
+ BasicBlock *NewBB = CloneBasicBlock(BB, VMap, NameSuffix, F);
+ VMap[BB] = NewBB;
+
+ // Update LoopInfo.
+ NewLoop->addBasicBlockToLoop(NewBB, *LI);
+
+ // Update DominatorTree.
+ BasicBlock *IDomBB = DT->getNode(BB)->getIDom()->getBlock();
+ DT->addNewBlock(NewBB, cast<BasicBlock>(VMap[IDomBB]));
+
+ Blocks.push_back(NewBB);
+ }
+
+ // Move them physically from the end of the block list.
+ F->getBasicBlockList().splice(Before, F->getBasicBlockList(), NewPH);
+ F->getBasicBlockList().splice(Before, F->getBasicBlockList(),
+ NewLoop->getHeader(), F->end());
+
+ return NewLoop;
+}
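For illustration, a hedged sketch of how the two new utilities are meant to be driven together, mirroring the call sequence used by LoopVersioning.cpp later in this patch; InsertBefore, DomBB, L, LI and DT stand in for values a caller already has, and the ".clone" suffix is arbitrary.

ValueToValueMapTy VMap;
SmallVector<BasicBlock *, 8> ClonedBlocks;
// Clone the loop plus a fresh preheader in front of InsertBefore, recording
// the old-to-new value mapping and the blocks that were created.
Loop *ClonedLoop = cloneLoopWithPreheader(InsertBefore, DomBB, L, VMap,
                                          ".clone", LI, DT, ClonedBlocks);
// Rewrite the cloned instructions so they refer to the cloned values rather
// than to the originals.
remapInstructionsInBlocks(ClonedBlocks, VMap);
(void)ClonedLoop;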
diff --git a/lib/Transforms/Utils/Local.cpp b/lib/Transforms/Utils/Local.cpp
index 56085579b61c..50ca6234d0b7 100644
--- a/lib/Transforms/Utils/Local.cpp
+++ b/lib/Transforms/Utils/Local.cpp
@@ -900,13 +900,10 @@ static unsigned enforceKnownAlignment(Value *V, unsigned Align,
if (auto *GO = dyn_cast<GlobalObject>(V)) {
// If there is a large requested alignment and we can, bump up the alignment
- // of the global.
- if (GO->isDeclaration())
- return Align;
- // If the memory we set aside for the global may not be the memory used by
- // the final program then it is impossible for us to reliably enforce the
- // preferred alignment.
- if (GO->isWeakForLinker())
+ // of the global. If the memory we set aside for the global may not be the
+ // memory used by the final program then it is impossible for us to reliably
+ // enforce the preferred alignment.
+ if (!GO->isStrongDefinitionForLinker())
return Align;
if (GO->getAlignment() >= PrefAlign)
diff --git a/lib/Transforms/Utils/LoopSimplify.cpp b/lib/Transforms/Utils/LoopSimplify.cpp
index 2e7d21cb171f..5c98043e4632 100644
--- a/lib/Transforms/Utils/LoopSimplify.cpp
+++ b/lib/Transforms/Utils/LoopSimplify.cpp
@@ -403,7 +403,6 @@ static BasicBlock *insertUniqueBackedgeBlock(Loop *L, BasicBlock *Preheader,
PHINode *PN = cast<PHINode>(I);
PHINode *NewPN = PHINode::Create(PN->getType(), BackedgeBlocks.size(),
PN->getName()+".be", BETerminator);
- if (AA) AA->copyValue(PN, NewPN);
// Loop over the PHI node, moving all entries except the one for the
// preheader over to the new PHI node.
diff --git a/lib/Transforms/Utils/LoopVersioning.cpp b/lib/Transforms/Utils/LoopVersioning.cpp
new file mode 100644
index 000000000000..832079d2cf63
--- /dev/null
+++ b/lib/Transforms/Utils/LoopVersioning.cpp
@@ -0,0 +1,106 @@
+//===- LoopVersioning.cpp - Utility to version a loop ---------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines a utility class to perform loop versioning. The versioned
+// loop speculates that otherwise may-aliasing memory accesses don't overlap and
+// emits checks to prove this.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Analysis/LoopAccessAnalysis.h"
+#include "llvm/Analysis/LoopInfo.h"
+#include "llvm/IR/Dominators.h"
+#include "llvm/Transforms/Utils/BasicBlockUtils.h"
+#include "llvm/Transforms/Utils/Cloning.h"
+#include "llvm/Transforms/Utils/LoopVersioning.h"
+
+using namespace llvm;
+
+LoopVersioning::LoopVersioning(const LoopAccessInfo &LAI, Loop *L, LoopInfo *LI,
+ DominatorTree *DT,
+ const SmallVector<int, 8> *PtrToPartition)
+ : VersionedLoop(L), NonVersionedLoop(nullptr),
+ PtrToPartition(PtrToPartition), LAI(LAI), LI(LI), DT(DT) {
+ assert(L->getExitBlock() && "No single exit block");
+ assert(L->getLoopPreheader() && "No preheader");
+}
+
+bool LoopVersioning::needsRuntimeChecks() const {
+ return LAI.getRuntimePointerChecking()->needsAnyChecking(PtrToPartition);
+}
+
+void LoopVersioning::versionLoop(Pass *P) {
+ Instruction *FirstCheckInst;
+ Instruction *MemRuntimeCheck;
+ // Add the memcheck in the original preheader (this is empty initially).
+ BasicBlock *MemCheckBB = VersionedLoop->getLoopPreheader();
+ std::tie(FirstCheckInst, MemRuntimeCheck) =
+ LAI.addRuntimeCheck(MemCheckBB->getTerminator(), PtrToPartition);
+ assert(MemRuntimeCheck && "called even though needsAnyChecking = false");
+
+ // Rename the block to make the IR more readable.
+ MemCheckBB->setName(VersionedLoop->getHeader()->getName() + ".lver.memcheck");
+
+ // Create empty preheader for the loop (and after cloning for the
+ // non-versioned loop).
+ BasicBlock *PH = SplitBlock(MemCheckBB, MemCheckBB->getTerminator(), DT, LI);
+ PH->setName(VersionedLoop->getHeader()->getName() + ".ph");
+
+ // Clone the loop including the preheader.
+ //
+ // FIXME: This does not currently preserve SimplifyLoop because the exit
+ // block is a join between the two loops.
+ SmallVector<BasicBlock *, 8> NonVersionedLoopBlocks;
+ NonVersionedLoop =
+ cloneLoopWithPreheader(PH, MemCheckBB, VersionedLoop, VMap, ".lver.orig",
+ LI, DT, NonVersionedLoopBlocks);
+ remapInstructionsInBlocks(NonVersionedLoopBlocks, VMap);
+
+ // Insert the conditional branch based on the result of the memchecks.
+ Instruction *OrigTerm = MemCheckBB->getTerminator();
+ BranchInst::Create(NonVersionedLoop->getLoopPreheader(),
+ VersionedLoop->getLoopPreheader(), MemRuntimeCheck,
+ OrigTerm);
+ OrigTerm->eraseFromParent();
+
+ // The loops merge in the original exit block. This is now dominated by the
+ // memchecking block.
+ DT->changeImmediateDominator(VersionedLoop->getExitBlock(), MemCheckBB);
+}
+
+void LoopVersioning::addPHINodes(
+ const SmallVectorImpl<Instruction *> &DefsUsedOutside) {
+ BasicBlock *PHIBlock = VersionedLoop->getExitBlock();
+ assert(PHIBlock && "No single successor to loop exit block");
+
+ for (auto *Inst : DefsUsedOutside) {
+ auto *NonVersionedLoopInst = cast<Instruction>(VMap[Inst]);
+ PHINode *PN;
+
+ // First see if we have a single-operand PHI with the value defined by the
+ // original loop.
+ for (auto I = PHIBlock->begin(); (PN = dyn_cast<PHINode>(I)); ++I) {
+ assert(PN->getNumOperands() == 1 &&
+ "Exit block should only have one predecessor");
+ if (PN->getIncomingValue(0) == Inst)
+ break;
+ }
+ // If not, create it.
+ if (!PN) {
+ PN = PHINode::Create(Inst->getType(), 2, Inst->getName() + ".lver",
+ PHIBlock->begin());
+ for (auto *User : Inst->users())
+ if (!VersionedLoop->contains(cast<Instruction>(User)->getParent()))
+ User->replaceUsesOfWith(Inst, PN);
+ PN->addIncoming(Inst, VersionedLoop->getExitingBlock());
+ }
+ // Add the new incoming value from the non-versioned loop.
+ PN->addIncoming(NonVersionedLoopInst, NonVersionedLoop->getExitingBlock());
+ }
+}
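For illustration, a sketch of the intended call sequence for this new class; LAI, L, LI, DT, P and DefsUsedOutside are assumed to come from the caller (LoopDistribute is the in-tree user of this interface), and passing nullptr for PtrToPartition is assumed to mean "consider every pointer check", since the header declaring any default is not part of this diff.

LoopVersioning LVer(LAI, L, LI, DT, /*PtrToPartition=*/nullptr);
if (LVer.needsRuntimeChecks()) {
  // Emit the memchecks and branch either to the original (versioned) loop or
  // to the freshly cloned non-versioned copy.
  LVer.versionLoop(P);
  // Merge values live out of either copy with PHIs in the common exit block.
  LVer.addPHINodes(DefsUsedOutside);
}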
diff --git a/lib/Transforms/Vectorize/LoopVectorize.cpp b/lib/Transforms/Vectorize/LoopVectorize.cpp
index 5ba14174ac79..69ca2688c810 100644
--- a/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -148,8 +148,9 @@ static cl::opt<unsigned> MaxInterleaveGroupFactor(
cl::desc("Maximum factor for an interleaved access group (default = 8)"),
cl::init(8));
-/// We don't unroll loops with a known constant trip count below this number.
-static const unsigned TinyTripCountUnrollThreshold = 128;
+/// We don't interleave loops with a known constant trip count below this
+/// number.
+static const unsigned TinyTripCountInterleaveThreshold = 128;
static cl::opt<unsigned> ForceTargetNumScalarRegs(
"force-target-num-scalar-regs", cl::init(0), cl::Hidden,
@@ -180,7 +181,8 @@ static cl::opt<unsigned> ForceTargetInstructionCost(
static cl::opt<unsigned> SmallLoopCost(
"small-loop-cost", cl::init(20), cl::Hidden,
- cl::desc("The cost of a loop that is considered 'small' by the unroller."));
+ cl::desc(
+ "The cost of a loop that is considered 'small' by the interleaver."));
static cl::opt<bool> LoopVectorizeWithBlockFrequency(
"loop-vectorize-with-block-frequency", cl::init(false), cl::Hidden,
@@ -188,10 +190,11 @@ static cl::opt<bool> LoopVectorizeWithBlockFrequency(
"heuristics minimizing code growth in cold regions and being more "
"aggressive in hot regions."));
-// Runtime unroll loops for load/store throughput.
-static cl::opt<bool> EnableLoadStoreRuntimeUnroll(
- "enable-loadstore-runtime-unroll", cl::init(true), cl::Hidden,
- cl::desc("Enable runtime unrolling until load/store ports are saturated"));
+// Runtime interleave loops for load/store throughput.
+static cl::opt<bool> EnableLoadStoreRuntimeInterleave(
+ "enable-loadstore-runtime-interleave", cl::init(true), cl::Hidden,
+ cl::desc(
+ "Enable runtime interleaving until load/store ports are saturated"));
/// The number of stores in a loop that are allowed to need predication.
static cl::opt<unsigned> NumberOfStoresToPredicate(
@@ -200,15 +203,15 @@ static cl::opt<unsigned> NumberOfStoresToPredicate(
static cl::opt<bool> EnableIndVarRegisterHeur(
"enable-ind-var-reg-heur", cl::init(true), cl::Hidden,
- cl::desc("Count the induction variable only once when unrolling"));
+ cl::desc("Count the induction variable only once when interleaving"));
static cl::opt<bool> EnableCondStoresVectorization(
"enable-cond-stores-vec", cl::init(false), cl::Hidden,
cl::desc("Enable if predication of stores during vectorization."));
-static cl::opt<unsigned> MaxNestedScalarReductionUF(
- "max-nested-scalar-reduction-unroll", cl::init(2), cl::Hidden,
- cl::desc("The maximum unroll factor to use when unrolling a scalar "
+static cl::opt<unsigned> MaxNestedScalarReductionIC(
+ "max-nested-scalar-reduction-interleave", cl::init(2), cl::Hidden,
+ cl::desc("The maximum interleave count to use when interleaving a scalar "
"reduction in a nested loop."));
namespace {
@@ -921,8 +924,8 @@ public:
bool isUniformAfterVectorization(Instruction* I) { return Uniforms.count(I); }
/// Returns the information that we collected about runtime memory check.
- const LoopAccessInfo::RuntimePointerCheck *getRuntimePointerCheck() const {
- return LAI->getRuntimePointerCheck();
+ const RuntimePointerChecking *getRuntimePointerChecking() const {
+ return LAI->getRuntimePointerChecking();
}
const LoopAccessInfo *getLAI() const {
@@ -1105,12 +1108,19 @@ public:
/// 64 bit loop indices.
unsigned getWidestType();
+ /// \return The desired interleave count.
+ /// If interleave count has been specified by metadata it will be returned.
+ /// Otherwise, the interleave count is computed and returned. VF and LoopCost
+ /// are the selected vectorization factor and the cost of the selected VF.
+ unsigned selectInterleaveCount(bool OptForSize, unsigned VF,
+ unsigned LoopCost);
+
/// \return The most profitable unroll factor.
- /// If UserUF is non-zero then this method finds the best unroll-factor
- /// based on register pressure and other parameters.
- /// VF and LoopCost are the selected vectorization factor and the cost of the
- /// selected VF.
- unsigned selectUnrollFactor(bool OptForSize, unsigned VF, unsigned LoopCost);
+ /// This method finds the best unroll-factor based on register pressure and
+ /// other parameters. VF and LoopCost are the selected vectorization factor
+ /// and the cost of the selected VF.
+ unsigned computeInterleaveCount(bool OptForSize, unsigned VF,
+ unsigned LoopCost);
/// \brief A struct that represents some properties of the register usage
/// of a loop.
@@ -1456,9 +1466,14 @@ struct LoopVectorize : public FunctionPass {
const BranchProbability ColdProb(1, 5); // 20%
ColdEntryFreq = BlockFrequency(BFI->getEntryFreq()) * ColdProb;
- // If the target claims to have no vector registers don't attempt
- // vectorization.
- if (!TTI->getNumberOfRegisters(true))
+ // Don't attempt if
+ // 1. the target claims to have no vector registers, and
+ // 2. interleaving won't help ILP.
+ //
+ // The second condition is necessary because, even if the target has no
+ // vector registers, loop vectorization may still enable scalar
+ // interleaving.
+ if (!TTI->getNumberOfRegisters(true) && TTI->getMaxInterleaveFactor(1) < 2)
return false;
// Build up a worklist of inner-loops to vectorize. This is necessary as
@@ -1633,18 +1648,17 @@ struct LoopVectorize : public FunctionPass {
const LoopVectorizationCostModel::VectorizationFactor VF =
CM.selectVectorizationFactor(OptForSize);
- // Select the unroll factor.
- const unsigned UF =
- CM.selectUnrollFactor(OptForSize, VF.Width, VF.Cost);
+ // Select the interleave count.
+ unsigned IC = CM.selectInterleaveCount(OptForSize, VF.Width, VF.Cost);
DEBUG(dbgs() << "LV: Found a vectorizable loop (" << VF.Width << ") in "
<< DebugLocStr << '\n');
- DEBUG(dbgs() << "LV: Unroll Factor is " << UF << '\n');
+ DEBUG(dbgs() << "LV: Interleave Count is " << IC << '\n');
if (VF.Width == 1) {
DEBUG(dbgs() << "LV: Vectorization is possible but not beneficial\n");
- if (UF == 1) {
+ if (IC == 1) {
emitOptimizationRemarkAnalysis(
F->getContext(), DEBUG_TYPE, *F, L->getStartLoc(),
"not beneficial to vectorize and user disabled interleaving");
@@ -1654,17 +1668,14 @@ struct LoopVectorize : public FunctionPass {
// Report the unrolling decision.
emitOptimizationRemark(F->getContext(), DEBUG_TYPE, *F, L->getStartLoc(),
- Twine("unrolled with interleaving factor " +
- Twine(UF) +
+ Twine("interleaved by " + Twine(IC) +
" (vectorization not beneficial)"));
- // We decided not to vectorize, but we may want to unroll.
-
- InnerLoopUnroller Unroller(L, SE, LI, DT, TLI, TTI, UF);
+ InnerLoopUnroller Unroller(L, SE, LI, DT, TLI, TTI, IC);
Unroller.vectorize(&LVL);
} else {
// If we decided that it is *legal* to vectorize the loop then do it.
- InnerLoopVectorizer LB(L, SE, LI, DT, TLI, TTI, VF.Width, UF);
+ InnerLoopVectorizer LB(L, SE, LI, DT, TLI, TTI, VF.Width, IC);
LB.vectorize(&LVL);
++LoopsVectorized;
@@ -1675,10 +1686,10 @@ struct LoopVectorize : public FunctionPass {
AddRuntimeUnrollDisableMetaData(L);
// Report the vectorization decision.
- emitOptimizationRemark(
- F->getContext(), DEBUG_TYPE, *F, L->getStartLoc(),
- Twine("vectorized loop (vectorization factor: ") + Twine(VF.Width) +
- ", unrolling interleave factor: " + Twine(UF) + ")");
+ emitOptimizationRemark(F->getContext(), DEBUG_TYPE, *F, L->getStartLoc(),
+ Twine("vectorized loop (vectorization width: ") +
+ Twine(VF.Width) + ", interleaved count: " +
+ Twine(IC) + ")");
}
// Mark the loop as already vectorized to avoid vectorizing again.
@@ -1760,31 +1771,6 @@ Value *InnerLoopVectorizer::getStepVector(Value *Val, int StartIdx,
return Builder.CreateAdd(Val, Step, "induction");
}
-/// \brief Find the operand of the GEP that should be checked for consecutive
-/// stores. This ignores trailing indices that have no effect on the final
-/// pointer.
-static unsigned getGEPInductionOperand(const GetElementPtrInst *Gep) {
- const DataLayout &DL = Gep->getModule()->getDataLayout();
- unsigned LastOperand = Gep->getNumOperands() - 1;
- unsigned GEPAllocSize = DL.getTypeAllocSize(
- cast<PointerType>(Gep->getType()->getScalarType())->getElementType());
-
- // Walk backwards and try to peel off zeros.
- while (LastOperand > 1 && match(Gep->getOperand(LastOperand), m_Zero())) {
- // Find the type we're currently indexing into.
- gep_type_iterator GEPTI = gep_type_begin(Gep);
- std::advance(GEPTI, LastOperand - 1);
-
- // If it's a type with the same allocation size as the result of the GEP we
- // can peel off the zero index.
- if (DL.getTypeAllocSize(*GEPTI) != GEPAllocSize)
- break;
- --LastOperand;
- }
-
- return LastOperand;
-}
-
int LoopVectorizationLegality::isConsecutivePtr(Value *Ptr) {
assert(Ptr->getType()->isPointerTy() && "Unexpected non-ptr");
// Make sure that the pointer does not point to structs.
@@ -2503,9 +2489,9 @@ void InnerLoopVectorizer::createEmptyLoop() {
*/
BasicBlock *OldBasicBlock = OrigLoop->getHeader();
- BasicBlock *BypassBlock = OrigLoop->getLoopPreheader();
+ BasicBlock *VectorPH = OrigLoop->getLoopPreheader();
BasicBlock *ExitBlock = OrigLoop->getExitBlock();
- assert(BypassBlock && "Invalid loop structure");
+ assert(VectorPH && "Invalid loop structure");
assert(ExitBlock && "Must have an exit block");
// Some loops have a single integer induction variable, while other loops
@@ -2545,44 +2531,35 @@ void InnerLoopVectorizer::createEmptyLoop() {
// loop.
Value *BackedgeCount =
Exp.expandCodeFor(BackedgeTakeCount, BackedgeTakeCount->getType(),
- BypassBlock->getTerminator());
+ VectorPH->getTerminator());
if (BackedgeCount->getType()->isPointerTy())
BackedgeCount = CastInst::CreatePointerCast(BackedgeCount, IdxTy,
"backedge.ptrcnt.to.int",
- BypassBlock->getTerminator());
+ VectorPH->getTerminator());
Instruction *CheckBCOverflow =
CmpInst::Create(Instruction::ICmp, CmpInst::ICMP_EQ, BackedgeCount,
Constant::getAllOnesValue(BackedgeCount->getType()),
- "backedge.overflow", BypassBlock->getTerminator());
+ "backedge.overflow", VectorPH->getTerminator());
// The loop index does not have to start at Zero. Find the original start
// value from the induction PHI node. If we don't have an induction variable
// then we know that it starts at zero.
- Builder.SetInsertPoint(BypassBlock->getTerminator());
- Value *StartIdx = ExtendedIdx = OldInduction ?
- Builder.CreateZExt(OldInduction->getIncomingValueForBlock(BypassBlock),
- IdxTy):
- ConstantInt::get(IdxTy, 0);
-
- // We need an instruction to anchor the overflow check on. StartIdx needs to
- // be defined before the overflow check branch. Because the scalar preheader
- // is going to merge the start index and so the overflow branch block needs to
- // contain a definition of the start index.
- Instruction *OverflowCheckAnchor = BinaryOperator::CreateAdd(
- StartIdx, ConstantInt::get(IdxTy, 0), "overflow.check.anchor",
- BypassBlock->getTerminator());
+ Builder.SetInsertPoint(VectorPH->getTerminator());
+ Value *StartIdx = ExtendedIdx =
+ OldInduction
+ ? Builder.CreateZExt(OldInduction->getIncomingValueForBlock(VectorPH),
+ IdxTy)
+ : ConstantInt::get(IdxTy, 0);
// Count holds the overall loop count (N).
Value *Count = Exp.expandCodeFor(ExitCount, ExitCount->getType(),
- BypassBlock->getTerminator());
+ VectorPH->getTerminator());
- LoopBypassBlocks.push_back(BypassBlock);
+ LoopBypassBlocks.push_back(VectorPH);
// Split the single block loop into the two loop structure described above.
- BasicBlock *VectorPH =
- BypassBlock->splitBasicBlock(BypassBlock->getTerminator(), "vector.ph");
BasicBlock *VecBody =
- VectorPH->splitBasicBlock(VectorPH->getTerminator(), "vector.body");
+ VectorPH->splitBasicBlock(VectorPH->getTerminator(), "vector.body");
BasicBlock *MiddleBlock =
VecBody->splitBasicBlock(VecBody->getTerminator(), "middle.block");
BasicBlock *ScalarPH =
@@ -2597,7 +2574,6 @@ void InnerLoopVectorizer::createEmptyLoop() {
if (ParentLoop) {
ParentLoop->addChildLoop(Lp);
ParentLoop->addBasicBlockToLoop(ScalarPH, *LI);
- ParentLoop->addBasicBlockToLoop(VectorPH, *LI);
ParentLoop->addBasicBlockToLoop(MiddleBlock, *LI);
} else {
LI->addTopLevelLoop(Lp);
@@ -2615,9 +2591,20 @@ void InnerLoopVectorizer::createEmptyLoop() {
// times the unroll factor (num of SIMD instructions).
Constant *Step = ConstantInt::get(IdxTy, VF * UF);
+ // Generate code to check that the loop's trip count that we computed by
+ // adding one to the backedge-taken count will not overflow.
+ BasicBlock *NewVectorPH =
+ VectorPH->splitBasicBlock(VectorPH->getTerminator(), "overflow.checked");
+ if (ParentLoop)
+ ParentLoop->addBasicBlockToLoop(NewVectorPH, *LI);
+ ReplaceInstWithInst(
+ VectorPH->getTerminator(),
+ BranchInst::Create(ScalarPH, NewVectorPH, CheckBCOverflow));
+ VectorPH = NewVectorPH;
+
// This is the IR builder that we use to add all of the logic for bypassing
// the new vector loop.
- IRBuilder<> BypassBuilder(BypassBlock->getTerminator());
+ IRBuilder<> BypassBuilder(VectorPH->getTerminator());
setDebugLocFromInst(BypassBuilder,
getDebugLocFromInstOrOperands(OldInduction));
@@ -2646,24 +2633,14 @@ void InnerLoopVectorizer::createEmptyLoop() {
// jump to the scalar loop.
Value *Cmp =
BypassBuilder.CreateICmpEQ(IdxEndRoundDown, StartIdx, "cmp.zero");
-
- BasicBlock *LastBypassBlock = BypassBlock;
-
- // Generate code to check that the loops trip count that we computed by adding
- // one to the backedge-taken count will not overflow.
- {
- auto PastOverflowCheck =
- std::next(BasicBlock::iterator(OverflowCheckAnchor));
- BasicBlock *CheckBlock =
- LastBypassBlock->splitBasicBlock(PastOverflowCheck, "overflow.checked");
- if (ParentLoop)
- ParentLoop->addBasicBlockToLoop(CheckBlock, *LI);
- LoopBypassBlocks.push_back(CheckBlock);
- ReplaceInstWithInst(
- LastBypassBlock->getTerminator(),
- BranchInst::Create(ScalarPH, CheckBlock, CheckBCOverflow));
- LastBypassBlock = CheckBlock;
- }
+ NewVectorPH =
+ VectorPH->splitBasicBlock(VectorPH->getTerminator(), "vector.ph");
+ if (ParentLoop)
+ ParentLoop->addBasicBlockToLoop(NewVectorPH, *LI);
+ LoopBypassBlocks.push_back(VectorPH);
+ ReplaceInstWithInst(VectorPH->getTerminator(),
+ BranchInst::Create(MiddleBlock, NewVectorPH, Cmp));
+ VectorPH = NewVectorPH;
// Generate the code to check that the strides we assumed to be one are really
// one. We want the new basic block to start at the first instruction in a
@@ -2671,23 +2648,24 @@ void InnerLoopVectorizer::createEmptyLoop() {
Instruction *StrideCheck;
Instruction *FirstCheckInst;
std::tie(FirstCheckInst, StrideCheck) =
- addStrideCheck(LastBypassBlock->getTerminator());
+ addStrideCheck(VectorPH->getTerminator());
if (StrideCheck) {
AddedSafetyChecks = true;
// Create a new block containing the stride check.
- BasicBlock *CheckBlock =
- LastBypassBlock->splitBasicBlock(FirstCheckInst, "vector.stridecheck");
+ VectorPH->setName("vector.stridecheck");
+ NewVectorPH =
+ VectorPH->splitBasicBlock(VectorPH->getTerminator(), "vector.ph");
if (ParentLoop)
- ParentLoop->addBasicBlockToLoop(CheckBlock, *LI);
- LoopBypassBlocks.push_back(CheckBlock);
+ ParentLoop->addBasicBlockToLoop(NewVectorPH, *LI);
+ LoopBypassBlocks.push_back(VectorPH);
// Replace the branch into the memory check block with a conditional branch
// for the "few elements case".
- ReplaceInstWithInst(LastBypassBlock->getTerminator(),
- BranchInst::Create(MiddleBlock, CheckBlock, Cmp));
+ ReplaceInstWithInst(
+ VectorPH->getTerminator(),
+ BranchInst::Create(MiddleBlock, NewVectorPH, StrideCheck));
- Cmp = StrideCheck;
- LastBypassBlock = CheckBlock;
+ VectorPH = NewVectorPH;
}
// Generate the code that checks in runtime if arrays overlap. We put the
@@ -2695,28 +2673,26 @@ void InnerLoopVectorizer::createEmptyLoop() {
// faster.
Instruction *MemRuntimeCheck;
std::tie(FirstCheckInst, MemRuntimeCheck) =
- Legal->getLAI()->addRuntimeCheck(LastBypassBlock->getTerminator());
+ Legal->getLAI()->addRuntimeCheck(VectorPH->getTerminator());
if (MemRuntimeCheck) {
AddedSafetyChecks = true;
// Create a new block containing the memory check.
- BasicBlock *CheckBlock =
- LastBypassBlock->splitBasicBlock(FirstCheckInst, "vector.memcheck");
+ VectorPH->setName("vector.memcheck");
+ NewVectorPH =
+ VectorPH->splitBasicBlock(VectorPH->getTerminator(), "vector.ph");
if (ParentLoop)
- ParentLoop->addBasicBlockToLoop(CheckBlock, *LI);
- LoopBypassBlocks.push_back(CheckBlock);
+ ParentLoop->addBasicBlockToLoop(NewVectorPH, *LI);
+ LoopBypassBlocks.push_back(VectorPH);
// Replace the branch into the memory check block with a conditional branch
// for the "few elements case".
- ReplaceInstWithInst(LastBypassBlock->getTerminator(),
- BranchInst::Create(MiddleBlock, CheckBlock, Cmp));
+ ReplaceInstWithInst(
+ VectorPH->getTerminator(),
+ BranchInst::Create(MiddleBlock, NewVectorPH, MemRuntimeCheck));
- Cmp = MemRuntimeCheck;
- LastBypassBlock = CheckBlock;
+ VectorPH = NewVectorPH;
}
- ReplaceInstWithInst(LastBypassBlock->getTerminator(),
- BranchInst::Create(MiddleBlock, VectorPH, Cmp));
-
// We are going to resume the execution of the scalar loop.
// Go over all of the induction variables that we found and fix the
// PHIs that are left in the scalar version of the loop.
@@ -3831,7 +3807,7 @@ bool LoopVectorizationLegality::canVectorize() {
}
// We can only vectorize innermost loops.
- if (!TheLoop->getSubLoopsVector().empty()) {
+ if (!TheLoop->empty()) {
emitAnalysis(VectorizationReport() << "loop is not the innermost loop");
return false;
}
@@ -3897,10 +3873,11 @@ bool LoopVectorizationLegality::canVectorize() {
// Collect all of the variables that remain uniform after vectorization.
collectLoopUniforms();
- DEBUG(dbgs() << "LV: We can vectorize this loop" <<
- (LAI->getRuntimePointerCheck()->Need ? " (with a runtime bound check)" :
- "")
- <<"!\n");
+ DEBUG(dbgs() << "LV: We can vectorize this loop"
+ << (LAI->getRuntimePointerChecking()->Need
+ ? " (with a runtime bound check)"
+ : "")
+ << "!\n");
// Analyze interleaved memory accesses.
if (EnableInterleavedMemAccesses)
@@ -4130,118 +4107,6 @@ bool LoopVectorizationLegality::canVectorizeInstrs() {
return true;
}
-///\brief Remove GEPs whose indices but the last one are loop invariant and
-/// return the induction operand of the gep pointer.
-static Value *stripGetElementPtr(Value *Ptr, ScalarEvolution *SE, Loop *Lp) {
- GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(Ptr);
- if (!GEP)
- return Ptr;
-
- unsigned InductionOperand = getGEPInductionOperand(GEP);
-
- // Check that all of the gep indices are uniform except for our induction
- // operand.
- for (unsigned i = 0, e = GEP->getNumOperands(); i != e; ++i)
- if (i != InductionOperand &&
- !SE->isLoopInvariant(SE->getSCEV(GEP->getOperand(i)), Lp))
- return Ptr;
- return GEP->getOperand(InductionOperand);
-}
-
-///\brief Look for a cast use of the passed value.
-static Value *getUniqueCastUse(Value *Ptr, Loop *Lp, Type *Ty) {
- Value *UniqueCast = nullptr;
- for (User *U : Ptr->users()) {
- CastInst *CI = dyn_cast<CastInst>(U);
- if (CI && CI->getType() == Ty) {
- if (!UniqueCast)
- UniqueCast = CI;
- else
- return nullptr;
- }
- }
- return UniqueCast;
-}
-
-///\brief Get the stride of a pointer access in a loop.
-/// Looks for symbolic strides "a[i*stride]". Returns the symbolic stride as a
-/// pointer to the Value, or null otherwise.
-static Value *getStrideFromPointer(Value *Ptr, ScalarEvolution *SE, Loop *Lp) {
- const PointerType *PtrTy = dyn_cast<PointerType>(Ptr->getType());
- if (!PtrTy || PtrTy->isAggregateType())
- return nullptr;
-
- // Try to remove a gep instruction to make the pointer (actually index at this
- // point) easier analyzable. If OrigPtr is equal to Ptr we are analzying the
- // pointer, otherwise, we are analyzing the index.
- Value *OrigPtr = Ptr;
-
- // The size of the pointer access.
- int64_t PtrAccessSize = 1;
-
- Ptr = stripGetElementPtr(Ptr, SE, Lp);
- const SCEV *V = SE->getSCEV(Ptr);
-
- if (Ptr != OrigPtr)
- // Strip off casts.
- while (const SCEVCastExpr *C = dyn_cast<SCEVCastExpr>(V))
- V = C->getOperand();
-
- const SCEVAddRecExpr *S = dyn_cast<SCEVAddRecExpr>(V);
- if (!S)
- return nullptr;
-
- V = S->getStepRecurrence(*SE);
- if (!V)
- return nullptr;
-
- // Strip off the size of access multiplication if we are still analyzing the
- // pointer.
- if (OrigPtr == Ptr) {
- const DataLayout &DL = Lp->getHeader()->getModule()->getDataLayout();
- DL.getTypeAllocSize(PtrTy->getElementType());
- if (const SCEVMulExpr *M = dyn_cast<SCEVMulExpr>(V)) {
- if (M->getOperand(0)->getSCEVType() != scConstant)
- return nullptr;
-
- const APInt &APStepVal =
- cast<SCEVConstant>(M->getOperand(0))->getValue()->getValue();
-
- // Huge step value - give up.
- if (APStepVal.getBitWidth() > 64)
- return nullptr;
-
- int64_t StepVal = APStepVal.getSExtValue();
- if (PtrAccessSize != StepVal)
- return nullptr;
- V = M->getOperand(1);
- }
- }
-
- // Strip off casts.
- Type *StripedOffRecurrenceCast = nullptr;
- if (const SCEVCastExpr *C = dyn_cast<SCEVCastExpr>(V)) {
- StripedOffRecurrenceCast = C->getType();
- V = C->getOperand();
- }
-
- // Look for the loop invariant symbolic value.
- const SCEVUnknown *U = dyn_cast<SCEVUnknown>(V);
- if (!U)
- return nullptr;
-
- Value *Stride = U->getValue();
- if (!Lp->isLoopInvariant(Stride))
- return nullptr;
-
- // If we have stripped off the recurrence cast we have to make sure that we
- // return the value that is used in this loop so that we can replace it later.
- if (StripedOffRecurrenceCast)
- Stride = getUniqueCastUse(Stride, Lp, StripedOffRecurrenceCast);
-
- return Stride;
-}
-
void LoopVectorizationLegality::collectStridedAccess(Value *MemAccess) {
Value *Ptr = nullptr;
if (LoadInst *LI = dyn_cast<LoadInst>(MemAccess))
@@ -4585,7 +4450,7 @@ LoopVectorizationCostModel::VectorizationFactor
LoopVectorizationCostModel::selectVectorizationFactor(bool OptForSize) {
// Width 1 means no vectorize
VectorizationFactor Factor = { 1U, 0U };
- if (OptForSize && Legal->getRuntimePointerCheck()->Need) {
+ if (OptForSize && Legal->getRuntimePointerChecking()->Need) {
emitAnalysis(VectorizationReport() <<
"runtime pointer checks needed. Enable vectorization of this "
"loop with '#pragma clang loop vectorize(enable)' when "
@@ -4745,41 +4610,40 @@ unsigned LoopVectorizationCostModel::getWidestType() {
return MaxWidth;
}
-unsigned
-LoopVectorizationCostModel::selectUnrollFactor(bool OptForSize,
- unsigned VF,
- unsigned LoopCost) {
+unsigned LoopVectorizationCostModel::selectInterleaveCount(bool OptForSize,
+ unsigned VF,
+ unsigned LoopCost) {
- // -- The unroll heuristics --
- // We unroll the loop in order to expose ILP and reduce the loop overhead.
+ // -- The interleave heuristics --
+ // We interleave the loop in order to expose ILP and reduce the loop overhead.
// There are many micro-architectural considerations that we can't predict
// at this level. For example, frontend pressure (on decode or fetch) due to
// code size, or the number and capabilities of the execution ports.
//
- // We use the following heuristics to select the unroll factor:
- // 1. If the code has reductions, then we unroll in order to break the cross
+ // We use the following heuristics to select the interleave count:
+ // 1. If the code has reductions, then we interleave to break the cross
// iteration dependency.
- // 2. If the loop is really small, then we unroll in order to reduce the loop
+ // 2. If the loop is really small, then we interleave to reduce the loop
// overhead.
- // 3. We don't unroll if we think that we will spill registers to memory due
- // to the increased register pressure.
+ // 3. We don't interleave if we think that we will spill registers to memory
+ // due to the increased register pressure.
// Use the user preference, unless 'auto' is selected.
int UserUF = Hints->getInterleave();
if (UserUF != 0)
return UserUF;
- // When we optimize for size, we don't unroll.
+ // When we optimize for size, we don't interleave.
if (OptForSize)
return 1;
- // We used the distance for the unroll factor.
+ // We used the distance for the interleave count.
if (Legal->getMaxSafeDepDistBytes() != -1U)
return 1;
- // Do not unroll loops with a relatively small trip count.
+ // Do not interleave loops with a relatively small trip count.
unsigned TC = SE->getSmallConstantTripCount(TheLoop);
- if (TC > 1 && TC < TinyTripCountUnrollThreshold)
+ if (TC > 1 && TC < TinyTripCountInterleaveThreshold)
return 1;
unsigned TargetNumRegisters = TTI.getNumberOfRegisters(VF > 1);
@@ -4800,32 +4664,32 @@ LoopVectorizationCostModel::selectUnrollFactor(bool OptForSize,
R.MaxLocalUsers = std::max(R.MaxLocalUsers, 1U);
R.NumInstructions = std::max(R.NumInstructions, 1U);
- // We calculate the unroll factor using the following formula.
+ // We calculate the interleave count using the following formula.
// Subtract the number of loop invariants from the number of available
- // registers. These registers are used by all of the unrolled instances.
+ // registers. These registers are used by all of the interleaved instances.
// Next, divide the remaining registers by the number of registers that is
// required by the loop, in order to estimate how many parallel instances
// fit without causing spills. All of this is rounded down if necessary to be
- // a power of two. We want power of two unroll factors to simplify any
+ // a power of two. We want power-of-two interleave counts to simplify any
// addressing operations or alignment considerations.
- unsigned UF = PowerOf2Floor((TargetNumRegisters - R.LoopInvariantRegs) /
+ unsigned IC = PowerOf2Floor((TargetNumRegisters - R.LoopInvariantRegs) /
R.MaxLocalUsers);
- // Don't count the induction variable as unrolled.
+ // Don't count the induction variable as interleaved.
if (EnableIndVarRegisterHeur)
- UF = PowerOf2Floor((TargetNumRegisters - R.LoopInvariantRegs - 1) /
+ IC = PowerOf2Floor((TargetNumRegisters - R.LoopInvariantRegs - 1) /
std::max(1U, (R.MaxLocalUsers - 1)));
- // Clamp the unroll factor ranges to reasonable factors.
- unsigned MaxInterleaveSize = TTI.getMaxInterleaveFactor(VF);
+ // Clamp the interleave ranges to reasonable counts.
+ unsigned MaxInterleaveCount = TTI.getMaxInterleaveFactor(VF);
- // Check if the user has overridden the unroll max.
+ // Check if the user has overridden the max.
if (VF == 1) {
if (ForceTargetMaxScalarInterleaveFactor.getNumOccurrences() > 0)
- MaxInterleaveSize = ForceTargetMaxScalarInterleaveFactor;
+ MaxInterleaveCount = ForceTargetMaxScalarInterleaveFactor;
} else {
if (ForceTargetMaxVectorInterleaveFactor.getNumOccurrences() > 0)
- MaxInterleaveSize = ForceTargetMaxVectorInterleaveFactor;
+ MaxInterleaveCount = ForceTargetMaxVectorInterleaveFactor;
}
// If we did not calculate the cost for VF (because the user selected the VF)
@@ -4833,72 +4697,74 @@ LoopVectorizationCostModel::selectUnrollFactor(bool OptForSize,
if (LoopCost == 0)
LoopCost = expectedCost(VF);
- // Clamp the calculated UF to be between the 1 and the max unroll factor
+ // Clamp the calculated IC to be between 1 and the max interleave count
// that the target allows.
- if (UF > MaxInterleaveSize)
- UF = MaxInterleaveSize;
- else if (UF < 1)
- UF = 1;
+ if (IC > MaxInterleaveCount)
+ IC = MaxInterleaveCount;
+ else if (IC < 1)
+ IC = 1;
- // Unroll if we vectorized this loop and there is a reduction that could
- // benefit from unrolling.
+ // Interleave if we vectorized this loop and there is a reduction that could
+ // benefit from interleaving.
if (VF > 1 && Legal->getReductionVars()->size()) {
- DEBUG(dbgs() << "LV: Unrolling because of reductions.\n");
- return UF;
+ DEBUG(dbgs() << "LV: Interleaving because of reductions.\n");
+ return IC;
}
// Note that if we've already vectorized the loop we will have done the
- // runtime check and so unrolling won't require further checks.
- bool UnrollingRequiresRuntimePointerCheck =
- (VF == 1 && Legal->getRuntimePointerCheck()->Need);
+ // runtime check and so interleaving won't require further checks.
+ bool InterleavingRequiresRuntimePointerCheck =
+ (VF == 1 && Legal->getRuntimePointerChecking()->Need);
- // We want to unroll small loops in order to reduce the loop overhead and
+ // We want to interleave small loops in order to reduce the loop overhead and
// potentially expose ILP opportunities.
DEBUG(dbgs() << "LV: Loop cost is " << LoopCost << '\n');
- if (!UnrollingRequiresRuntimePointerCheck &&
- LoopCost < SmallLoopCost) {
+ if (!InterleavingRequiresRuntimePointerCheck && LoopCost < SmallLoopCost) {
// We assume that the cost overhead is 1 and we use the cost model
- // to estimate the cost of the loop and unroll until the cost of the
+ // to estimate the cost of the loop and interleave until the cost of the
// loop overhead is about 5% of the cost of the loop.
- unsigned SmallUF = std::min(UF, (unsigned)PowerOf2Floor(SmallLoopCost / LoopCost));
+ unsigned SmallIC =
+ std::min(IC, (unsigned)PowerOf2Floor(SmallLoopCost / LoopCost));
- // Unroll until store/load ports (estimated by max unroll factor) are
+ // Interleave until store/load ports (estimated by max interleave count) are
// saturated.
unsigned NumStores = Legal->getNumStores();
unsigned NumLoads = Legal->getNumLoads();
- unsigned StoresUF = UF / (NumStores ? NumStores : 1);
- unsigned LoadsUF = UF / (NumLoads ? NumLoads : 1);
+ unsigned StoresIC = IC / (NumStores ? NumStores : 1);
+ unsigned LoadsIC = IC / (NumLoads ? NumLoads : 1);
// If we have a scalar reduction (vector reductions are already dealt with
// by this point), we can increase the critical path length if the loop
- // we're unrolling is inside another loop. Limit, by default to 2, so the
+ // we're interleaving is inside another loop. Limit, by default to 2, so the
// critical path only gets increased by one reduction operation.
if (Legal->getReductionVars()->size() &&
TheLoop->getLoopDepth() > 1) {
- unsigned F = static_cast<unsigned>(MaxNestedScalarReductionUF);
- SmallUF = std::min(SmallUF, F);
- StoresUF = std::min(StoresUF, F);
- LoadsUF = std::min(LoadsUF, F);
+ unsigned F = static_cast<unsigned>(MaxNestedScalarReductionIC);
+ SmallIC = std::min(SmallIC, F);
+ StoresIC = std::min(StoresIC, F);
+ LoadsIC = std::min(LoadsIC, F);
}
- if (EnableLoadStoreRuntimeUnroll && std::max(StoresUF, LoadsUF) > SmallUF) {
- DEBUG(dbgs() << "LV: Unrolling to saturate store or load ports.\n");
- return std::max(StoresUF, LoadsUF);
+ if (EnableLoadStoreRuntimeInterleave &&
+ std::max(StoresIC, LoadsIC) > SmallIC) {
+ DEBUG(dbgs() << "LV: Interleaving to saturate store or load ports.\n");
+ return std::max(StoresIC, LoadsIC);
}
- DEBUG(dbgs() << "LV: Unrolling to reduce branch cost.\n");
- return SmallUF;
+ DEBUG(dbgs() << "LV: Interleaving to reduce branch cost.\n");
+ return SmallIC;
}
- // Unroll if this is a large loop (small loops are already dealt with by this
- // point) that could benefit from interleaved unrolling.
+ // Interleave if this is a large loop (small loops are already dealt with by
+ // this point) that could benefit from interleaving.
bool HasReductions = (Legal->getReductionVars()->size() > 0);
if (TTI.enableAggressiveInterleaving(HasReductions)) {
- DEBUG(dbgs() << "LV: Unrolling to expose ILP.\n");
- return UF;
+ DEBUG(dbgs() << "LV: Interleaving to expose ILP.\n");
+ return IC;
}
- DEBUG(dbgs() << "LV: Not Unrolling.\n");
+ DEBUG(dbgs() << "LV: Not Interleaving.\n");
return 1;
}
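A worked example of the register-pressure formula in selectInterleaveCount, using assumed numbers rather than any particular target:

//   Assumed: TargetNumRegisters = 16, R.LoopInvariantRegs = 2, R.MaxLocalUsers = 3
//   IC = PowerOf2Floor((16 - 2) / 3)                 = PowerOf2Floor(4) = 4
//   With EnableIndVarRegisterHeur (the default):
//   IC = PowerOf2Floor((16 - 2 - 1) / max(1, 3 - 1)) = PowerOf2Floor(6) = 4
//   IC is then clamped to [1, TTI.getMaxInterleaveFactor(VF)] and may shrink
//   further for small loops via the SmallLoopCost logic shown above.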
diff --git a/lib/Transforms/Vectorize/SLPVectorizer.cpp b/lib/Transforms/Vectorize/SLPVectorizer.cpp
index 7c4c279dcf4d..7bac407e77e9 100644
--- a/lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ b/lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -69,8 +69,13 @@ static cl::opt<bool> ShouldStartVectorizeHorAtStore(
cl::desc(
"Attempt to vectorize horizontal reductions feeding into a store"));
+static cl::opt<int>
+MaxVectorRegSizeOption("slp-max-reg-size", cl::init(128), cl::Hidden,
+ cl::desc("Attempt to vectorize for this register size in bits"));
+
namespace {
+// FIXME: Set this via cl::opt to allow overriding.
static const unsigned MinVecRegSize = 128;
static const unsigned RecursionMaxDepth = 12;
@@ -2136,9 +2141,8 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) {
}
// Prepare the operand vector.
- for (unsigned j = 0; j < E->Scalars.size(); ++j)
- Operands.push_back(cast<PHINode>(E->Scalars[j])->
- getIncomingValueForBlock(IBB));
+ for (Value *V : E->Scalars)
+ Operands.push_back(cast<PHINode>(V)->getIncomingValueForBlock(IBB));
Builder.SetInsertPoint(IBB->getTerminator());
Builder.SetCurrentDebugLocation(PH->getDebugLoc());
@@ -2172,8 +2176,8 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) {
case Instruction::FPTrunc:
case Instruction::BitCast: {
ValueList INVL;
- for (int i = 0, e = E->Scalars.size(); i < e; ++i)
- INVL.push_back(cast<Instruction>(E->Scalars[i])->getOperand(0));
+ for (Value *V : E->Scalars)
+ INVL.push_back(cast<Instruction>(V)->getOperand(0));
setInsertPointAfterBundle(E->Scalars);
@@ -2191,9 +2195,9 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) {
case Instruction::FCmp:
case Instruction::ICmp: {
ValueList LHSV, RHSV;
- for (int i = 0, e = E->Scalars.size(); i < e; ++i) {
- LHSV.push_back(cast<Instruction>(E->Scalars[i])->getOperand(0));
- RHSV.push_back(cast<Instruction>(E->Scalars[i])->getOperand(1));
+ for (Value *V : E->Scalars) {
+ LHSV.push_back(cast<Instruction>(V)->getOperand(0));
+ RHSV.push_back(cast<Instruction>(V)->getOperand(1));
}
setInsertPointAfterBundle(E->Scalars);
@@ -2217,10 +2221,10 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) {
}
case Instruction::Select: {
ValueList TrueVec, FalseVec, CondVec;
- for (int i = 0, e = E->Scalars.size(); i < e; ++i) {
- CondVec.push_back(cast<Instruction>(E->Scalars[i])->getOperand(0));
- TrueVec.push_back(cast<Instruction>(E->Scalars[i])->getOperand(1));
- FalseVec.push_back(cast<Instruction>(E->Scalars[i])->getOperand(2));
+ for (Value *V : E->Scalars) {
+ CondVec.push_back(cast<Instruction>(V)->getOperand(0));
+ TrueVec.push_back(cast<Instruction>(V)->getOperand(1));
+ FalseVec.push_back(cast<Instruction>(V)->getOperand(2));
}
setInsertPointAfterBundle(E->Scalars);
@@ -2259,9 +2263,9 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) {
if (isa<BinaryOperator>(VL0) && VL0->isCommutative())
reorderInputsAccordingToOpcode(E->Scalars, LHSVL, RHSVL);
else
- for (int i = 0, e = E->Scalars.size(); i < e; ++i) {
- LHSVL.push_back(cast<Instruction>(E->Scalars[i])->getOperand(0));
- RHSVL.push_back(cast<Instruction>(E->Scalars[i])->getOperand(1));
+ for (Value *V : E->Scalars) {
+ LHSVL.push_back(cast<Instruction>(V)->getOperand(0));
+ RHSVL.push_back(cast<Instruction>(V)->getOperand(1));
}
setInsertPointAfterBundle(E->Scalars);
@@ -2322,8 +2326,8 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) {
unsigned AS = SI->getPointerAddressSpace();
ValueList ValueOp;
- for (int i = 0, e = E->Scalars.size(); i < e; ++i)
- ValueOp.push_back(cast<StoreInst>(E->Scalars[i])->getValueOperand());
+ for (Value *V : E->Scalars)
+ ValueOp.push_back(cast<StoreInst>(V)->getValueOperand());
setInsertPointAfterBundle(E->Scalars);
@@ -2351,8 +2355,8 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) {
setInsertPointAfterBundle(E->Scalars);
ValueList Op0VL;
- for (int i = 0, e = E->Scalars.size(); i < e; ++i)
- Op0VL.push_back(cast<GetElementPtrInst>(E->Scalars[i])->getOperand(0));
+ for (Value *V : E->Scalars)
+ Op0VL.push_back(cast<GetElementPtrInst>(V)->getOperand(0));
Value *Op0 = vectorizeTree(Op0VL);
@@ -2360,8 +2364,8 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) {
for (int j = 1, e = cast<GetElementPtrInst>(VL0)->getNumOperands(); j < e;
++j) {
ValueList OpVL;
- for (int i = 0, e = E->Scalars.size(); i < e; ++i)
- OpVL.push_back(cast<GetElementPtrInst>(E->Scalars[i])->getOperand(j));
+ for (Value *V : E->Scalars)
+ OpVL.push_back(cast<GetElementPtrInst>(V)->getOperand(j));
Value *OpVec = vectorizeTree(OpVL);
OpVecs.push_back(OpVec);
@@ -2397,8 +2401,8 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) {
OpVecs.push_back(CEI->getArgOperand(j));
continue;
}
- for (int i = 0, e = E->Scalars.size(); i < e; ++i) {
- CallInst *CEI = cast<CallInst>(E->Scalars[i]);
+ for (Value *V : E->Scalars) {
+ CallInst *CEI = cast<CallInst>(V);
OpVL.push_back(CEI->getArgOperand(j));
}
@@ -3089,6 +3093,17 @@ struct SLPVectorizer : public FunctionPass {
if (!TTI->getNumberOfRegisters(true))
return false;
+ // Use the vector register size specified by the target unless overridden
+ // by a command-line option.
+ // TODO: It would be better to limit the vectorization factor based on
+ // data type rather than just register size. For example, x86 AVX has
+ // 256-bit registers, but it does not support integer operations
+ // at that width (that requires AVX2).
+ if (MaxVectorRegSizeOption.getNumOccurrences())
+ MaxVecRegSize = MaxVectorRegSizeOption;
+ else
+ MaxVecRegSize = TTI->getRegisterBitWidth(true);
+
// Don't vectorize when the attribute NoImplicitFloat is used.
if (F.hasFnAttribute(Attribute::NoImplicitFloat))
return false;
@@ -3166,12 +3181,13 @@ private:
bool vectorizeChainsInBlock(BasicBlock *BB, BoUpSLP &R);
bool vectorizeStoreChain(ArrayRef<Value *> Chain, int CostThreshold,
- BoUpSLP &R);
+ BoUpSLP &R, unsigned VecRegSize);
bool vectorizeStores(ArrayRef<StoreInst *> Stores, int costThreshold,
BoUpSLP &R);
private:
StoreListMap StoreRefs;
+ unsigned MaxVecRegSize; // This is set by TTI or overridden by cl::opt.
};
/// \brief Check that the Values in the slice in VL array are still existent in
@@ -3186,14 +3202,15 @@ static bool hasValueBeenRAUWed(ArrayRef<Value *> VL, ArrayRef<WeakVH> VH,
}
bool SLPVectorizer::vectorizeStoreChain(ArrayRef<Value *> Chain,
- int CostThreshold, BoUpSLP &R) {
+ int CostThreshold, BoUpSLP &R,
+ unsigned VecRegSize) {
unsigned ChainLen = Chain.size();
DEBUG(dbgs() << "SLP: Analyzing a store chain of length " << ChainLen
<< "\n");
Type *StoreTy = cast<StoreInst>(Chain[0])->getValueOperand()->getType();
auto &DL = cast<StoreInst>(Chain[0])->getModule()->getDataLayout();
unsigned Sz = DL.getTypeSizeInBits(StoreTy);
- unsigned VF = MinVecRegSize / Sz;
+ unsigned VF = VecRegSize / Sz;
if (!isPowerOf2_32(Sz) || VF < 2)
return false;
@@ -3277,12 +3294,16 @@ bool SLPVectorizer::vectorizeStores(ArrayRef<StoreInst *> Stores,
I = ConsecutiveChain[I];
}
- bool Vectorized = vectorizeStoreChain(Operands, costThreshold, R);
-
- // Mark the vectorized stores so that we don't vectorize them again.
- if (Vectorized)
- VectorizedStores.insert(Operands.begin(), Operands.end());
- Changed |= Vectorized;
+ // FIXME: Is division-by-2 the correct step? Should we assert that the
+ // register size is a power-of-2?
+ for (unsigned Size = MaxVecRegSize; Size >= MinVecRegSize; Size /= 2) {
+ if (vectorizeStoreChain(Operands, costThreshold, R, Size)) {
+ // Mark the vectorized stores so that we don't vectorize them again.
+ VectorizedStores.insert(Operands.begin(), Operands.end());
+ Changed = true;
+ break;
+ }
+ }
}
return Changed;
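A worked example of the sizes the new store-chain loop tries, assuming TTI reports 256-bit vector registers (e.g. an AVX target) while MinVecRegSize stays at its default of 128:

//   Assumed: MaxVecRegSize = 256 (from TTI), MinVecRegSize = 128
//   Sizes tried: 256, then 128 (halving until MinVecRegSize is reached).
//   For a chain of i32 stores (Sz = 32): VF = 256 / 32 = 8 first, then
//   VF = 128 / 32 = 4. The first size whose chain vectorizes wins and the
//   loop breaks, so narrower factors are only attempted when wider ones fail.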
@@ -3293,8 +3314,8 @@ unsigned SLPVectorizer::collectStores(BasicBlock *BB, BoUpSLP &R) {
unsigned count = 0;
StoreRefs.clear();
const DataLayout &DL = BB->getModule()->getDataLayout();
- for (BasicBlock::iterator it = BB->begin(), e = BB->end(); it != e; ++it) {
- StoreInst *SI = dyn_cast<StoreInst>(it);
+ for (Instruction &I : *BB) {
+ StoreInst *SI = dyn_cast<StoreInst>(&I);
if (!SI)
continue;
@@ -3342,13 +3363,15 @@ bool SLPVectorizer::tryToVectorizeList(ArrayRef<Value *> VL, BoUpSLP &R,
Type *Ty0 = I0->getType();
unsigned Sz = DL.getTypeSizeInBits(Ty0);
+ // FIXME: Register size should be a parameter to this function, so we can
+ // try different vectorization factors.
unsigned VF = MinVecRegSize / Sz;
- for (int i = 0, e = VL.size(); i < e; ++i) {
- Type *Ty = VL[i]->getType();
+ for (Value *V : VL) {
+ Type *Ty = V->getType();
if (!isValidElementType(Ty))
return false;
- Instruction *Inst = dyn_cast<Instruction>(VL[i]);
+ Instruction *Inst = dyn_cast<Instruction>(V);
if (!Inst || Inst->getOpcode() != Opcode0)
return false;
}
@@ -3571,6 +3594,8 @@ public:
const DataLayout &DL = B->getModule()->getDataLayout();
ReductionOpcode = B->getOpcode();
ReducedValueOpcode = 0;
+ // FIXME: Register size should be a parameter to this function, so we can
+ // try different vectorization factors.
ReduxWidth = MinVecRegSize / DL.getTypeSizeInBits(Ty);
ReductionRoot = B;
ReductionPHI = Phi;
@@ -3997,6 +4022,9 @@ bool SLPVectorizer::vectorizeStoreChains(BoUpSLP &R) {
<< it->second.size() << ".\n");
// Process the stores in chunks of 16.
+ // TODO: The limit of 16 inhibits greater vectorization factors.
+ // For example, AVX2 supports v32i8. Increasing this limit, however,
+ // may cause a significant compile-time increase.
for (unsigned CI = 0, CE = it->second.size(); CI < CE; CI+=16) {
unsigned Len = std::min<unsigned>(CE - CI, 16);
Changed |= vectorizeStores(makeArrayRef(&it->second[CI], Len),
diff --git a/test/Analysis/BasicAA/modref.ll b/test/Analysis/BasicAA/modref.ll
index e124d6cbe20f..3084f809c370 100644
--- a/test/Analysis/BasicAA/modref.ll
+++ b/test/Analysis/BasicAA/modref.ll
@@ -145,6 +145,51 @@ entry:
; CHECK: load i32, i32*
}
+;; Check that aa correctly handles functions marked with argmemonly
+;; attribute.
+declare i32 @func_argmemonly(i32 * %P) argmemonly
+
+;; Cannot remove redundant load, function may write to it.
+; CHECK-LABEL: @test8(
+define i32 @test8(i32 *%P) {
+ %V1 = load i32, i32* %P
+ call i32 @func_argmemonly(i32* %P)
+ %V2 = load i32, i32* %P
+ %Diff = sub i32 %V1, %V2
+ ret i32 %Diff
+ ; CHECK: load
+ ; CHECK: load
+ ; CHECK: sub
+ ; CHECK: ret i32 %Diff
+}
+
+;; In this case load can be removed, function clobbers only %P2.
+; CHECK-LABEL: @test9(
+define i32 @test9(i32* %P, i32* noalias %P2) {
+ %V1 = load i32, i32* %P
+ call i32 @func_argmemonly(i32* %P2)
+ %V2 = load i32, i32* %P
+ %Diff = sub i32 %V1, %V2
+ ret i32 %Diff
+ ; CHECK-NOT: load
+ ; CHECK: ret i32 0
+}
+
+;; In this case load can *not* be removed. Function clobbers only %P2 but it may
+;; alias with %P.
+; CHECK-LABEL: @test10(
+define i32 @test10(i32* %P, i32* %P2) {
+ %V1 = load i32, i32* %P
+ call i32 @func_argmemonly(i32* %P2)
+ %V2 = load i32, i32* %P
+ %Diff = sub i32 %V1, %V2
+ ret i32 %Diff
+ ; CHECK: load
+ ; CHECK: load
+ ; CHECK: sub
+ ; CHECK: ret i32 %Diff
+}
+
declare void @llvm.memset.p0i8.i32(i8* nocapture, i8, i32, i32, i1) nounwind
declare void @llvm.memset.p0i8.i8(i8* nocapture, i8, i8, i32, i1) nounwind
declare void @llvm.memcpy.p0i8.p0i8.i8(i8* nocapture, i8* nocapture, i8, i32, i1) nounwind
diff --git a/test/Analysis/CostModel/X86/testshiftashr.ll b/test/Analysis/CostModel/X86/testshiftashr.ll
index ced2ffed4552..da4e7d466e2b 100644
--- a/test/Analysis/CostModel/X86/testshiftashr.ll
+++ b/test/Analysis/CostModel/X86/testshiftashr.ll
@@ -17,9 +17,9 @@ entry:
define %shifttype4i16 @shift4i16(%shifttype4i16 %a, %shifttype4i16 %b) {
entry:
; SSE2: shift4i16
- ; SSE2: cost of 40 {{.*}} ashr
+ ; SSE2: cost of 16 {{.*}} ashr
; SSE2-CODEGEN: shift4i16
- ; SSE2-CODEGEN: sarl %cl
+ ; SSE2-CODEGEN: psrad
%0 = ashr %shifttype4i16 %a , %b
ret %shifttype4i16 %0
@@ -77,9 +77,9 @@ entry:
define %shifttype4i32 @shift4i32(%shifttype4i32 %a, %shifttype4i32 %b) {
entry:
; SSE2: shift4i32
- ; SSE2: cost of 40 {{.*}} ashr
+ ; SSE2: cost of 16 {{.*}} ashr
; SSE2-CODEGEN: shift4i32
- ; SSE2-CODEGEN: sarl %cl
+ ; SSE2-CODEGEN: psrad
%0 = ashr %shifttype4i32 %a , %b
ret %shifttype4i32 %0
@@ -89,9 +89,9 @@ entry:
define %shifttype8i32 @shift8i32(%shifttype8i32 %a, %shifttype8i32 %b) {
entry:
; SSE2: shift8i32
- ; SSE2: cost of 80 {{.*}} ashr
+ ; SSE2: cost of 32 {{.*}} ashr
; SSE2-CODEGEN: shift8i32
- ; SSE2-CODEGEN: sarl %cl
+ ; SSE2-CODEGEN: psrad
%0 = ashr %shifttype8i32 %a , %b
ret %shifttype8i32 %0
@@ -101,9 +101,9 @@ entry:
define %shifttype16i32 @shift16i32(%shifttype16i32 %a, %shifttype16i32 %b) {
entry:
; SSE2: shift16i32
- ; SSE2: cost of 160 {{.*}} ashr
+ ; SSE2: cost of 64 {{.*}} ashr
; SSE2-CODEGEN: shift16i32
- ; SSE2-CODEGEN: sarl %cl
+ ; SSE2-CODEGEN: psrad
%0 = ashr %shifttype16i32 %a , %b
ret %shifttype16i32 %0
@@ -113,9 +113,9 @@ entry:
define %shifttype32i32 @shift32i32(%shifttype32i32 %a, %shifttype32i32 %b) {
entry:
; SSE2: shift32i32
- ; SSE2: cost of 320 {{.*}} ashr
+ ; SSE2: cost of 128 {{.*}} ashr
; SSE2-CODEGEN: shift32i32
- ; SSE2-CODEGEN: sarl %cl
+ ; SSE2-CODEGEN: psrad
%0 = ashr %shifttype32i32 %a , %b
ret %shifttype32i32 %0
@@ -197,9 +197,9 @@ entry:
define %shifttype4i8 @shift4i8(%shifttype4i8 %a, %shifttype4i8 %b) {
entry:
; SSE2: shift4i8
- ; SSE2: cost of 40 {{.*}} ashr
+ ; SSE2: cost of 16 {{.*}} ashr
; SSE2-CODEGEN: shift4i8
- ; SSE2-CODEGEN: sarl %cl
+ ; SSE2-CODEGEN: psrad
%0 = ashr %shifttype4i8 %a , %b
ret %shifttype4i8 %0
@@ -247,9 +247,9 @@ entry:
define %shifttypec @shift2i16const(%shifttypec %a, %shifttypec %b) {
entry:
; SSE2: shift2i16const
- ; SSE2: cost of 20 {{.*}} ashr
+ ; SSE2: cost of 4 {{.*}} ashr
; SSE2-CODEGEN: shift2i16const
- ; SSE2-CODEGEN: sarq $
+ ; SSE2-CODEGEN: psrad $3
%0 = ashr %shifttypec %a , <i16 3, i16 3>
ret %shifttypec %0
@@ -320,9 +320,9 @@ entry:
define %shifttypec2i32 @shift2i32c(%shifttypec2i32 %a, %shifttypec2i32 %b) {
entry:
; SSE2: shift2i32c
- ; SSE2: cost of 20 {{.*}} ashr
+ ; SSE2: cost of 4 {{.*}} ashr
; SSE2-CODEGEN: shift2i32c
- ; SSE2-CODEGEN: sarq $3
+ ; SSE2-CODEGEN: psrad $3
%0 = ashr %shifttypec2i32 %a , <i32 3, i32 3>
ret %shifttypec2i32 %0
@@ -391,9 +391,9 @@ entry:
define %shifttypec2i64 @shift2i64c(%shifttypec2i64 %a, %shifttypec2i64 %b) {
entry:
; SSE2: shift2i64c
- ; SSE2: cost of 20 {{.*}} ashr
+ ; SSE2: cost of 4 {{.*}} ashr
; SSE2-CODEGEN: shift2i64c
- ; SSE2-CODEGEN: sarq $3
+ ; SSE2-CODEGEN: psrad $3
%0 = ashr %shifttypec2i64 %a , <i64 3, i64 3>
ret %shifttypec2i64 %0
@@ -403,9 +403,9 @@ entry:
define %shifttypec4i64 @shift4i64c(%shifttypec4i64 %a, %shifttypec4i64 %b) {
entry:
; SSE2: shift4i64c
- ; SSE2: cost of 40 {{.*}} ashr
+ ; SSE2: cost of 8 {{.*}} ashr
; SSE2-CODEGEN: shift4i64c
- ; SSE2-CODEGEN: sarq $3
+ ; SSE2-CODEGEN: psrad $3
%0 = ashr %shifttypec4i64 %a , <i64 3, i64 3, i64 3, i64 3>
ret %shifttypec4i64 %0
@@ -415,9 +415,9 @@ entry:
define %shifttypec8i64 @shift8i64c(%shifttypec8i64 %a, %shifttypec8i64 %b) {
entry:
; SSE2: shift8i64c
- ; SSE2: cost of 80 {{.*}} ashr
+ ; SSE2: cost of 16 {{.*}} ashr
; SSE2-CODEGEN: shift8i64c
- ; SSE2-CODEGEN: sarq $3
+ ; SSE2-CODEGEN: psrad $3
%0 = ashr %shifttypec8i64 %a , <i64 3, i64 3, i64 3, i64 3,
i64 3, i64 3, i64 3, i64 3>
@@ -428,9 +428,9 @@ entry:
define %shifttypec16i64 @shift16i64c(%shifttypec16i64 %a, %shifttypec16i64 %b) {
entry:
; SSE2: shift16i64c
- ; SSE2: cost of 160 {{.*}} ashr
+ ; SSE2: cost of 32 {{.*}} ashr
; SSE2-CODEGEN: shift16i64c
- ; SSE2-CODEGEN: sarq $3
+ ; SSE2-CODEGEN: psrad $3
%0 = ashr %shifttypec16i64 %a , <i64 3, i64 3, i64 3, i64 3,
i64 3, i64 3, i64 3, i64 3,
@@ -443,9 +443,9 @@ entry:
define %shifttypec32i64 @shift32i64c(%shifttypec32i64 %a, %shifttypec32i64 %b) {
entry:
; SSE2: shift32i64c
- ; SSE2: cost of 320 {{.*}} ashr
+ ; SSE2: cost of 64 {{.*}} ashr
; SSE2-CODEGEN: shift32i64c
- ; SSE2-CODEGEN: sarq $3
+ ; SSE2-CODEGEN: psrad $3
%0 = ashr %shifttypec32i64 %a ,<i64 3, i64 3, i64 3, i64 3,
i64 3, i64 3, i64 3, i64 3,
@@ -462,9 +462,9 @@ entry:
define %shifttypec2i8 @shift2i8c(%shifttypec2i8 %a, %shifttypec2i8 %b) {
entry:
; SSE2: shift2i8c
- ; SSE2: cost of 20 {{.*}} ashr
+ ; SSE2: cost of 4 {{.*}} ashr
; SSE2-CODEGEN: shift2i8c
- ; SSE2-CODEGEN: sarq $3
+ ; SSE2-CODEGEN: psrad $3
%0 = ashr %shifttypec2i8 %a , <i8 3, i8 3>
ret %shifttypec2i8 %0
diff --git a/test/Analysis/CostModel/X86/testshiftlshr.ll b/test/Analysis/CostModel/X86/testshiftlshr.ll
index 0bc60eacac9a..5775a42d08ad 100644
--- a/test/Analysis/CostModel/X86/testshiftlshr.ll
+++ b/test/Analysis/CostModel/X86/testshiftlshr.ll
@@ -17,9 +17,9 @@ entry:
define %shifttype4i16 @shift4i16(%shifttype4i16 %a, %shifttype4i16 %b) {
entry:
; SSE2: shift4i16
- ; SSE2: cost of 40 {{.*}} lshr
+ ; SSE2: cost of 16 {{.*}} lshr
; SSE2-CODEGEN: shift4i16
- ; SSE2-CODEGEN: shrl %cl
+ ; SSE2-CODEGEN: psrld
%0 = lshr %shifttype4i16 %a , %b
ret %shifttype4i16 %0
@@ -77,9 +77,9 @@ entry:
define %shifttype4i32 @shift4i32(%shifttype4i32 %a, %shifttype4i32 %b) {
entry:
; SSE2: shift4i32
- ; SSE2: cost of 40 {{.*}} lshr
+ ; SSE2: cost of 16 {{.*}} lshr
; SSE2-CODEGEN: shift4i32
- ; SSE2-CODEGEN: shrl %cl
+ ; SSE2-CODEGEN: psrld
%0 = lshr %shifttype4i32 %a , %b
ret %shifttype4i32 %0
@@ -89,9 +89,9 @@ entry:
define %shifttype8i32 @shift8i32(%shifttype8i32 %a, %shifttype8i32 %b) {
entry:
; SSE2: shift8i32
- ; SSE2: cost of 80 {{.*}} lshr
+ ; SSE2: cost of 32 {{.*}} lshr
; SSE2-CODEGEN: shift8i32
- ; SSE2-CODEGEN: shrl %cl
+ ; SSE2-CODEGEN: psrld
%0 = lshr %shifttype8i32 %a , %b
ret %shifttype8i32 %0
@@ -101,9 +101,9 @@ entry:
define %shifttype16i32 @shift16i32(%shifttype16i32 %a, %shifttype16i32 %b) {
entry:
; SSE2: shift16i32
- ; SSE2: cost of 160 {{.*}} lshr
+ ; SSE2: cost of 64 {{.*}} lshr
; SSE2-CODEGEN: shift16i32
- ; SSE2-CODEGEN: shrl %cl
+ ; SSE2-CODEGEN: psrld
%0 = lshr %shifttype16i32 %a , %b
ret %shifttype16i32 %0
@@ -113,9 +113,9 @@ entry:
define %shifttype32i32 @shift32i32(%shifttype32i32 %a, %shifttype32i32 %b) {
entry:
; SSE2: shift32i32
- ; SSE2: cost of 320 {{.*}} lshr
+ ; SSE2: cost of 128 {{.*}} lshr
; SSE2-CODEGEN: shift32i32
- ; SSE2-CODEGEN: shrl %cl
+ ; SSE2-CODEGEN: psrld
%0 = lshr %shifttype32i32 %a , %b
ret %shifttype32i32 %0
@@ -197,9 +197,9 @@ entry:
define %shifttype4i8 @shift4i8(%shifttype4i8 %a, %shifttype4i8 %b) {
entry:
; SSE2: shift4i8
- ; SSE2: cost of 40 {{.*}} lshr
+ ; SSE2: cost of 16 {{.*}} lshr
; SSE2-CODEGEN: shift4i8
- ; SSE2-CODEGEN: shrl %cl
+ ; SSE2-CODEGEN: psrld
%0 = lshr %shifttype4i8 %a , %b
ret %shifttype4i8 %0
diff --git a/test/Analysis/LoopAccessAnalysis/number-of-memchecks.ll b/test/Analysis/LoopAccessAnalysis/number-of-memchecks.ll
index f9871c643c9d..50b37a031a60 100644
--- a/test/Analysis/LoopAccessAnalysis/number-of-memchecks.ll
+++ b/test/Analysis/LoopAccessAnalysis/number-of-memchecks.ll
@@ -1,19 +1,20 @@
; RUN: opt -loop-accesses -analyze < %s | FileCheck %s
-; 3 reads and 3 writes should need 12 memchecks
-
target datalayout = "e-m:e-i64:64-i128:128-n32:64-S128"
target triple = "aarch64--linux-gnueabi"
+; 3 reads and 3 writes should need 12 memchecks
+; CHECK: function 'testf':
; CHECK: Memory dependences are safe with run-time checks
-; Memory dependecies have labels starting from 0, so in
+
+; Memory dependencies have labels starting from 0, so in
; order to verify that we have n checks, we look for
; (n-1): and not n:.
; CHECK: Run-time memory checks:
-; CHECK-NEXT: 0:
-; CHECK: 11:
-; CHECK-NOT: 12:
+; CHECK-NEXT: Check 0:
+; CHECK: Check 11:
+; CHECK-NOT: Check 12:
define void @testf(i16* %a,
i16* %b,
@@ -56,3 +57,226 @@ for.body: ; preds = %for.body, %entry
for.end: ; preds = %for.body
ret void
}
+
+; The following (testg and testh) check that we can group
+; memory checks of accesses which differ by a constant value.
+; Both tests are based on the following C code:
+;
+; void testh(short *a, short *b, short *c) {
+; unsigned long ind = 0;
+; for (unsigned long ind = 0; ind < 20; ++ind) {
+; c[2 * ind] = a[ind] * a[ind + 1];
+; c[2 * ind + 1] = a[ind] * a[ind + 1] * b[ind];
+; }
+; }
+;
+; It is sufficient to check the intervals
+; [a, a + 21], [b, b + 20] against [c, c + 41].
+
+; 3 reads and 2 writes - two of the reads can be merged,
+; and the writes can be merged as well. This gives us a
+; total of 2 memory checks.
+
+; CHECK: function 'testg':
+
+; CHECK: Run-time memory checks:
+; CHECK-NEXT: Check 0:
+; CHECK-NEXT: Comparing group 0:
+; CHECK-NEXT: %arrayidxC1 = getelementptr inbounds i16, i16* %c, i64 %store_ind_inc
+; CHECK-NEXT: %arrayidxC = getelementptr inbounds i16, i16* %c, i64 %store_ind
+; CHECK-NEXT: Against group 1:
+; CHECK-NEXT: %arrayidxA1 = getelementptr inbounds i16, i16* %a, i64 %add
+; CHECK-NEXT: %arrayidxA = getelementptr inbounds i16, i16* %a, i64 %ind
+; CHECK-NEXT: Check 1:
+; CHECK-NEXT: Comparing group 0:
+; CHECK-NEXT: %arrayidxC1 = getelementptr inbounds i16, i16* %c, i64 %store_ind_inc
+; CHECK-NEXT: %arrayidxC = getelementptr inbounds i16, i16* %c, i64 %store_ind
+; CHECK-NEXT: Against group 2:
+; CHECK-NEXT: %arrayidxB = getelementptr inbounds i16, i16* %b, i64 %ind
+; CHECK-NEXT: Grouped accesses:
+; CHECK-NEXT: Group 0:
+; CHECK-NEXT: (Low: %c High: (78 + %c))
+; CHECK-NEXT: Member: {(2 + %c),+,4}
+; CHECK-NEXT: Member: {%c,+,4}
+; CHECK-NEXT: Group 1:
+; CHECK-NEXT: (Low: %a High: (40 + %a))
+; CHECK-NEXT: Member: {(2 + %a),+,2}
+; CHECK-NEXT: Member: {%a,+,2}
+; CHECK-NEXT: Group 2:
+; CHECK-NEXT: (Low: %b High: (38 + %b))
+; CHECK-NEXT: Member: {%b,+,2}
+
+define void @testg(i16* %a,
+ i16* %b,
+ i16* %c) {
+entry:
+ br label %for.body
+
+for.body: ; preds = %for.body, %entry
+ %ind = phi i64 [ 0, %entry ], [ %add, %for.body ]
+ %store_ind = phi i64 [ 0, %entry ], [ %store_ind_next, %for.body ]
+
+ %add = add nuw nsw i64 %ind, 1
+ %store_ind_inc = add nuw nsw i64 %store_ind, 1
+ %store_ind_next = add nuw nsw i64 %store_ind_inc, 1
+
+ %arrayidxA = getelementptr inbounds i16, i16* %a, i64 %ind
+ %loadA = load i16, i16* %arrayidxA, align 2
+
+ %arrayidxA1 = getelementptr inbounds i16, i16* %a, i64 %add
+ %loadA1 = load i16, i16* %arrayidxA1, align 2
+
+ %arrayidxB = getelementptr inbounds i16, i16* %b, i64 %ind
+ %loadB = load i16, i16* %arrayidxB, align 2
+
+ %mul = mul i16 %loadA, %loadA1
+ %mul1 = mul i16 %mul, %loadB
+
+ %arrayidxC = getelementptr inbounds i16, i16* %c, i64 %store_ind
+ store i16 %mul1, i16* %arrayidxC, align 2
+
+ %arrayidxC1 = getelementptr inbounds i16, i16* %c, i64 %store_ind_inc
+ store i16 %mul, i16* %arrayidxC1, align 2
+
+ %exitcond = icmp eq i64 %add, 20
+ br i1 %exitcond, label %for.end, label %for.body
+
+for.end: ; preds = %for.body
+ ret void
+}
+
+; 3 reads and 2 writes - the writes can be merged into a single
+; group, but the GEPs used for the reads are not marked as inbounds.
+; We can still merge them because we are using a unit stride for
+; accesses, so we cannot overflow the GEPs.
+
+; CHECK: function 'testh':
+; CHECK: Run-time memory checks:
+; CHECK-NEXT: Check 0:
+; CHECK-NEXT: Comparing group 0:
+; CHECK-NEXT: %arrayidxC1 = getelementptr inbounds i16, i16* %c, i64 %store_ind_inc
+; CHECK-NEXT: %arrayidxC = getelementptr inbounds i16, i16* %c, i64 %store_ind
+; CHECK-NEXT: Against group 1:
+; CHECK-NEXT: %arrayidxA1 = getelementptr i16, i16* %a, i64 %add
+; CHECK-NEXT: %arrayidxA = getelementptr i16, i16* %a, i64 %ind
+; CHECK-NEXT: Check 1:
+; CHECK-NEXT: Comparing group 0:
+; CHECK-NEXT: %arrayidxC1 = getelementptr inbounds i16, i16* %c, i64 %store_ind_inc
+; CHECK-NEXT: %arrayidxC = getelementptr inbounds i16, i16* %c, i64 %store_ind
+; CHECK-NEXT: Against group 2:
+; CHECK-NEXT: %arrayidxB = getelementptr i16, i16* %b, i64 %ind
+; CHECK-NEXT: Grouped accesses:
+; CHECK-NEXT: Group 0:
+; CHECK-NEXT: (Low: %c High: (78 + %c))
+; CHECK-NEXT: Member: {(2 + %c),+,4}
+; CHECK-NEXT: Member: {%c,+,4}
+; CHECK-NEXT: Group 1:
+; CHECK-NEXT: (Low: %a High: (40 + %a))
+; CHECK-NEXT: Member: {(2 + %a),+,2}
+; CHECK-NEXT: Member: {%a,+,2}
+; CHECK-NEXT: Group 2:
+; CHECK-NEXT: (Low: %b High: (38 + %b))
+; CHECK-NEXT: Member: {%b,+,2}
+
+define void @testh(i16* %a,
+ i16* %b,
+ i16* %c) {
+entry:
+ br label %for.body
+
+for.body: ; preds = %for.body, %entry
+ %ind = phi i64 [ 0, %entry ], [ %add, %for.body ]
+ %store_ind = phi i64 [ 0, %entry ], [ %store_ind_next, %for.body ]
+
+ %add = add nuw nsw i64 %ind, 1
+ %store_ind_inc = add nuw nsw i64 %store_ind, 1
+ %store_ind_next = add nuw nsw i64 %store_ind_inc, 1
+
+ %arrayidxA = getelementptr i16, i16* %a, i64 %ind
+ %loadA = load i16, i16* %arrayidxA, align 2
+
+ %arrayidxA1 = getelementptr i16, i16* %a, i64 %add
+ %loadA1 = load i16, i16* %arrayidxA1, align 2
+
+ %arrayidxB = getelementptr i16, i16* %b, i64 %ind
+ %loadB = load i16, i16* %arrayidxB, align 2
+
+ %mul = mul i16 %loadA, %loadA1
+ %mul1 = mul i16 %mul, %loadB
+
+ %arrayidxC = getelementptr inbounds i16, i16* %c, i64 %store_ind
+ store i16 %mul1, i16* %arrayidxC, align 2
+
+ %arrayidxC1 = getelementptr inbounds i16, i16* %c, i64 %store_ind_inc
+ store i16 %mul, i16* %arrayidxC1, align 2
+
+ %exitcond = icmp eq i64 %add, 20
+ br i1 %exitcond, label %for.end, label %for.body
+
+for.end: ; preds = %for.body
+ ret void
+}
+
+; Don't merge pointers if there is some other check which could be falsely
+; invalidated. For example, in the following loop:
+;
+; for (i = 0; i < 5000; ++i)
+; a[i + offset] = a[i] + a[i + 10000]
+;
+; we should not merge the intervals associated with the reads (0,5000) and
+; (10000, 15000) into (0, 15000) as this will potentially fail the check
+; against the interval associated with the write.
+
+; CHECK: function 'testi':
+; CHECK: Run-time memory checks:
+; CHECK-NEXT: Check 0:
+; CHECK-NEXT: Comparing group 0:
+; CHECK-NEXT: %storeidx = getelementptr inbounds i16, i16* %a, i64 %store_ind
+; CHECK-NEXT: Against group 1:
+; CHECK-NEXT: %arrayidxA1 = getelementptr i16, i16* %a, i64 %ind
+; CHECK-NEXT: Check 1:
+; CHECK-NEXT: Comparing group 0:
+; CHECK-NEXT: %storeidx = getelementptr inbounds i16, i16* %a, i64 %store_ind
+; CHECK-NEXT: Against group 2:
+; CHECK-NEXT: %arrayidxA2 = getelementptr i16, i16* %a, i64 %ind2
+; CHECK-NEXT: Grouped accesses:
+; CHECK-NEXT: Group 0:
+; CHECK-NEXT: (Low: ((2 * %offset) + %a) High: (9998 + (2 * %offset) + %a))
+; CHECK-NEXT: Member: {((2 * %offset) + %a),+,2}<nsw><%for.body>
+; CHECK-NEXT: Group 1:
+; CHECK-NEXT: (Low: %a High: (9998 + %a))
+; CHECK-NEXT: Member: {%a,+,2}<%for.body>
+; CHECK-NEXT: Group 2:
+; CHECK-NEXT: (Low: (20000 + %a) High: (29998 + %a))
+; CHECK-NEXT: Member: {(20000 + %a),+,2}<%for.body>
+
+define void @testi(i16* %a,
+ i64 %offset) {
+entry:
+ br label %for.body
+
+for.body: ; preds = %for.body, %entry
+ %ind = phi i64 [ 0, %entry ], [ %add, %for.body ]
+ %store_ind = phi i64 [ %offset, %entry ], [ %store_ind_inc, %for.body ]
+
+ %add = add nuw nsw i64 %ind, 1
+ %store_ind_inc = add nuw nsw i64 %store_ind, 1
+
+ %arrayidxA1 = getelementptr i16, i16* %a, i64 %ind
+ %ind2 = add nuw nsw i64 %ind, 10000
+ %arrayidxA2 = getelementptr i16, i16* %a, i64 %ind2
+
+ %loadA1 = load i16, i16* %arrayidxA1, align 2
+ %loadA2 = load i16, i16* %arrayidxA2, align 2
+
+ %addres = add i16 %loadA1, %loadA2
+
+ %storeidx = getelementptr inbounds i16, i16* %a, i64 %store_ind
+ store i16 %addres, i16* %storeidx, align 2
+
+ %exitcond = icmp eq i64 %add, 5000
+ br i1 %exitcond, label %for.end, label %for.body
+
+for.end: ; preds = %for.body
+ ret void
+}
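The check counts asserted in the comments above follow from simple pair counting: every pair of pointer groups in which at least one member may be written needs a run-time check. The stand-alone C sketch below (the helper name memcheck_count is invented for illustration, not part of LLVM) reproduces the 12 checks expected for testf and the 2 checks left after grouping in testg/testh, assuming each write group is checked against every other group.

/* Rough sketch of the pair counting behind the comments above: for W write
 * groups and R read groups, W*R checks between writes and reads plus
 * W*(W-1)/2 checks among the writes themselves. */
#include <stdio.h>

static unsigned memcheck_count(unsigned writes, unsigned reads) {
  return writes * reads + writes * (writes - 1) / 2;
}

int main(void) {
  /* testf: 3 reads, 3 writes -> 12 checks, matching "Check 11:" above. */
  printf("%u\n", memcheck_count(3, 3));
  /* testg/testh after grouping: 1 write group vs 2 read groups -> 2 checks. */
  printf("%u\n", memcheck_count(1, 2));
  return 0;
}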
diff --git a/test/Analysis/LoopAccessAnalysis/pointer-with-unknown-bounds.ll b/test/Analysis/LoopAccessAnalysis/pointer-with-unknown-bounds.ll
new file mode 100644
index 000000000000..d05849e2be2d
--- /dev/null
+++ b/test/Analysis/LoopAccessAnalysis/pointer-with-unknown-bounds.ll
@@ -0,0 +1,42 @@
+; RUN: opt -loop-accesses -analyze < %s | FileCheck %s
+
+target datalayout = "e-m:o-i64:64-f80:128-n8:16:32:64-S128"
+
+; We shouldn't quit the analysis if we encounter a pointer without known
+; bounds *unless* we actually need to emit a memcheck for it. (We only
+; compute bounds for SCEVAddRecs so A[i*i] is deemed not to have known bounds.)
+;
+; for (i = 0; i < 20; ++i)
+; A[i*i] *= 2;
+
+; CHECK: for.body:
+; CHECK: Report: unsafe dependent memory operations in loop
+; CHECK-NOT: Report: cannot identify array bounds
+; CHECK: Interesting Dependences:
+; CHECK: Unknown:
+; CHECK: %loadA = load i16, i16* %arrayidxA, align 2 ->
+; CHECK: store i16 %mul, i16* %arrayidxA, align 2
+
+define void @f(i16* %a) {
+entry:
+ br label %for.body
+
+for.body: ; preds = %for.body, %entry
+ %ind = phi i64 [ 0, %entry ], [ %add, %for.body ]
+
+ %access_ind = mul i64 %ind, %ind
+
+ %arrayidxA = getelementptr inbounds i16, i16* %a, i64 %access_ind
+ %loadA = load i16, i16* %arrayidxA, align 2
+
+ %mul = mul i16 %loadA, 2
+
+ store i16 %mul, i16* %arrayidxA, align 2
+
+ %add = add nuw nsw i64 %ind, 1
+ %exitcond = icmp eq i64 %add, 20
+ br i1 %exitcond, label %for.end, label %for.body
+
+for.end: ; preds = %for.body
+ ret void
+}
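The parenthetical remark in the test comment hinges on the subscript shape: an affine subscript forms an add-recurrence whose range can be bounded, while a quadratic subscript does not. The plain C contrast below is purely illustrative (it is not part of the test, and the function names are made up).

/* Illustrative contrast for the comment above: the first subscript is an
 * affine recurrence, so the accessed range is easy to bound; the second
 * (i*i) is not an add-recurrence, so no bounds are computed for it. */
void affine(short *A) {
  for (long i = 0; i < 20; ++i)
    A[3 * i] *= 2;      /* stride 3: the range [A, A + 3*19] is known */
}

void non_affine(short *A) {
  for (long i = 0; i < 20; ++i)
    A[i * i] *= 2;      /* quadratic subscript: no known bounds */
}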
diff --git a/test/Analysis/LoopAccessAnalysis/resort-to-memchecks-only.ll b/test/Analysis/LoopAccessAnalysis/resort-to-memchecks-only.ll
index 64f7729fa18a..e7305173dd95 100644
--- a/test/Analysis/LoopAccessAnalysis/resort-to-memchecks-only.ll
+++ b/test/Analysis/LoopAccessAnalysis/resort-to-memchecks-only.ll
@@ -15,7 +15,9 @@ target triple = "x86_64-apple-macosx10.10.0"
; CHECK-NEXT: Interesting Dependences:
; CHECK-NEXT: Run-time memory checks:
; CHECK-NEXT: 0:
+; CHECK-NEXT: Comparing group
; CHECK-NEXT: %arrayidxA2 = getelementptr inbounds i16, i16* %a, i64 %idx
+; CHECK-NEXT: Against group
; CHECK-NEXT: %arrayidxA = getelementptr inbounds i16, i16* %a, i64 %indvar
@B = common global i16* null, align 8
diff --git a/test/Analysis/LoopAccessAnalysis/unsafe-and-rt-checks.ll b/test/Analysis/LoopAccessAnalysis/unsafe-and-rt-checks.ll
index ce8b86ba2c51..237cbc8b9873 100644
--- a/test/Analysis/LoopAccessAnalysis/unsafe-and-rt-checks.ll
+++ b/test/Analysis/LoopAccessAnalysis/unsafe-and-rt-checks.ll
@@ -14,10 +14,16 @@ target triple = "x86_64-apple-macosx10.10.0"
; CHECK-NEXT: store i16 %mul1, i16* %arrayidxA_plus_2, align 2
; CHECK: Run-time memory checks:
; CHECK-NEXT: 0:
+; CHECK-NEXT: Comparing group
+; CHECK-NEXT: %arrayidxA = getelementptr inbounds i16, i16* %a, i64 %storemerge3
; CHECK-NEXT: %arrayidxA_plus_2 = getelementptr inbounds i16, i16* %a, i64 %add
+; CHECK-NEXT: Against group
; CHECK-NEXT: %arrayidxB = getelementptr inbounds i16, i16* %b, i64 %storemerge3
; CHECK-NEXT: 1:
+; CHECK-NEXT: Comparing group
+; CHECK-NEXT: %arrayidxA = getelementptr inbounds i16, i16* %a, i64 %storemerge3
; CHECK-NEXT: %arrayidxA_plus_2 = getelementptr inbounds i16, i16* %a, i64 %add
+; CHECK-NEXT: Against group
; CHECK-NEXT: %arrayidxC = getelementptr inbounds i16, i16* %c, i64 %storemerge3
@B = common global i16* null, align 8
diff --git a/test/Assembler/getelementptr_vec_idx1.ll b/test/Assembler/getelementptr_vec_idx1.ll
index 084a31e7234e..12160574d4e5 100644
--- a/test/Assembler/getelementptr_vec_idx1.ll
+++ b/test/Assembler/getelementptr_vec_idx1.ll
@@ -1,8 +1,8 @@
; RUN: not llvm-as < %s >/dev/null 2> %t
; RUN: FileCheck %s < %t
-; Test that a vector index is only used with a vector pointer.
+; Test that a vector GEP may be used with a scalar base; the result is a vector of pointers.
-; CHECK: getelementptr index type missmatch
+; CHECK: '%w' defined with type '<2 x i32*>
define i32 @test(i32* %a) {
%w = getelementptr i32, i32* %a, <2 x i32> <i32 5, i32 9>
diff --git a/test/Assembler/getelementptr_vec_idx2.ll b/test/Assembler/getelementptr_vec_idx2.ll
index 638fcb8b67dd..be294098c9eb 100644
--- a/test/Assembler/getelementptr_vec_idx2.ll
+++ b/test/Assembler/getelementptr_vec_idx2.ll
@@ -1,10 +1,24 @@
; RUN: not llvm-as < %s >/dev/null 2> %t
; RUN: FileCheck %s < %t
-; Test that a vector pointer is only used with a vector index.
+; Test that a vector pointer may be used with a scalar index.
+; Test that a vector pointer and a vector index must have the same vector width.
-; CHECK: getelementptr index type missmatch
-
-define <2 x i32> @test(<2 x i32*> %a) {
+; This code is correct
+define <2 x i32*> @test2(<2 x i32*> %a) {
%w = getelementptr i32, <2 x i32*> %a, i32 2
+ ret <2 x i32*> %w
+}
+
+; This code is correct
+define <2 x i32*> @test3(i32* %a) {
+ %w = getelementptr i32, i32* %a, <2 x i32> <i32 2, i32 2>
+ ret <2 x i32*> %w
+}
+
+; CHECK: getelementptr vector index has a wrong number of elements
+
+define <2 x i32> @test1(<2 x i32*> %a) {
+ %w = getelementptr i32, <2 x i32*> %a, <4 x i32><i32 2, i32 2, i32 2, i32 2>
ret <2 x i32> %w
}
+
diff --git a/test/Assembler/getelementptr_vec_idx3.ll b/test/Assembler/getelementptr_vec_idx3.ll
index ac94459e23d4..767c817cc629 100644
--- a/test/Assembler/getelementptr_vec_idx3.ll
+++ b/test/Assembler/getelementptr_vec_idx3.ll
@@ -1,8 +1,8 @@
; RUN: not llvm-as < %s >/dev/null 2> %t
; RUN: FileCheck %s < %t
-; Test that vector indices have the same number of elements as the pointer.
+; Test that a vector GEP may be used with a scalar base; the result is a vector of pointers.
-; CHECK: getelementptr index type missmatch
+; CHECK: '%w' defined with type '<2 x <4 x i32>*>'
define <4 x i32> @test(<4 x i32>* %a) {
%w = getelementptr <4 x i32>, <4 x i32>* %a, <2 x i32> <i32 5, i32 9>
diff --git a/test/Bitcode/attributes.ll b/test/Bitcode/attributes.ll
index cae6a2e01e6f..a0bc66642f7d 100644
--- a/test/Bitcode/attributes.ll
+++ b/test/Bitcode/attributes.ll
@@ -204,7 +204,7 @@ define void @f34()
; CHECK: define void @f34()
{
call void @nobuiltin() nobuiltin
-; CHECK: call void @nobuiltin() #26
+; CHECK: call void @nobuiltin() #27
ret void;
}
@@ -256,6 +256,12 @@ define void @f43() convergent {
ret void
}
+define void @f44() argmemonly
+; CHECK: define void @f44() #26
+{
+ ret void;
+}
+
; CHECK: attributes #0 = { noreturn }
; CHECK: attributes #1 = { nounwind }
; CHECK: attributes #2 = { readnone }
@@ -282,4 +288,5 @@ define void @f43() convergent {
; CHECK: attributes #23 = { noinline optnone }
; CHECK: attributes #24 = { jumptable }
; CHECK: attributes #25 = { convergent }
-; CHECK: attributes #26 = { nobuiltin }
+; CHECK: attributes #26 = { argmemonly }
+; CHECK: attributes #27 = { nobuiltin }
diff --git a/test/Bitcode/fcmp-fast.ll b/test/Bitcode/fcmp-fast.ll
new file mode 100644
index 000000000000..126e3652e3ac
--- /dev/null
+++ b/test/Bitcode/fcmp-fast.ll
@@ -0,0 +1,23 @@
+; RUN: llvm-as < %s | llvm-dis > %t0
+; RUN: opt -S < %s > %t1
+; RUN: diff %t0 %t1
+; RUN: FileCheck < %t1 %s
+
+; Make sure flags on fcmp instructions are serialized/deserialized properly.
+
+define i1 @foo(float %a, float %b, double %c, double %d) {
+ ; CHECK: %plain = fcmp ueq float %a, %b
+ %plain = fcmp ueq float %a, %b
+ ; CHECK: %fast = fcmp fast olt float %a, %b
+ %fast = fcmp fast olt float %a, %b
+ ; CHECK: %nsz = fcmp nsz uge float %a, %b
+ %nsz = fcmp nsz uge float %a, %b
+ ; CHECK: %nnan = fcmp nnan nsz oge double %c, %d
+ %nnan = fcmp nnan nsz oge double %c, %d
+
+ %dce1 = or i1 %plain, %fast
+ %dce2 = or i1 %dce1, %nsz
+ %dce3 = or i1 %dce2, %nnan
+
+ ret i1 %dce3
+}
diff --git a/test/CodeGen/AArch64/aarch64-dynamic-stack-layout.ll b/test/CodeGen/AArch64/aarch64-dynamic-stack-layout.ll
index a31c66bad4be..739570236da9 100644
--- a/test/CodeGen/AArch64/aarch64-dynamic-stack-layout.ll
+++ b/test/CodeGen/AArch64/aarch64-dynamic-stack-layout.ll
@@ -255,7 +255,7 @@ entry:
; CHECK: ubfx x9, x0, #0, #32
; CHECK: lsl x9, x9, #2
; CHECK: add x9, x9, #15
-; CHECK: and x9, x9, #0xfffffffffffffff0
+; CHECK: and x9, x9, #0x7fffffff0
; CHECK: mov x10, sp
; CHECK: sub x[[VLASPTMP:[0-9]+]], x10, x9
; CHECK: mov sp, x[[VLASPTMP]]
@@ -302,7 +302,7 @@ entry:
; CHECK: ubfx x9, x0, #0, #32
; CHECK: lsl x9, x9, #2
; CHECK: add x9, x9, #15
-; CHECK: and x9, x9, #0xfffffffffffffff0
+; CHECK: and x9, x9, #0x7fffffff0
; CHECK: mov x10, sp
; CHECK: sub x[[VLASPTMP:[0-9]+]], x10, x9
; CHECK: mov sp, x[[VLASPTMP]]
@@ -364,7 +364,7 @@ entry:
; CHECK: ubfx x9, x0, #0, #32
; CHECK: lsl x9, x9, #2
; CHECK: add x9, x9, #15
-; CHECK: and x9, x9, #0xfffffffffffffff0
+; CHECK: and x9, x9, #0x7fffffff0
; CHECK: mov x10, sp
; CHECK: sub x[[VLASPTMP:[0-9]+]], x10, x9
; CHECK: mov sp, x[[VLASPTMP]]
@@ -417,7 +417,7 @@ entry:
; CHECK: ubfx x9, x0, #0, #32
; CHECK: lsl x9, x9, #2
; CHECK: add x9, x9, #15
-; CHECK: and x9, x9, #0xfffffffffffffff0
+; CHECK: and x9, x9, #0x7fffffff0
; CHECK: mov x10, sp
; CHECK: sub x[[VLASPTMP:[0-9]+]], x10, x9
; CHECK: mov sp, x[[VLASPTMP]]
@@ -468,7 +468,7 @@ entry:
; CHECK: ubfx x9, x0, #0, #32
; CHECK: lsl x9, x9, #2
; CHECK: add x9, x9, #15
-; CHECK: and x9, x9, #0xfffffffffffffff0
+; CHECK: and x9, x9, #0x7fffffff0
; CHECK: mov x10, sp
; CHECK: sub x[[VLASPTMP:[0-9]+]], x10, x9
; CHECK: mov sp, x[[VLASPTMP]]
@@ -482,6 +482,56 @@ entry:
; CHECK: ldp x20, x19, [sp], #32
; CHECK: ret
+
+define void @realign_conditional(i1 %b) {
+entry:
+ br i1 %b, label %bb0, label %bb1
+
+bb0:
+ %MyAlloca = alloca i8, i64 64, align 32
+ br label %bb1
+
+bb1:
+ ret void
+}
+
+; CHECK-LABEL: realign_conditional
+; No realignment in the prologue.
+; CHECK-NOT: and
+; CHECK-NOT: 0xffffffffffffffe0
+; CHECK: tbz {{.*}} .[[LABEL:.*]]
+; Stack is realigned in a non-entry BB.
+; CHECK: sub [[REG:x[01-9]+]], sp, #64
+; CHECK: and sp, [[REG]], #0xffffffffffffffe0
+; CHECK: .[[LABEL]]:
+; CHECK: ret
+
+
+define void @realign_conditional2(i1 %b) {
+entry:
+ %tmp = alloca i8, i32 4
+ br i1 %b, label %bb0, label %bb1
+
+bb0:
+ %MyAlloca = alloca i8, i64 64, align 32
+ br label %bb1
+
+bb1:
+ ret void
+}
+
+; CHECK-LABEL: realign_conditional2
+; Extra realignment in the prologue (performance issue).
+; CHECK: sub x9, sp, #32 // =32
+; CHECK: and sp, x9, #0xffffffffffffffe0
+; CHECK: mov x19, sp
+; CHECK: tbz {{.*}} .[[LABEL:.*]]
+; Stack is realigned in a non-entry BB.
+; CHECK: sub [[REG:x[01-9]+]], sp, #64
+; CHECK: and sp, [[REG]], #0xffffffffffffffe0
+; CHECK: .[[LABEL]]:
+; CHECK: ret
+
attributes #0 = { "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
attributes #1 = { nounwind "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
diff --git a/test/CodeGen/AArch64/arm64-nvcast.ll b/test/CodeGen/AArch64/arm64-nvcast.ll
new file mode 100644
index 000000000000..3cb1bf25fc34
--- /dev/null
+++ b/test/CodeGen/AArch64/arm64-nvcast.ll
@@ -0,0 +1,29 @@
+; RUN: llc < %s -mtriple=arm64-apple-ios | FileCheck %s
+
+; CHECK-LABEL: _test:
+; CHECK: fmov.2d v0, #2.00000000
+; CHECK: str q0, [sp]
+; CHECK: mov x8, sp
+; CHECK: ldr s0, [x8, w1, sxtw #2]
+; CHECK: str s0, [x0]
+
+define void @test(float * %p1, i32 %v1) {
+entry:
+ %v2 = extractelement <3 x float> <float 0.000000e+00, float 2.000000e+00, float 0.000000e+00>, i32 %v1
+ store float %v2, float* %p1, align 4
+ ret void
+}
+
+; CHECK-LABEL: _test2
+; CHECK: movi.16b v0, #0x3f
+; CHECK: str q0, [sp]
+; CHECK: mov x8, sp
+; CHECK: ldr s0, [x8, w1, sxtw #2]
+; CHECK: str s0, [x0]
+
+define void @test2(float * %p1, i32 %v1) {
+entry:
+ %v2 = extractelement <3 x float> <float 0.7470588088035583, float 0.7470588088035583, float 0.7470588088035583>, i32 %v1
+ store float %v2, float* %p1, align 4
+ ret void
+}
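The movi.16b v0, #0x3f expected for test2 comes from the bit pattern of the splatted float: a 32-bit float whose four bytes are all 0x3f is 0x3f3f3f3f, which is 0.7470588088035583 when widened to double (byte order does not matter here since all bytes are equal). A small stand-alone C check of that value, offered only as an illustration:

/* Sanity check (not part of the test): 0x3f3f3f3f reinterpreted as a
 * 32-bit float is approximately 0.7470588088, the constant splatted in
 * test2, which is why the backend can materialize it with movi.16b #0x3f. */
#include <assert.h>
#include <stdint.h>
#include <string.h>

int main(void) {
  uint32_t bits = 0x3f3f3f3fu;
  float f;
  memcpy(&f, &bits, sizeof f);   /* portable bit cast */
  assert(f > 0.747058f && f < 0.747059f);
  return 0;
}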
diff --git a/test/CodeGen/AArch64/arm64-shrink-wrapping.ll b/test/CodeGen/AArch64/arm64-shrink-wrapping.ll
index c1777513fa04..599712be401c 100644
--- a/test/CodeGen/AArch64/arm64-shrink-wrapping.ll
+++ b/test/CodeGen/AArch64/arm64-shrink-wrapping.ll
@@ -500,3 +500,42 @@ if.end: ; preds = %if.else, %if.then
}
declare i32 @someVariadicFunc(i32, ...)
+
+; Make sure we do not insert unreachable code after a noreturn function.
+; Although it is not incorrect to insert such code, it is useless
+; and it hurts the binary size.
+;
+; CHECK-LABEL: noreturn:
+; DISABLE: stp
+;
+; CHECK: and [[TEST:w[0-9]+]], w0, #0xff
+; CHECK-NEXT: cbnz [[TEST]], [[ABORT:LBB[0-9_]+]]
+;
+; CHECK: movz w0, #0x2a
+;
+; DISABLE-NEXT: ldp
+;
+; CHECK-NEXT: ret
+;
+; CHECK: [[ABORT]]: ; %if.abort
+;
+; ENABLE: stp
+;
+; CHECK: bl _abort
+; ENABLE-NOT: ldp
+define i32 @noreturn(i8 signext %bad_thing) {
+entry:
+ %tobool = icmp eq i8 %bad_thing, 0
+ br i1 %tobool, label %if.end, label %if.abort
+
+if.abort:
+ tail call void @abort() #0
+ unreachable
+
+if.end:
+ ret i32 42
+}
+
+declare void @abort() #0
+
+attributes #0 = { noreturn nounwind }
diff --git a/test/CodeGen/AArch64/nest-register.ll b/test/CodeGen/AArch64/nest-register.ll
new file mode 100644
index 000000000000..9c659fb74ec4
--- /dev/null
+++ b/test/CodeGen/AArch64/nest-register.ll
@@ -0,0 +1,23 @@
+; RUN: llc -verify-machineinstrs < %s -mtriple=aarch64-none-linux-gnu | FileCheck %s
+
+; Tests that the 'nest' parameter attribute causes the relevant parameter to be
+; passed in the right register.
+
+define i8* @nest_receiver(i8* nest %arg) nounwind {
+; CHECK-LABEL: nest_receiver:
+; CHECK-NEXT: // BB#0:
+; CHECK-NEXT: mov x0, x18
+; CHECK-NEXT: ret
+
+ ret i8* %arg
+}
+
+define i8* @nest_caller(i8* %arg) nounwind {
+; CHECK-LABEL: nest_caller:
+; CHECK: mov x18, x0
+; CHECK-NEXT: bl nest_receiver
+; CHECK: ret
+
+ %result = call i8* @nest_receiver(i8* nest %arg)
+ ret i8* %result
+}
diff --git a/test/CodeGen/AArch64/xbfiz.ll b/test/CodeGen/AArch64/xbfiz.ll
new file mode 100644
index 000000000000..f763400d7f6a
--- /dev/null
+++ b/test/CodeGen/AArch64/xbfiz.ll
@@ -0,0 +1,33 @@
+; RUN: llc -mtriple=arm64-apple-ios < %s | FileCheck %s
+
+define i64 @sbfiz64(i64 %v) {
+; CHECK-LABEL: sbfiz64:
+; CHECK: sbfiz x0, x0, #1, #16
+ %shl = shl i64 %v, 48
+ %shr = ashr i64 %shl, 47
+ ret i64 %shr
+}
+
+define i32 @sbfiz32(i32 %v) {
+; CHECK-LABEL: sbfiz32:
+; CHECK: sbfiz w0, w0, #1, #14
+ %shl = shl i32 %v, 18
+ %shr = ashr i32 %shl, 17
+ ret i32 %shr
+}
+
+define i64 @ubfiz64(i64 %v) {
+; CHECK-LABEL: ubfiz64:
+; CHECK: ubfiz x0, x0, #36, #11
+ %shl = shl i64 %v, 53
+ %shr = lshr i64 %shl, 17
+ ret i64 %shr
+}
+
+define i32 @ubfiz32(i32 %v) {
+; CHECK-LABEL: ubfiz32:
+; CHECK: ubfiz w0, w0, #6, #24
+ %shl = shl i32 %v, 8
+ %shr = lshr i32 %shl, 2
+ ret i32 %shr
+}
diff --git a/test/CodeGen/AMDGPU/array-ptr-calc-i64.ll b/test/CodeGen/AMDGPU/array-ptr-calc-i64.ll
index eae095eb8449..a3ae3c3aea16 100644
--- a/test/CodeGen/AMDGPU/array-ptr-calc-i64.ll
+++ b/test/CodeGen/AMDGPU/array-ptr-calc-i64.ll
@@ -3,8 +3,9 @@
declare i32 @llvm.SI.tid() readnone
; SI-LABEL: {{^}}test_array_ptr_calc:
-; SI: v_mul_lo_i32
-; SI: v_mul_hi_i32
+; SI-DAG: v_mul_lo_i32
+; SI-DAG: v_mul_hi_i32
+; SI: s_endpgm
define void @test_array_ptr_calc(i32 addrspace(1)* noalias %out, [1025 x i32] addrspace(1)* noalias %inA, i32 addrspace(1)* noalias %inB) {
%tid = call i32 @llvm.SI.tid() readnone
%a_ptr = getelementptr [1025 x i32], [1025 x i32] addrspace(1)* %inA, i32 %tid, i32 0
diff --git a/test/CodeGen/AMDGPU/ds-negative-offset-addressing-mode-loop.ll b/test/CodeGen/AMDGPU/ds-negative-offset-addressing-mode-loop.ll
index e7e13d6178c4..5e4654abd91b 100644
--- a/test/CodeGen/AMDGPU/ds-negative-offset-addressing-mode-loop.ll
+++ b/test/CodeGen/AMDGPU/ds-negative-offset-addressing-mode-loop.ll
@@ -1,5 +1,6 @@
-; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs -mattr=+load-store-opt -enable-misched < %s | FileCheck -check-prefix=SI --check-prefix=CHECK %s
-; RUN: llc -march=amdgcn -mcpu=bonaire -verify-machineinstrs -mattr=+load-store-opt -enable-misched < %s | FileCheck -check-prefix=CI --check-prefix=CHECK %s
+; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs -mattr=+load-store-opt < %s | FileCheck -check-prefix=SI --check-prefix=CHECK %s
+; RUN: llc -march=amdgcn -mcpu=bonaire -verify-machineinstrs -mattr=+load-store-opt < %s | FileCheck -check-prefix=CI --check-prefix=CHECK %s
+; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs -mattr=+load-store-opt,+unsafe-ds-offset-folding < %s | FileCheck -check-prefix=CI --check-prefix=CHECK %s
declare i32 @llvm.r600.read.tidig.x() #0
declare void @llvm.AMDGPU.barrier.local() #1
diff --git a/test/CodeGen/AMDGPU/ds_read2.ll b/test/CodeGen/AMDGPU/ds_read2.ll
index 5929898f8bd8..ec04f8b1acd6 100644
--- a/test/CodeGen/AMDGPU/ds_read2.ll
+++ b/test/CodeGen/AMDGPU/ds_read2.ll
@@ -1,10 +1,10 @@
-; RUN: llc -march=amdgcn -mcpu=bonaire -verify-machineinstrs -mattr=+load-store-opt -enable-misched < %s | FileCheck -strict-whitespace -check-prefix=SI %s
+; RUN: llc -march=amdgcn -mcpu=bonaire -verify-machineinstrs -mattr=+load-store-opt < %s | FileCheck -strict-whitespace -check-prefix=SI %s
; FIXME: We don't get cases where the address was an SGPR because we
; get a copy to the address register for each one.
@lds = addrspace(3) global [512 x float] undef, align 4
- @lds.f64 = addrspace(3) global [512 x double] undef, align 8
+@lds.f64 = addrspace(3) global [512 x double] undef, align 8
; SI-LABEL: @simple_read2_f32
; SI: ds_read2_b32 v{{\[}}[[LO_VREG:[0-9]+]]:[[HI_VREG:[0-9]+]]{{\]}}, v{{[0-9]+}} offset1:8
diff --git a/test/CodeGen/AMDGPU/ds_read2_offset_order.ll b/test/CodeGen/AMDGPU/ds_read2_offset_order.ll
index 9ea9a5a2617b..d362c46bbf96 100644
--- a/test/CodeGen/AMDGPU/ds_read2_offset_order.ll
+++ b/test/CodeGen/AMDGPU/ds_read2_offset_order.ll
@@ -1,16 +1,17 @@
-; RUN: llc -march=amdgcn -mcpu=bonaire -verify-machineinstrs -mattr=+load-store-opt -enable-misched < %s | FileCheck -strict-whitespace -check-prefix=SI %s
-; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs -mattr=+load-store-opt -enable-misched < %s | FileCheck -strict-whitespace -check-prefix=SI %s
+; RUN: llc -march=amdgcn -mcpu=bonaire -verify-machineinstrs -mattr=+load-store-opt < %s | FileCheck -strict-whitespace -check-prefix=SI %s
+; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs -mattr=+load-store-opt < %s | FileCheck -strict-whitespace -check-prefix=SI %s
-; XFAIL: *
@lds = addrspace(3) global [512 x float] undef, align 4
+; offset0 is larger than offset1
+
; SI-LABEL: {{^}}offset_order:
-; SI: ds_read_b32 v{{[0-9]+}}, v{{[0-9]+}} offset:56
-; SI: ds_read2st64_b32 v[{{[0-9]+}}:{{[0-9]+}}], v{{[0-9]+}} offset0:0 offset1:4
-; SI: ds_read2_b32 v[{{[0-9]+}}:{{[0-9]+}}], v{{[0-9]+}} offset0:2 offset1:3
-; SI: ds_read2_b32 v[{{[0-9]+}}:{{[0-9]+}}], v{{[0-9]+}} offset0:11 offset1:1
+; SI: ds_read2st64_b32 v[{{[0-9]+}}:{{[0-9]+}}], v{{[0-9]+}} offset1:4{{$}}
+; SI: ds_read2_b32 v[{{[0-9]+}}:{{[0-9]+}}], v{{[0-9]+}} offset0:3 offset1:2
+; SI: ds_read2_b32 v[{{[0-9]+}}:{{[0-9]+}}], v{{[0-9]+}} offset0:12 offset1:14
+; SI: ds_read_b32 v{{[0-9]+}}, v{{[0-9]+}} offset:44
define void @offset_order(float addrspace(1)* %out) {
entry:
diff --git a/test/CodeGen/AMDGPU/ds_read2_superreg.ll b/test/CodeGen/AMDGPU/ds_read2_superreg.ll
new file mode 100644
index 000000000000..842c2d8bc339
--- /dev/null
+++ b/test/CodeGen/AMDGPU/ds_read2_superreg.ll
@@ -0,0 +1,246 @@
+; RUN: llc -march=amdgcn -mcpu=bonaire -verify-machineinstrs -mattr=+load-store-opt < %s | FileCheck -check-prefix=CI %s
+
+@lds = addrspace(3) global [512 x float] undef, align 4
+@lds.v2 = addrspace(3) global [512 x <2 x float>] undef, align 4
+@lds.v3 = addrspace(3) global [512 x <3 x float>] undef, align 4
+@lds.v4 = addrspace(3) global [512 x <4 x float>] undef, align 4
+@lds.v8 = addrspace(3) global [512 x <8 x float>] undef, align 4
+@lds.v16 = addrspace(3) global [512 x <16 x float>] undef, align 4
+
+; CI-LABEL: {{^}}simple_read2_v2f32_superreg_align4:
+; CI: ds_read2_b32 [[RESULT:v\[[0-9]+:[0-9]+\]]], v{{[0-9]+}} offset1:1{{$}}
+; CI: s_waitcnt lgkmcnt(0)
+; CI: buffer_store_dwordx2 [[RESULT]]
+; CI: s_endpgm
+define void @simple_read2_v2f32_superreg_align4(<2 x float> addrspace(1)* %out) #0 {
+ %x.i = tail call i32 @llvm.r600.read.tidig.x() #1
+ %arrayidx0 = getelementptr inbounds [512 x <2 x float>], [512 x <2 x float>] addrspace(3)* @lds.v2, i32 0, i32 %x.i
+ %val0 = load <2 x float>, <2 x float> addrspace(3)* %arrayidx0, align 4
+ %out.gep = getelementptr inbounds <2 x float>, <2 x float> addrspace(1)* %out, i32 %x.i
+ store <2 x float> %val0, <2 x float> addrspace(1)* %out.gep
+ ret void
+}
+
+; CI-LABEL: {{^}}simple_read2_v2f32_superreg:
+; CI: ds_read_b64 [[RESULT:v\[[0-9]+:[0-9]+\]]], v{{[0-9]+}}{{$}}
+; CI: s_waitcnt lgkmcnt(0)
+; CI: buffer_store_dwordx2 [[RESULT]]
+; CI: s_endpgm
+define void @simple_read2_v2f32_superreg(<2 x float> addrspace(1)* %out) #0 {
+ %x.i = tail call i32 @llvm.r600.read.tidig.x() #1
+ %arrayidx0 = getelementptr inbounds [512 x <2 x float>], [512 x <2 x float>] addrspace(3)* @lds.v2, i32 0, i32 %x.i
+ %val0 = load <2 x float>, <2 x float> addrspace(3)* %arrayidx0
+ %out.gep = getelementptr inbounds <2 x float>, <2 x float> addrspace(1)* %out, i32 %x.i
+ store <2 x float> %val0, <2 x float> addrspace(1)* %out.gep
+ ret void
+}
+
+; FIXME: Shuffling to new superregister
+; CI-LABEL: {{^}}simple_read2_v4f32_superreg_align4:
+; CI-DAG: ds_read2_b32 v{{\[}}[[REG_W:[0-9]+]]:[[REG_Z:[0-9]+]]{{\]}}, v{{[0-9]+}} offset0:3 offset1:2{{$}}
+; CI-DAG: ds_read2_b32 v{{\[}}[[REG_Y:[0-9]+]]:[[REG_X:[0-9]+]]{{\]}}, v{{[0-9]+}} offset0:1{{$}}
+; CI-DAG: v_mov_b32_e32 v[[COPY_REG_Y:[0-9]+]], v[[REG_Y]]
+; CI-DAG: v_mov_b32_e32 v[[COPY_REG_Z:[0-9]+]], v[[REG_Z]]
+; CI-DAG: v_add_f32_e32 v[[ADD0:[0-9]+]], v[[COPY_REG_Z]], v[[REG_X]]
+; CI-DAG: v_add_f32_e32 v[[ADD1:[0-9]+]], v[[REG_W]], v[[COPY_REG_Y]]
+; CI: v_add_f32_e32 v[[ADD2:[0-9]+]], v[[ADD1]], v[[ADD0]]
+; CI: buffer_store_dword v[[ADD2]]
+; CI: s_endpgm
+define void @simple_read2_v4f32_superreg_align4(float addrspace(1)* %out) #0 {
+ %x.i = tail call i32 @llvm.r600.read.tidig.x() #1
+ %arrayidx0 = getelementptr inbounds [512 x <4 x float>], [512 x <4 x float>] addrspace(3)* @lds.v4, i32 0, i32 %x.i
+ %val0 = load <4 x float>, <4 x float> addrspace(3)* %arrayidx0, align 4
+ %elt0 = extractelement <4 x float> %val0, i32 0
+ %elt1 = extractelement <4 x float> %val0, i32 1
+ %elt2 = extractelement <4 x float> %val0, i32 2
+ %elt3 = extractelement <4 x float> %val0, i32 3
+
+ %add0 = fadd float %elt0, %elt2
+ %add1 = fadd float %elt1, %elt3
+ %add2 = fadd float %add0, %add1
+
+ %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i32 %x.i
+ store float %add2, float addrspace(1)* %out.gep
+ ret void
+}
+
+; CI-LABEL: {{^}}simple_read2_v3f32_superreg_align4:
+; CI-DAG: ds_read2_b32 v{{\[}}[[REG_X:[0-9]+]]:[[REG_Y:[0-9]+]]{{\]}}, v{{[0-9]+}} offset1:1{{$}}
+; CI-DAG: ds_read_b32 v[[REG_Z:[0-9]+]], v{{[0-9]+}} offset:8{{$}}
+; CI-DAG: v_add_f32_e32 v[[ADD0:[0-9]+]], v[[REG_Z]], v[[REG_X]]
+; CI-DAG: v_add_f32_e32 v[[ADD1:[0-9]+]], v[[REG_Y]], v[[ADD0]]
+; CI: buffer_store_dword v[[ADD1]]
+; CI: s_endpgm
+define void @simple_read2_v3f32_superreg_align4(float addrspace(1)* %out) #0 {
+ %x.i = tail call i32 @llvm.r600.read.tidig.x() #1
+ %arrayidx0 = getelementptr inbounds [512 x <3 x float>], [512 x <3 x float>] addrspace(3)* @lds.v3, i32 0, i32 %x.i
+ %val0 = load <3 x float>, <3 x float> addrspace(3)* %arrayidx0, align 4
+ %elt0 = extractelement <3 x float> %val0, i32 0
+ %elt1 = extractelement <3 x float> %val0, i32 1
+ %elt2 = extractelement <3 x float> %val0, i32 2
+
+ %add0 = fadd float %elt0, %elt2
+ %add1 = fadd float %add0, %elt1
+
+ %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i32 %x.i
+ store float %add1, float addrspace(1)* %out.gep
+ ret void
+}
+
+; CI-LABEL: {{^}}simple_read2_v4f32_superreg_align8:
+; CI-DAG: ds_read2_b32 v{{\[}}[[REG_W:[0-9]+]]:[[REG_Z:[0-9]+]]{{\]}}, v{{[0-9]+}} offset0:3 offset1:2{{$}}
+; CI-DAG: ds_read2_b32 v{{\[}}[[REG_X:[0-9]+]]:[[REG_Y:[0-9]+]]{{\]}}, v{{[0-9]+}} offset0:1{{$}}
+; CI: buffer_store_dwordx4
+; CI: s_endpgm
+define void @simple_read2_v4f32_superreg_align8(<4 x float> addrspace(1)* %out) #0 {
+ %x.i = tail call i32 @llvm.r600.read.tidig.x() #1
+ %arrayidx0 = getelementptr inbounds [512 x <4 x float>], [512 x <4 x float>] addrspace(3)* @lds.v4, i32 0, i32 %x.i
+ %val0 = load <4 x float>, <4 x float> addrspace(3)* %arrayidx0, align 8
+ %out.gep = getelementptr inbounds <4 x float>, <4 x float> addrspace(1)* %out, i32 %x.i
+ store <4 x float> %val0, <4 x float> addrspace(1)* %out.gep
+ ret void
+}
+
+; CI-LABEL: {{^}}simple_read2_v4f32_superreg:
+; CI-DAG: ds_read2_b32 v{{\[}}[[REG_W:[0-9]+]]:[[REG_Z:[0-9]+]]{{\]}}, v{{[0-9]+}} offset0:3 offset1:2{{$}}
+; CI-DAG: ds_read2_b32 v{{\[}}[[REG_X:[0-9]+]]:[[REG_Y:[0-9]+]]{{\]}}, v{{[0-9]+}} offset0:1{{$}}
+; CI: buffer_store_dwordx4
+; CI: s_endpgm
+define void @simple_read2_v4f32_superreg(<4 x float> addrspace(1)* %out) #0 {
+ %x.i = tail call i32 @llvm.r600.read.tidig.x() #1
+ %arrayidx0 = getelementptr inbounds [512 x <4 x float>], [512 x <4 x float>] addrspace(3)* @lds.v4, i32 0, i32 %x.i
+ %val0 = load <4 x float>, <4 x float> addrspace(3)* %arrayidx0
+ %out.gep = getelementptr inbounds <4 x float>, <4 x float> addrspace(1)* %out, i32 %x.i
+ store <4 x float> %val0, <4 x float> addrspace(1)* %out.gep
+ ret void
+}
+
+; CI-LABEL: {{^}}simple_read2_v8f32_superreg:
+; CI-DAG: ds_read2_b32 v{{\[}}[[REG_ELT7:[0-9]+]]:[[REG_ELT6:[0-9]+]]{{\]}}, v{{[0-9]+}} offset0:7 offset1:6{{$}}
+; CI-DAG: ds_read2_b32 v{{\[}}[[REG_ELT5:[0-9]+]]:[[REG_ELT4:[0-9]+]]{{\]}}, v{{[0-9]+}} offset0:5 offset1:4{{$}}
+; CI-DAG: ds_read2_b32 v{{\[}}[[REG_ELT3:[0-9]+]]:[[REG_ELT2:[0-9]+]]{{\]}}, v{{[0-9]+}} offset0:3 offset1:2{{$}}
+; CI-DAG: ds_read2_b32 v{{\[}}[[REG_ELT1:[0-9]+]]:[[REG_ELT0:[0-9]+]]{{\]}}, v{{[0-9]+}} offset0:1{{$}}
+; CI: buffer_store_dword
+; CI: buffer_store_dword
+; CI: buffer_store_dword
+; CI: buffer_store_dword
+; CI: buffer_store_dword
+; CI: buffer_store_dword
+; CI: buffer_store_dword
+; CI: buffer_store_dword
+; CI: s_endpgm
+define void @simple_read2_v8f32_superreg(<8 x float> addrspace(1)* %out) #0 {
+ %x.i = tail call i32 @llvm.r600.read.tidig.x() #1
+ %arrayidx0 = getelementptr inbounds [512 x <8 x float>], [512 x <8 x float>] addrspace(3)* @lds.v8, i32 0, i32 %x.i
+ %val0 = load <8 x float>, <8 x float> addrspace(3)* %arrayidx0
+ %out.gep = getelementptr inbounds <8 x float>, <8 x float> addrspace(1)* %out, i32 %x.i
+ store <8 x float> %val0, <8 x float> addrspace(1)* %out.gep
+ ret void
+}
+
+; CI-LABEL: {{^}}simple_read2_v16f32_superreg:
+; CI-DAG: ds_read2_b32 v{{\[}}[[REG_ELT7:[0-9]+]]:[[REG_ELT6:[0-9]+]]{{\]}}, v{{[0-9]+}} offset0:15 offset1:14{{$}}
+; CI-DAG: ds_read2_b32 v{{\[}}[[REG_ELT7:[0-9]+]]:[[REG_ELT6:[0-9]+]]{{\]}}, v{{[0-9]+}} offset0:13 offset1:12{{$}}
+; CI-DAG: ds_read2_b32 v{{\[}}[[REG_ELT7:[0-9]+]]:[[REG_ELT6:[0-9]+]]{{\]}}, v{{[0-9]+}} offset0:11 offset1:10{{$}}
+; CI-DAG: ds_read2_b32 v{{\[}}[[REG_ELT7:[0-9]+]]:[[REG_ELT6:[0-9]+]]{{\]}}, v{{[0-9]+}} offset0:9 offset1:8{{$}}
+; CI-DAG: ds_read2_b32 v{{\[}}[[REG_ELT7:[0-9]+]]:[[REG_ELT6:[0-9]+]]{{\]}}, v{{[0-9]+}} offset0:7 offset1:6{{$}}
+; CI-DAG: ds_read2_b32 v{{\[}}[[REG_ELT5:[0-9]+]]:[[REG_ELT4:[0-9]+]]{{\]}}, v{{[0-9]+}} offset0:5 offset1:4{{$}}
+; CI-DAG: ds_read2_b32 v{{\[}}[[REG_ELT3:[0-9]+]]:[[REG_ELT2:[0-9]+]]{{\]}}, v{{[0-9]+}} offset0:3 offset1:2{{$}}
+; CI-DAG: ds_read2_b32 v{{\[}}[[REG_ELT1:[0-9]+]]:[[REG_ELT0:[0-9]+]]{{\]}}, v{{[0-9]+}} offset0:1{{$}}
+
+; CI: s_waitcnt lgkmcnt(0)
+; CI: buffer_store_dword
+; CI: buffer_store_dword
+; CI: buffer_store_dword
+; CI: buffer_store_dword
+; CI: buffer_store_dword
+; CI: buffer_store_dword
+; CI: buffer_store_dword
+; CI: buffer_store_dword
+; CI: buffer_store_dword
+; CI: buffer_store_dword
+; CI: buffer_store_dword
+; CI: buffer_store_dword
+; CI: buffer_store_dword
+; CI: buffer_store_dword
+; CI: buffer_store_dword
+; CI: buffer_store_dword
+; CI: s_endpgm
+define void @simple_read2_v16f32_superreg(<16 x float> addrspace(1)* %out) #0 {
+ %x.i = tail call i32 @llvm.r600.read.tidig.x() #1
+ %arrayidx0 = getelementptr inbounds [512 x <16 x float>], [512 x <16 x float>] addrspace(3)* @lds.v16, i32 0, i32 %x.i
+ %val0 = load <16 x float>, <16 x float> addrspace(3)* %arrayidx0
+ %out.gep = getelementptr inbounds <16 x float>, <16 x float> addrspace(1)* %out, i32 %x.i
+ store <16 x float> %val0, <16 x float> addrspace(1)* %out.gep
+ ret void
+}
+
+; Do scalar loads into the super register we need.
+; CI-LABEL: {{^}}simple_read2_v2f32_superreg_scalar_loads_align4:
+; CI-DAG: ds_read2_b32 v{{\[}}[[REG_ELT0:[0-9]+]]:[[REG_ELT1:[0-9]+]]{{\]}}, v{{[0-9]+}} offset1:1{{$}}
+; CI-NOT: v_mov
+; CI: buffer_store_dwordx2 v{{\[}}[[REG_ELT0]]:[[REG_ELT1]]{{\]}}
+; CI: s_endpgm
+define void @simple_read2_v2f32_superreg_scalar_loads_align4(<2 x float> addrspace(1)* %out) #0 {
+ %x.i = tail call i32 @llvm.r600.read.tidig.x() #1
+ %arrayidx0 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %x.i
+ %arrayidx1 = getelementptr inbounds float, float addrspace(3)* %arrayidx0, i32 1
+
+ %val0 = load float, float addrspace(3)* %arrayidx0
+ %val1 = load float, float addrspace(3)* %arrayidx1
+
+ %vec.0 = insertelement <2 x float> undef, float %val0, i32 0
+ %vec.1 = insertelement <2 x float> %vec.0, float %val1, i32 1
+
+ %out.gep = getelementptr inbounds <2 x float>, <2 x float> addrspace(1)* %out, i32 %x.i
+ store <2 x float> %vec.1, <2 x float> addrspace(1)* %out.gep
+ ret void
+}
+
+; Do scalar loads into the super register we need.
+; CI-LABEL: {{^}}simple_read2_v4f32_superreg_scalar_loads_align4:
+; CI-DAG: ds_read2_b32 v{{\[}}[[REG_ELT0:[0-9]+]]:[[REG_ELT1:[0-9]+]]{{\]}}, v{{[0-9]+}} offset1:1{{$}}
+; CI-DAG: ds_read2_b32 v{{\[}}[[REG_ELT2:[0-9]+]]:[[REG_ELT3:[0-9]+]]{{\]}}, v{{[0-9]+}} offset0:2 offset1:3{{$}}
+; CI-NOT: v_mov
+; CI: buffer_store_dwordx4 v{{\[}}[[REG_ELT0]]:[[REG_ELT3]]{{\]}}
+; CI: s_endpgm
+define void @simple_read2_v4f32_superreg_scalar_loads_align4(<4 x float> addrspace(1)* %out) #0 {
+ %x.i = tail call i32 @llvm.r600.read.tidig.x() #1
+ %arrayidx0 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %x.i
+ %arrayidx1 = getelementptr inbounds float, float addrspace(3)* %arrayidx0, i32 1
+ %arrayidx2 = getelementptr inbounds float, float addrspace(3)* %arrayidx0, i32 2
+ %arrayidx3 = getelementptr inbounds float, float addrspace(3)* %arrayidx0, i32 3
+
+ %val0 = load float, float addrspace(3)* %arrayidx0
+ %val1 = load float, float addrspace(3)* %arrayidx1
+ %val2 = load float, float addrspace(3)* %arrayidx2
+ %val3 = load float, float addrspace(3)* %arrayidx3
+
+ %vec.0 = insertelement <4 x float> undef, float %val0, i32 0
+ %vec.1 = insertelement <4 x float> %vec.0, float %val1, i32 1
+ %vec.2 = insertelement <4 x float> %vec.1, float %val2, i32 2
+ %vec.3 = insertelement <4 x float> %vec.2, float %val3, i32 3
+
+ %out.gep = getelementptr inbounds <4 x float>, <4 x float> addrspace(1)* %out, i32 %x.i
+ store <4 x float> %vec.3, <4 x float> addrspace(1)* %out.gep
+ ret void
+}
+
+; Function Attrs: nounwind readnone
+declare i32 @llvm.r600.read.tgid.x() #1
+
+; Function Attrs: nounwind readnone
+declare i32 @llvm.r600.read.tgid.y() #1
+
+; Function Attrs: nounwind readnone
+declare i32 @llvm.r600.read.tidig.x() #1
+
+; Function Attrs: nounwind readnone
+declare i32 @llvm.r600.read.tidig.y() #1
+
+; Function Attrs: noduplicate nounwind
+declare void @llvm.AMDGPU.barrier.local() #2
+
+attributes #0 = { nounwind "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-realign-stack" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #1 = { nounwind readnone }
+attributes #2 = { noduplicate nounwind }
diff --git a/test/CodeGen/AMDGPU/ds_read2st64.ll b/test/CodeGen/AMDGPU/ds_read2st64.ll
index 54b3b45636d6..e2e441214b4a 100644
--- a/test/CodeGen/AMDGPU/ds_read2st64.ll
+++ b/test/CodeGen/AMDGPU/ds_read2st64.ll
@@ -1,4 +1,4 @@
-; RUN: llc -march=amdgcn -mcpu=bonaire -verify-machineinstrs -mattr=+load-store-opt -enable-misched < %s | FileCheck -check-prefix=SI %s
+; RUN: llc -march=amdgcn -mcpu=bonaire -verify-machineinstrs -mattr=+load-store-opt < %s | FileCheck -check-prefix=SI %s
@lds = addrspace(3) global [512 x float] undef, align 4
@lds.f64 = addrspace(3) global [512 x double] undef, align 8
diff --git a/test/CodeGen/AMDGPU/ds_write2.ll b/test/CodeGen/AMDGPU/ds_write2.ll
index b553d3459e40..d4973e377b59 100644
--- a/test/CodeGen/AMDGPU/ds_write2.ll
+++ b/test/CodeGen/AMDGPU/ds_write2.ll
@@ -1,4 +1,4 @@
-; RUN: llc -march=amdgcn -mcpu=bonaire -verify-machineinstrs -mattr=+load-store-opt -enable-misched < %s | FileCheck -strict-whitespace -check-prefix=SI %s
+; RUN: llc -march=amdgcn -mcpu=bonaire -verify-machineinstrs -mattr=+load-store-opt < %s | FileCheck -strict-whitespace -check-prefix=SI %s
@lds = addrspace(3) global [512 x float] undef, align 4
@lds.f64 = addrspace(3) global [512 x double] undef, align 8
@@ -25,7 +25,7 @@ define void @simple_write2_one_val_f32(float addrspace(1)* %C, float addrspace(1
; SI-DAG: buffer_load_dword [[VAL0:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
; SI-DAG: buffer_load_dword [[VAL1:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4
; SI-DAG: v_lshlrev_b32_e32 [[VPTR:v[0-9]+]], 2, v{{[0-9]+}}
-; SI: ds_write2_b32 [[VPTR]], [[VAL0]], [[VAL1]] offset1:8
+; SI: ds_write2_b32 [[VPTR]], [[VAL0]], [[VAL1]] offset1:8
; SI: s_endpgm
define void @simple_write2_two_val_f32(float addrspace(1)* %C, float addrspace(1)* %in) #0 {
%x.i = tail call i32 @llvm.r600.read.tidig.x() #1
@@ -405,6 +405,19 @@ define void @write2_sgemm_sequence(float addrspace(1)* %C, i32 %lda, i32 %ldb, f
ret void
}
+; CI-LABEL: {{^}}simple_write2_v4f32_superreg_align4:
+; CI: ds_write2_b32 {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}} offset0:3 offset1:2{{$}}
+; CI: ds_write2_b32 {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}} offset0:1{{$}}
+; CI: s_endpgm
+define void @simple_write2_v4f32_superreg_align4(<4 x float> addrspace(3)* %out, <4 x float> addrspace(1)* %in) #0 {
+ %x.i = tail call i32 @llvm.r600.read.tidig.x() #1
+ %in.gep = getelementptr inbounds <4 x float>, <4 x float> addrspace(1)* %in
+ %val0 = load <4 x float>, <4 x float> addrspace(1)* %in.gep, align 4
+ %out.gep = getelementptr inbounds <4 x float>, <4 x float> addrspace(3)* %out, i32 %x.i
+ store <4 x float> %val0, <4 x float> addrspace(3)* %out.gep, align 4
+ ret void
+}
+
; Function Attrs: nounwind readnone
declare i32 @llvm.r600.read.tgid.x() #1
diff --git a/test/CodeGen/AMDGPU/ds_write2st64.ll b/test/CodeGen/AMDGPU/ds_write2st64.ll
index 1d9d881c5c7e..358aa6a9e363 100644
--- a/test/CodeGen/AMDGPU/ds_write2st64.ll
+++ b/test/CodeGen/AMDGPU/ds_write2st64.ll
@@ -1,9 +1,7 @@
-; RUN: llc -march=amdgcn -mcpu=bonaire -verify-machineinstrs -mattr=+load-store-opt -enable-misched < %s | FileCheck -check-prefix=SI %s
-
+; RUN: llc -march=amdgcn -mcpu=bonaire -verify-machineinstrs -mattr=+load-store-opt < %s | FileCheck -check-prefix=SI %s
@lds = addrspace(3) global [512 x float] undef, align 4
-
; SI-LABEL: @simple_write2st64_one_val_f32_0_1
; SI-DAG: buffer_load_dword [[VAL:v[0-9]+]]
; SI-DAG: v_lshlrev_b32_e32 [[VPTR:v[0-9]+]], 2, v{{[0-9]+}}
diff --git a/test/CodeGen/AMDGPU/fmuladd.ll b/test/CodeGen/AMDGPU/fmuladd.ll
index ae84d841021d..600f0cb83578 100644
--- a/test/CodeGen/AMDGPU/fmuladd.ll
+++ b/test/CodeGen/AMDGPU/fmuladd.ll
@@ -6,7 +6,7 @@ declare i32 @llvm.r600.read.tidig.x() nounwind readnone
declare float @llvm.fabs.f32(float) nounwind readnone
; CHECK-LABEL: {{^}}fmuladd_f32:
-; CHECK: v_mad_f32 {{v[0-9]+, v[0-9]+, v[0-9]+, v[0-9]+}}
+; CHECK: v_mac_f32_e32 {{v[0-9]+, v[0-9]+, v[0-9]+}}
define void @fmuladd_f32(float addrspace(1)* %out, float addrspace(1)* %in1,
float addrspace(1)* %in2, float addrspace(1)* %in3) {
@@ -34,8 +34,8 @@ define void @fmuladd_f64(double addrspace(1)* %out, double addrspace(1)* %in1,
; CHECK-LABEL: {{^}}fmuladd_2.0_a_b_f32
; CHECK-DAG: buffer_load_dword [[R1:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
; CHECK-DAG: buffer_load_dword [[R2:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4
-; CHECK: v_mad_f32 [[RESULT:v[0-9]+]], 2.0, [[R1]], [[R2]]
-; CHECK: buffer_store_dword [[RESULT]]
+; CHECK: v_mac_f32_e32 [[R2]], 2.0, [[R1]]
+; CHECK: buffer_store_dword [[R2]]
define void @fmuladd_2.0_a_b_f32(float addrspace(1)* %out, float addrspace(1)* %in) {
%tid = call i32 @llvm.r600.read.tidig.x() nounwind readnone
%gep.0 = getelementptr float, float addrspace(1)* %out, i32 %tid
@@ -53,8 +53,8 @@ define void @fmuladd_2.0_a_b_f32(float addrspace(1)* %out, float addrspace(1)* %
; CHECK-LABEL: {{^}}fmuladd_a_2.0_b_f32
; CHECK-DAG: buffer_load_dword [[R1:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
; CHECK-DAG: buffer_load_dword [[R2:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4
-; CHECK: v_mad_f32 [[RESULT:v[0-9]+]], 2.0, [[R1]], [[R2]]
-; CHECK: buffer_store_dword [[RESULT]]
+; CHECK: v_mac_f32_e32 [[R2]], 2.0, [[R1]]
+; CHECK: buffer_store_dword [[R2]]
define void @fmuladd_a_2.0_b_f32(float addrspace(1)* %out, float addrspace(1)* %in) {
%tid = call i32 @llvm.r600.read.tidig.x() nounwind readnone
%gep.0 = getelementptr float, float addrspace(1)* %out, i32 %tid
@@ -72,8 +72,8 @@ define void @fmuladd_a_2.0_b_f32(float addrspace(1)* %out, float addrspace(1)* %
; CHECK-LABEL: {{^}}fadd_a_a_b_f32:
; CHECK-DAG: buffer_load_dword [[R1:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
; CHECK-DAG: buffer_load_dword [[R2:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4
-; CHECK: v_mad_f32 [[RESULT:v[0-9]+]], 2.0, [[R1]], [[R2]]
-; CHECK: buffer_store_dword [[RESULT]]
+; CHECK: v_mac_f32_e32 [[R2]], 2.0, [[R1]]
+; CHECK: buffer_store_dword [[R2]]
define void @fadd_a_a_b_f32(float addrspace(1)* %out,
float addrspace(1)* %in1,
float addrspace(1)* %in2) {
@@ -94,8 +94,8 @@ define void @fadd_a_a_b_f32(float addrspace(1)* %out,
; CHECK-LABEL: {{^}}fadd_b_a_a_f32:
; CHECK-DAG: buffer_load_dword [[R1:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
; CHECK-DAG: buffer_load_dword [[R2:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4
-; CHECK: v_mad_f32 [[RESULT:v[0-9]+]], 2.0, [[R1]], [[R2]]
-; CHECK: buffer_store_dword [[RESULT]]
+; CHECK: v_mac_f32_e32 [[R2]], 2.0, [[R1]]
+; CHECK: buffer_store_dword [[R2]]
define void @fadd_b_a_a_f32(float addrspace(1)* %out,
float addrspace(1)* %in1,
float addrspace(1)* %in2) {
@@ -116,8 +116,8 @@ define void @fadd_b_a_a_f32(float addrspace(1)* %out,
; CHECK-LABEL: {{^}}fmuladd_neg_2.0_a_b_f32
; CHECK-DAG: buffer_load_dword [[R1:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
; CHECK-DAG: buffer_load_dword [[R2:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4
-; CHECK: v_mad_f32 [[RESULT:v[0-9]+]], -2.0, [[R1]], [[R2]]
-; CHECK: buffer_store_dword [[RESULT]]
+; CHECK: v_mac_f32_e32 [[R2]], -2.0, [[R1]]
+; CHECK: buffer_store_dword [[R2]]
define void @fmuladd_neg_2.0_a_b_f32(float addrspace(1)* %out, float addrspace(1)* %in) {
%tid = call i32 @llvm.r600.read.tidig.x() nounwind readnone
%gep.0 = getelementptr float, float addrspace(1)* %out, i32 %tid
@@ -136,8 +136,8 @@ define void @fmuladd_neg_2.0_a_b_f32(float addrspace(1)* %out, float addrspace(1
; CHECK-LABEL: {{^}}fmuladd_neg_2.0_neg_a_b_f32
; CHECK-DAG: buffer_load_dword [[R1:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
; CHECK-DAG: buffer_load_dword [[R2:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4
-; CHECK: v_mad_f32 [[RESULT:v[0-9]+]], 2.0, [[R1]], [[R2]]
-; CHECK: buffer_store_dword [[RESULT]]
+; CHECK: v_mac_f32_e32 [[R2]], 2.0, [[R1]]
+; CHECK: buffer_store_dword [[R2]]
define void @fmuladd_neg_2.0_neg_a_b_f32(float addrspace(1)* %out, float addrspace(1)* %in) {
%tid = call i32 @llvm.r600.read.tidig.x() nounwind readnone
%gep.0 = getelementptr float, float addrspace(1)* %out, i32 %tid
@@ -158,8 +158,8 @@ define void @fmuladd_neg_2.0_neg_a_b_f32(float addrspace(1)* %out, float addrspa
; CHECK-LABEL: {{^}}fmuladd_2.0_neg_a_b_f32
; CHECK-DAG: buffer_load_dword [[R1:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
; CHECK-DAG: buffer_load_dword [[R2:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4
-; CHECK: v_mad_f32 [[RESULT:v[0-9]+]], -2.0, [[R1]], [[R2]]
-; CHECK: buffer_store_dword [[RESULT]]
+; CHECK: v_mac_f32_e32 [[R2]], -2.0, [[R1]]
+; CHECK: buffer_store_dword [[R2]]
define void @fmuladd_2.0_neg_a_b_f32(float addrspace(1)* %out, float addrspace(1)* %in) {
%tid = call i32 @llvm.r600.read.tidig.x() nounwind readnone
%gep.0 = getelementptr float, float addrspace(1)* %out, i32 %tid
diff --git a/test/CodeGen/AMDGPU/invariant-load-no-alias-store.ll b/test/CodeGen/AMDGPU/invariant-load-no-alias-store.ll
new file mode 100644
index 000000000000..2a01a621fc42
--- /dev/null
+++ b/test/CodeGen/AMDGPU/invariant-load-no-alias-store.ll
@@ -0,0 +1,35 @@
+; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
+
+; GatherAllAliases gives up on trying to analyze cases where the
+; pointer may have been loaded from an aliased store, so make sure
+; that this works and allows moving the stores to a better chain to
+; allow them to be merged when it's clear the pointer is loaded
+; from constant/invariant memory.
+
+; GCN-LABEL: {{^}}test_merge_store_constant_i16_invariant_global_pointer_load:
+; GCN: buffer_load_dwordx2 [[PTR:v\[[0-9]+:[0-9]+\]]],
+; GCN: v_mov_b32_e32 [[K:v[0-9]+]], 0x1c8007b
+; GCN: buffer_store_dword [[K]], [[PTR]]
+define void @test_merge_store_constant_i16_invariant_global_pointer_load(i16 addrspace(1)* addrspace(1)* dereferenceable(4096) nonnull %in) #0 {
+ %ptr = load i16 addrspace(1)*, i16 addrspace(1)* addrspace(1)* %in, !invariant.load !0
+ %ptr.1 = getelementptr i16, i16 addrspace(1)* %ptr, i64 1
+ store i16 123, i16 addrspace(1)* %ptr, align 4
+ store i16 456, i16 addrspace(1)* %ptr.1
+ ret void
+}
+
+; GCN-LABEL: {{^}}test_merge_store_constant_i16_invariant_constant_pointer_load:
+; GCN: s_load_dwordx2 s{{\[}}[[SPTR_LO:[0-9]+]]:[[SPTR_HI:[0-9]+]]{{\]}}
+; GCN: v_mov_b32_e32 [[K:v[0-9]+]], 0x1c8007b
+; GCN: buffer_store_dword [[K]], s{{\[}}[[SPTR_LO]]:
+define void @test_merge_store_constant_i16_invariant_constant_pointer_load(i16 addrspace(1)* addrspace(2)* dereferenceable(4096) nonnull %in) #0 {
+ %ptr = load i16 addrspace(1)*, i16 addrspace(1)* addrspace(2)* %in, !invariant.load !0
+ %ptr.1 = getelementptr i16, i16 addrspace(1)* %ptr, i64 1
+ store i16 123, i16 addrspace(1)* %ptr, align 4
+ store i16 456, i16 addrspace(1)* %ptr.1
+ ret void
+}
+
+!0 = !{}
+
+attributes #0 = { nounwind } \ No newline at end of file
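The 0x1c8007b immediate matched by the GCN CHECK lines is simply the two merged i16 constants: storing 123 at offset 0 and 456 at offset 2 is, on a little-endian target like amdgcn, equivalent to one 32-bit store of (456 << 16) | 123. A small stand-alone C check of that arithmetic (not part of the test):

/* Sanity check for the merged constant matched above: i16 123 at offset 0
 * and i16 456 at offset 2 combine, little-endian, into one dword store. */
#include <assert.h>
#include <stdint.h>

int main(void) {
  uint32_t merged = ((uint32_t)456 << 16) | 123u;
  assert(merged == 0x1c8007bu);   /* the immediate in the CHECK line */
  return 0;
}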
diff --git a/test/CodeGen/AMDGPU/llvm.amdgpu.lrp.ll b/test/CodeGen/AMDGPU/llvm.amdgpu.lrp.ll
index 4e4c2ec7791a..a64dd0ebd2dd 100644
--- a/test/CodeGen/AMDGPU/llvm.amdgpu.lrp.ll
+++ b/test/CodeGen/AMDGPU/llvm.amdgpu.lrp.ll
@@ -5,7 +5,7 @@ declare float @llvm.AMDGPU.lrp(float, float, float) nounwind readnone
; FUNC-LABEL: {{^}}test_lrp:
; SI: v_sub_f32
-; SI: v_mad_f32
+; SI: v_mac_f32_e32
define void @test_lrp(float addrspace(1)* %out, float %src0, float %src1, float %src2) nounwind {
%mad = call float @llvm.AMDGPU.lrp(float %src0, float %src1, float %src2) nounwind readnone
store float %mad, float addrspace(1)* %out, align 4
diff --git a/test/CodeGen/AMDGPU/llvm.round.ll b/test/CodeGen/AMDGPU/llvm.round.ll
index f5f124d915a5..d0e49243ffa7 100644
--- a/test/CodeGen/AMDGPU/llvm.round.ll
+++ b/test/CodeGen/AMDGPU/llvm.round.ll
@@ -9,8 +9,8 @@
; SI: v_sub_f32_e32 [[SUB:v[0-9]+]], [[SX]], [[TRUNC]]
; SI: v_mov_b32_e32 [[VX:v[0-9]+]], [[SX]]
; SI: v_bfi_b32 [[COPYSIGN:v[0-9]+]], [[K]], 1.0, [[VX]]
-; SI: v_cmp_le_f32_e64 [[CMP:s\[[0-9]+:[0-9]+\]]], 0.5, |[[SUB]]|
-; SI: v_cndmask_b32_e64 [[SEL:v[0-9]+]], 0, [[VX]], [[CMP]]
+; SI: v_cmp_le_f32_e64 vcc, 0.5, |[[SUB]]|
+; SI: v_cndmask_b32_e32 [[SEL:v[0-9]+]], 0, [[VX]]
; SI: v_add_f32_e32 [[RESULT:v[0-9]+]], [[SEL]], [[TRUNC]]
; SI: buffer_store_dword [[RESULT]]
diff --git a/test/CodeGen/AMDGPU/mad-combine.ll b/test/CodeGen/AMDGPU/mad-combine.ll
index bc071628ead0..c98f851f2b93 100644
--- a/test/CodeGen/AMDGPU/mad-combine.ll
+++ b/test/CodeGen/AMDGPU/mad-combine.ll
@@ -19,7 +19,7 @@ declare float @llvm.fmuladd.f32(float, float, float) #0
; SI-DAG: buffer_load_dword [[B:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4{{$}}
; SI-DAG: buffer_load_dword [[C:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8{{$}}
-; SI-STD: v_mad_f32 [[RESULT:v[0-9]+]], [[A]], [[B]], [[C]]
+; SI-STD: v_mac_f32_e32 [[C]], [[B]], [[A]]
; SI-DENORM: v_fma_f32 [[RESULT:v[0-9]+]], [[A]], [[B]], [[C]]
@@ -29,7 +29,8 @@ declare float @llvm.fmuladd.f32(float, float, float) #0
; SI-DENORM-SLOWFMAF: v_mul_f32_e32 [[TMP:v[0-9]+]], [[B]], [[A]]
; SI-DENORM-SLOWFMAF: v_add_f32_e32 [[RESULT:v[0-9]+]], [[C]], [[TMP]]
-; SI: buffer_store_dword [[RESULT]]
+; SI-DENORM: buffer_store_dword [[RESULT]]
+; SI-STD: buffer_store_dword [[C]]
define void @combine_to_mad_f32_0(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in) #1 {
%tid = tail call i32 @llvm.r600.read.tidig.x() #0
%gep.0 = getelementptr float, float addrspace(1)* %in, i32 %tid
@@ -54,8 +55,8 @@ define void @combine_to_mad_f32_0(float addrspace(1)* noalias %out, float addrsp
; SI-DAG: buffer_load_dword [[C:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8{{$}}
; SI-DAG: buffer_load_dword [[D:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:12{{$}}
-; SI-STD-DAG: v_mad_f32 [[RESULT0:v[0-9]+]], [[A]], [[B]], [[C]]
-; SI-STD-DAG: v_mad_f32 [[RESULT1:v[0-9]+]], [[A]], [[B]], [[D]]
+; SI-STD-DAG: v_mac_f32_e32 [[C]], [[B]], [[A]]
+; SI-STD-DAG: v_mac_f32_e32 [[D]], [[B]], [[A]]
; SI-DENORM-DAG: v_fma_f32 [[RESULT0:v[0-9]+]], [[A]], [[B]], [[C]]
; SI-DENORM-DAG: v_fma_f32 [[RESULT1:v[0-9]+]], [[A]], [[B]], [[D]]
@@ -64,8 +65,10 @@ define void @combine_to_mad_f32_0(float addrspace(1)* noalias %out, float addrsp
; SI-DENORM-SLOWFMAF-DAG: v_add_f32_e32 [[RESULT0:v[0-9]+]], [[C]], [[TMP]]
; SI-DENORM-SLOWFMAF-DAG: v_add_f32_e32 [[RESULT1:v[0-9]+]], [[D]], [[TMP]]
-; SI-DAG: buffer_store_dword [[RESULT0]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
-; SI-DAG: buffer_store_dword [[RESULT1]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4{{$}}
+; SI-DENORM-DAG: buffer_store_dword [[RESULT0]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
+; SI-DENORM-DAG: buffer_store_dword [[RESULT1]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4{{$}}
+; SI-STD-DAG: buffer_store_dword [[C]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
+; SI-STD-DAG: buffer_store_dword [[D]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4{{$}}
; SI: s_endpgm
define void @combine_to_mad_f32_0_2use(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in) #1 {
%tid = tail call i32 @llvm.r600.read.tidig.x() #0
@@ -96,13 +99,14 @@ define void @combine_to_mad_f32_0_2use(float addrspace(1)* noalias %out, float a
; SI-DAG: buffer_load_dword [[B:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4{{$}}
; SI-DAG: buffer_load_dword [[C:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8{{$}}
-; SI-STD: v_mad_f32 [[RESULT:v[0-9]+]], [[A]], [[B]], [[C]]
+; SI-STD: v_mac_f32_e32 [[C]], [[B]], [[A]]
; SI-DENORM: v_fma_f32 [[RESULT:v[0-9]+]], [[A]], [[B]], [[C]]
; SI-DENORM-SLOWFMAF: v_mul_f32_e32 [[TMP:v[0-9]+]], [[B]], [[A]]
; SI-DENORM-SLOWFMAF: v_add_f32_e32 [[RESULT:v[0-9]+]], [[TMP]], [[C]]
-; SI: buffer_store_dword [[RESULT]]
+; SI-DENORM: buffer_store_dword [[RESULT]]
+; SI-STD: buffer_store_dword [[C]]
define void @combine_to_mad_f32_1(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in) #1 {
%tid = tail call i32 @llvm.r600.read.tidig.x() #0
%gep.0 = getelementptr float, float addrspace(1)* %in, i32 %tid
@@ -482,7 +486,7 @@ define void @aggressive_combine_to_mad_fsub_1_f32(float addrspace(1)* noalias %o
; SI-DAG: buffer_load_dword [[E:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:16{{$}}
; SI-STD: v_mad_f32 [[TMP:v[0-9]+]], [[D]], [[E]], -[[C]]
-; SI-STD: v_mad_f32 [[RESULT:v[0-9]+]], [[A]], [[B]], [[TMP]]
+; SI-STD: v_mac_f32_e32 [[TMP]], [[B]], [[A]]
; SI-DENORM: v_fma_f32 [[TMP:v[0-9]+]], [[D]], [[E]], -[[C]]
; SI-DENORM: v_fma_f32 [[RESULT:v[0-9]+]], [[A]], [[B]], [[TMP]]
@@ -492,7 +496,8 @@ define void @aggressive_combine_to_mad_fsub_1_f32(float addrspace(1)* noalias %o
; SI-DENORM-SLOWFMAF: v_add_f32_e32 [[TMP2:v[0-9]+]], [[TMP0]], [[TMP1]]
; SI-DENORM-SLOWFMAF: v_subrev_f32_e32 [[RESULT:v[0-9]+]], [[C]], [[TMP2]]
-; SI: buffer_store_dword [[RESULT]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
+; SI-DENORM: buffer_store_dword [[RESULT]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
+; SI-STD: buffer_store_dword [[TMP]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
; SI: s_endpgm
define void @aggressive_combine_to_mad_fsub_2_f32(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in) #1 {
%tid = tail call i32 @llvm.r600.read.tidig.x() #0
diff --git a/test/CodeGen/AMDGPU/mad-sub.ll b/test/CodeGen/AMDGPU/mad-sub.ll
index aa4194ff6106..24ff23a4cfc1 100644
--- a/test/CodeGen/AMDGPU/mad-sub.ll
+++ b/test/CodeGen/AMDGPU/mad-sub.ll
@@ -123,7 +123,7 @@ define void @mad_sub_fabs_inv_f32(float addrspace(1)* noalias nocapture %out, fl
}
; FUNC-LABEL: {{^}}neg_neg_mad_f32:
-; SI: v_mad_f32 {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}}
+; SI: v_mac_f32_e32 {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}}
define void @neg_neg_mad_f32(float addrspace(1)* noalias nocapture %out, float addrspace(1)* noalias nocapture readonly %ptr) #1 {
%tid = tail call i32 @llvm.r600.read.tidig.x() #0
%tid.ext = sext i32 %tid to i64
@@ -172,8 +172,8 @@ define void @mad_fabs_sub_f32(float addrspace(1)* noalias nocapture %out, float
; FUNC-LABEL: {{^}}fsub_c_fadd_a_a:
; SI-DAG: buffer_load_dword [[R1:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
; SI-DAG: buffer_load_dword [[R2:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4
-; SI: v_mad_f32 [[RESULT:v[0-9]+]], -2.0, [[R1]], [[R2]]
-; SI: buffer_store_dword [[RESULT]]
+; SI: v_mac_f32_e32 [[R2]], -2.0, [[R1]]
+; SI: buffer_store_dword [[R2]]
define void @fsub_c_fadd_a_a(float addrspace(1)* %out, float addrspace(1)* %in) {
%tid = call i32 @llvm.r600.read.tidig.x() nounwind readnone
%gep.0 = getelementptr float, float addrspace(1)* %out, i32 %tid
diff --git a/test/CodeGen/AMDGPU/madak.ll b/test/CodeGen/AMDGPU/madak.ll
index 933bb016d2c9..2e90cf10a3b5 100644
--- a/test/CodeGen/AMDGPU/madak.ll
+++ b/test/CodeGen/AMDGPU/madak.ll
@@ -9,7 +9,7 @@ declare float @llvm.fabs.f32(float) nounwind readnone
; GCN-LABEL: {{^}}madak_f32:
; GCN: buffer_load_dword [[VA:v[0-9]+]]
; GCN: buffer_load_dword [[VB:v[0-9]+]]
-; GCN: v_madak_f32_e32 {{v[0-9]+}}, [[VB]], [[VA]], 0x41200000
+; GCN: v_madak_f32_e32 {{v[0-9]+}}, [[VA]], [[VB]], 0x41200000
define void @madak_f32(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in.a, float addrspace(1)* noalias %in.b) nounwind {
%tid = tail call i32 @llvm.r600.read.tidig.x() nounwind readnone
%in.a.gep = getelementptr float, float addrspace(1)* %in.a, i32 %tid
@@ -34,8 +34,8 @@ define void @madak_f32(float addrspace(1)* noalias %out, float addrspace(1)* noa
; GCN-DAG: buffer_load_dword [[VB:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4
; GCN-DAG: buffer_load_dword [[VC:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8
; GCN-DAG: v_mov_b32_e32 [[VK:v[0-9]+]], 0x41200000
-; GCN-DAG: v_mad_f32 {{v[0-9]+}}, [[VA]], [[VB]], [[VK]]
-; GCN-DAG: v_mad_f32 {{v[0-9]+}}, [[VA]], [[VC]], [[VK]]
+; GCN-DAG: v_mad_f32 {{v[0-9]+}}, [[VB]], [[VA]], [[VK]]
+; GCN-DAG: v_mac_f32_e32 [[VK]], [[VC]], [[VA]]
; GCN: s_endpgm
define void @madak_2_use_f32(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in) nounwind {
%tid = tail call i32 @llvm.r600.read.tidig.x() nounwind readnone
@@ -105,7 +105,7 @@ define void @madak_inline_imm_f32(float addrspace(1)* noalias %out, float addrsp
; GCN-DAG: v_mov_b32_e32 [[VK:v[0-9]+]], 0x41200000
; GCN-DAG: buffer_load_dword [[VA:v[0-9]+]]
; GCN-NOT: v_madak_f32
-; GCN: v_mad_f32 {{v[0-9]+}}, [[SB]], [[VA]], [[VK]]
+; GCN: v_mac_f32_e32 [[VK]], [[SB]], [[VA]]
define void @s_v_madak_f32(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in.a, float %b) nounwind {
%tid = tail call i32 @llvm.r600.read.tidig.x() nounwind readnone
%in.a.gep = getelementptr float, float addrspace(1)* %in.a, i32 %tid
@@ -124,7 +124,7 @@ define void @s_v_madak_f32(float addrspace(1)* noalias %out, float addrspace(1)*
; GCN-DAG: v_mov_b32_e32 [[VK:v[0-9]+]], 0x41200000
; GCN-DAG: buffer_load_dword [[VA:v[0-9]+]]
; GCN-NOT: v_madak_f32
-; GCN: v_mad_f32 {{v[0-9]+}}, [[VA]], [[SB]], [[VK]]
+; GCN: v_mac_f32_e32 [[VK]], [[SB]], [[VA]]
define void @v_s_madak_f32(float addrspace(1)* noalias %out, float %a, float addrspace(1)* noalias %in.b) nounwind {
%tid = tail call i32 @llvm.r600.read.tidig.x() nounwind readnone
%in.b.gep = getelementptr float, float addrspace(1)* %in.b, i32 %tid
@@ -140,7 +140,7 @@ define void @v_s_madak_f32(float addrspace(1)* noalias %out, float %a, float add
; GCN-LABEL: {{^}}s_s_madak_f32:
; GCN-NOT: v_madak_f32
-; GCN: v_mad_f32 {{v[0-9]+}}, {{s[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}}
+; GCN: v_mac_f32_e32 {{v[0-9]+}}, {{s[0-9]+}}, {{v[0-9]+}}
define void @s_s_madak_f32(float addrspace(1)* %out, float %a, float %b) nounwind {
%mul = fmul float %a, %b
%madak = fadd float %mul, 10.0
diff --git a/test/CodeGen/AMDGPU/madmk.ll b/test/CodeGen/AMDGPU/madmk.ll
index ba7bb221a99a..f8e14e34af67 100644
--- a/test/CodeGen/AMDGPU/madmk.ll
+++ b/test/CodeGen/AMDGPU/madmk.ll
@@ -28,8 +28,8 @@ define void @madmk_f32(float addrspace(1)* noalias %out, float addrspace(1)* noa
; GCN-DAG: buffer_load_dword [[VB:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4
; GCN-DAG: buffer_load_dword [[VC:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8
; GCN-DAG: v_mov_b32_e32 [[VK:v[0-9]+]], 0x41200000
-; GCN-DAG: v_mad_f32 {{v[0-9]+}}, [[VA]], [[VK]], [[VB]]
-; GCN-DAG: v_mad_f32 {{v[0-9]+}}, [[VA]], [[VK]], [[VC]]
+; GCN-DAG: v_mac_f32_e32 [[VB]], [[VK]], [[VA]]
+; GCN-DAG: v_mac_f32_e32 [[VC]], [[VK]], [[VA]]
; GCN: s_endpgm
define void @madmk_2_use_f32(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in) nounwind {
%tid = tail call i32 @llvm.r600.read.tidig.x() nounwind readnone
@@ -59,7 +59,7 @@ define void @madmk_2_use_f32(float addrspace(1)* noalias %out, float addrspace(1
; GCN-LABEL: {{^}}madmk_inline_imm_f32:
; GCN-DAG: buffer_load_dword [[VA:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
; GCN-DAG: buffer_load_dword [[VB:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4
-; GCN: v_mad_f32 {{v[0-9]+}}, 4.0, [[VA]], [[VB]]
+; GCN: v_mac_f32_e32 [[VB]], 4.0, [[VA]]
define void @madmk_inline_imm_f32(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in) nounwind {
%tid = tail call i32 @llvm.r600.read.tidig.x() nounwind readnone
%gep.0 = getelementptr float, float addrspace(1)* %in, i32 %tid
@@ -77,7 +77,7 @@ define void @madmk_inline_imm_f32(float addrspace(1)* noalias %out, float addrsp
; GCN-LABEL: {{^}}s_s_madmk_f32:
; GCN-NOT: v_madmk_f32
-; GCN: v_mad_f32
+; GCN: v_mac_f32_e32
; GCN: s_endpgm
define void @s_s_madmk_f32(float addrspace(1)* noalias %out, float %a, float %b) nounwind {
%tid = tail call i32 @llvm.r600.read.tidig.x() nounwind readnone
@@ -107,7 +107,7 @@ define void @v_s_madmk_f32(float addrspace(1)* noalias %out, float addrspace(1)*
; GCN-LABEL: {{^}}scalar_vector_madmk_f32:
; GCN-NOT: v_madmk_f32
-; GCN: v_mad_f32
+; GCN: v_mac_f32_e32
; GCN: s_endpgm
define void @scalar_vector_madmk_f32(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in, float %a) nounwind {
%tid = tail call i32 @llvm.r600.read.tidig.x() nounwind readnone
diff --git a/test/CodeGen/AMDGPU/mul_uint24.ll b/test/CodeGen/AMDGPU/mul_uint24.ll
index e640a7cd69f6..8a0e71d739be 100644
--- a/test/CodeGen/AMDGPU/mul_uint24.ll
+++ b/test/CodeGen/AMDGPU/mul_uint24.ll
@@ -52,16 +52,18 @@ entry:
; FUNC-LABEL: {{^}}mul24_i64:
; EG: MUL_UINT24
; EG: MULHI
-; SI: v_mul_u32_u24
; FIXME: SI support 24-bit mulhi
-; SI: v_mul_hi_u32
-define void @mul24_i64(i64 addrspace(1)* %out, i64 %a, i64 %b) {
+
+; SI-DAG: v_mul_u32_u24
+; SI-DAG: v_mul_hi_u32
+; SI: s_endpgm
+define void @mul24_i64(i64 addrspace(1)* %out, i64 %a, i64 %b, i64 %c) {
entry:
- %0 = shl i64 %a, 40
- %a_24 = lshr i64 %0, 40
- %1 = shl i64 %b, 40
- %b_24 = lshr i64 %1, 40
- %2 = mul i64 %a_24, %b_24
- store i64 %2, i64 addrspace(1)* %out
+ %tmp0 = shl i64 %a, 40
+ %a_24 = lshr i64 %tmp0, 40
+ %tmp1 = shl i64 %b, 40
+ %b_24 = lshr i64 %tmp1, 40
+ %tmp2 = mul i64 %a_24, %b_24
+ store i64 %tmp2, i64 addrspace(1)* %out
ret void
}
diff --git a/test/CodeGen/AMDGPU/select-vectors.ll b/test/CodeGen/AMDGPU/select-vectors.ll
index 59082c65cc8a..94758ad84c18 100644
--- a/test/CodeGen/AMDGPU/select-vectors.ll
+++ b/test/CodeGen/AMDGPU/select-vectors.ll
@@ -6,10 +6,10 @@
; FUNC-LABEL: {{^}}select_v4i8:
-; SI: v_cndmask_b32_e64
-; SI: v_cndmask_b32_e64
-; SI: v_cndmask_b32_e64
-; SI: v_cndmask_b32_e64
+; SI: v_cndmask_b32_e32
+; SI: v_cndmask_b32_e32
+; SI: v_cndmask_b32_e32
+; SI: v_cndmask_b32_e32
define void @select_v4i8(<4 x i8> addrspace(1)* %out, <4 x i8> %a, <4 x i8> %b, i8 %c) nounwind {
%cmp = icmp eq i8 %c, 0
%select = select i1 %cmp, <4 x i8> %a, <4 x i8> %b
@@ -18,10 +18,10 @@ define void @select_v4i8(<4 x i8> addrspace(1)* %out, <4 x i8> %a, <4 x i8> %b,
}
; FUNC-LABEL: {{^}}select_v4i16:
-; SI: v_cndmask_b32_e64
-; SI: v_cndmask_b32_e64
-; SI: v_cndmask_b32_e64
-; SI: v_cndmask_b32_e64
+; SI: v_cndmask_b32_e32
+; SI: v_cndmask_b32_e32
+; SI: v_cndmask_b32_e32
+; SI: v_cndmask_b32_e32
define void @select_v4i16(<4 x i16> addrspace(1)* %out, <4 x i16> %a, <4 x i16> %b, i32 %c) nounwind {
%cmp = icmp eq i32 %c, 0
%select = select i1 %cmp, <4 x i16> %a, <4 x i16> %b
@@ -30,8 +30,8 @@ define void @select_v4i16(<4 x i16> addrspace(1)* %out, <4 x i16> %a, <4 x i16>
}
; FUNC-LABEL: {{^}}select_v2i32:
-; SI: v_cndmask_b32_e64
-; SI: v_cndmask_b32_e64
+; SI: v_cndmask_b32_e32
+; SI: v_cndmask_b32_e32
; SI: buffer_store_dwordx2
define void @select_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> %a, <2 x i32> %b, i32 %c) nounwind {
%cmp = icmp eq i32 %c, 0
@@ -41,10 +41,10 @@ define void @select_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> %a, <2 x i32>
}
; FUNC-LABEL: {{^}}select_v4i32:
-; SI: v_cndmask_b32_e64
-; SI: v_cndmask_b32_e64
-; SI: v_cndmask_b32_e64
-; SI: v_cndmask_b32_e64
+; SI: v_cndmask_b32_e32
+; SI: v_cndmask_b32_e32
+; SI: v_cndmask_b32_e32
+; SI: v_cndmask_b32_e32
; SI: buffer_store_dwordx4
define void @select_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> %a, <4 x i32> %b, i32 %c) nounwind {
%cmp = icmp eq i32 %c, 0
@@ -54,14 +54,14 @@ define void @select_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> %a, <4 x i32>
}
; FUNC-LABEL: {{^}}select_v8i32:
-; SI: v_cndmask_b32_e64
-; SI: v_cndmask_b32_e64
-; SI: v_cndmask_b32_e64
-; SI: v_cndmask_b32_e64
-; SI: v_cndmask_b32_e64
-; SI: v_cndmask_b32_e64
-; SI: v_cndmask_b32_e64
-; SI: v_cndmask_b32_e64
+; SI: v_cndmask_b32_e32
+; SI: v_cndmask_b32_e32
+; SI: v_cndmask_b32_e32
+; SI: v_cndmask_b32_e32
+; SI: v_cndmask_b32_e32
+; SI: v_cndmask_b32_e32
+; SI: v_cndmask_b32_e32
+; SI: v_cndmask_b32_e32
define void @select_v8i32(<8 x i32> addrspace(1)* %out, <8 x i32> %a, <8 x i32> %b, i32 %c) nounwind {
%cmp = icmp eq i32 %c, 0
%select = select i1 %cmp, <8 x i32> %a, <8 x i32> %b
@@ -88,14 +88,14 @@ define void @select_v4f32(<4 x float> addrspace(1)* %out, <4 x float> %a, <4 x f
}
; FUNC-LABEL: {{^}}select_v8f32:
-; SI: v_cndmask_b32_e64
-; SI: v_cndmask_b32_e64
-; SI: v_cndmask_b32_e64
-; SI: v_cndmask_b32_e64
-; SI: v_cndmask_b32_e64
-; SI: v_cndmask_b32_e64
-; SI: v_cndmask_b32_e64
-; SI: v_cndmask_b32_e64
+; SI: v_cndmask_b32_e32
+; SI: v_cndmask_b32_e32
+; SI: v_cndmask_b32_e32
+; SI: v_cndmask_b32_e32
+; SI: v_cndmask_b32_e32
+; SI: v_cndmask_b32_e32
+; SI: v_cndmask_b32_e32
+; SI: v_cndmask_b32_e32
define void @select_v8f32(<8 x float> addrspace(1)* %out, <8 x float> %a, <8 x float> %b, i32 %c) nounwind {
%cmp = icmp eq i32 %c, 0
%select = select i1 %cmp, <8 x float> %a, <8 x float> %b
@@ -104,10 +104,10 @@ define void @select_v8f32(<8 x float> addrspace(1)* %out, <8 x float> %a, <8 x f
}
; FUNC-LABEL: {{^}}select_v2f64:
-; SI: v_cndmask_b32_e64
-; SI: v_cndmask_b32_e64
-; SI: v_cndmask_b32_e64
-; SI: v_cndmask_b32_e64
+; SI: v_cndmask_b32_e32
+; SI: v_cndmask_b32_e32
+; SI: v_cndmask_b32_e32
+; SI: v_cndmask_b32_e32
define void @select_v2f64(<2 x double> addrspace(1)* %out, <2 x double> %a, <2 x double> %b, i32 %c) nounwind {
%cmp = icmp eq i32 %c, 0
%select = select i1 %cmp, <2 x double> %a, <2 x double> %b
@@ -116,14 +116,14 @@ define void @select_v2f64(<2 x double> addrspace(1)* %out, <2 x double> %a, <2 x
}
; FUNC-LABEL: {{^}}select_v4f64:
-; SI: v_cndmask_b32_e64
-; SI: v_cndmask_b32_e64
-; SI: v_cndmask_b32_e64
-; SI: v_cndmask_b32_e64
-; SI: v_cndmask_b32_e64
-; SI: v_cndmask_b32_e64
-; SI: v_cndmask_b32_e64
-; SI: v_cndmask_b32_e64
+; SI: v_cndmask_b32_e32
+; SI: v_cndmask_b32_e32
+; SI: v_cndmask_b32_e32
+; SI: v_cndmask_b32_e32
+; SI: v_cndmask_b32_e32
+; SI: v_cndmask_b32_e32
+; SI: v_cndmask_b32_e32
+; SI: v_cndmask_b32_e32
define void @select_v4f64(<4 x double> addrspace(1)* %out, <4 x double> %a, <4 x double> %b, i32 %c) nounwind {
%cmp = icmp eq i32 %c, 0
%select = select i1 %cmp, <4 x double> %a, <4 x double> %b
@@ -132,22 +132,22 @@ define void @select_v4f64(<4 x double> addrspace(1)* %out, <4 x double> %a, <4 x
}
; FUNC-LABEL: {{^}}select_v8f64:
-; SI: v_cndmask_b32_e64
-; SI: v_cndmask_b32_e64
-; SI: v_cndmask_b32_e64
-; SI: v_cndmask_b32_e64
-; SI: v_cndmask_b32_e64
-; SI: v_cndmask_b32_e64
-; SI: v_cndmask_b32_e64
-; SI: v_cndmask_b32_e64
-; SI: v_cndmask_b32_e64
-; SI: v_cndmask_b32_e64
-; SI: v_cndmask_b32_e64
-; SI: v_cndmask_b32_e64
-; SI: v_cndmask_b32_e64
-; SI: v_cndmask_b32_e64
-; SI: v_cndmask_b32_e64
-; SI: v_cndmask_b32_e64
+; SI: v_cndmask_b32_e32
+; SI: v_cndmask_b32_e32
+; SI: v_cndmask_b32_e32
+; SI: v_cndmask_b32_e32
+; SI: v_cndmask_b32_e32
+; SI: v_cndmask_b32_e32
+; SI: v_cndmask_b32_e32
+; SI: v_cndmask_b32_e32
+; SI: v_cndmask_b32_e32
+; SI: v_cndmask_b32_e32
+; SI: v_cndmask_b32_e32
+; SI: v_cndmask_b32_e32
+; SI: v_cndmask_b32_e32
+; SI: v_cndmask_b32_e32
+; SI: v_cndmask_b32_e32
+; SI: v_cndmask_b32_e32
define void @select_v8f64(<8 x double> addrspace(1)* %out, <8 x double> %a, <8 x double> %b, i32 %c) nounwind {
%cmp = icmp eq i32 %c, 0
%select = select i1 %cmp, <8 x double> %a, <8 x double> %b
diff --git a/test/CodeGen/AMDGPU/select64.ll b/test/CodeGen/AMDGPU/select64.ll
index 5cebb30dc72e..13fb575b2b15 100644
--- a/test/CodeGen/AMDGPU/select64.ll
+++ b/test/CodeGen/AMDGPU/select64.ll
@@ -55,8 +55,8 @@ define void @v_select_trunc_i64_2(i32 addrspace(1)* %out, i32 %cond, i64 addrspa
; CHECK: s_mov_b32 [[SLO:s[0-9]+]], 0
; CHECK-DAG: v_mov_b32_e32 [[VHI:v[0-9]+]], [[SHI]]
; CHECK-DAG: v_mov_b32_e32 [[VLO:v[0-9]+]], [[SLO]]
-; CHECK-DAG: v_cndmask_b32_e64 {{v[0-9]+}}, [[VLO]], {{v[0-9]+}}
-; CHECK-DAG: v_cndmask_b32_e64 {{v[0-9]+}}, [[VHI]], {{v[0-9]+}}
+; CHECK-DAG: v_cndmask_b32_e32 {{v[0-9]+}}, [[VLO]], {{v[0-9]+}}
+; CHECK-DAG: v_cndmask_b32_e32 {{v[0-9]+}}, [[VHI]], {{v[0-9]+}}
; CHECK: s_endpgm
define void @v_select_i64_split_imm(i64 addrspace(1)* %out, i32 %cond, i64 addrspace(1)* %aptr, i64 addrspace(1)* %bptr) nounwind {
%cmp = icmp ugt i32 %cond, 5
diff --git a/test/CodeGen/AMDGPU/shl.ll b/test/CodeGen/AMDGPU/shl.ll
index 53b63dc4b8ad..6f81a39ed96a 100644
--- a/test/CodeGen/AMDGPU/shl.ll
+++ b/test/CodeGen/AMDGPU/shl.ll
@@ -1,6 +1,9 @@
-;RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck --check-prefix=EG %s
-;RUN: llc < %s -march=amdgcn -mcpu=verde -verify-machineinstrs | FileCheck --check-prefix=SI %s
-;RUN: llc < %s -march=amdgcn -mcpu=tonga -verify-machineinstrs | FileCheck --check-prefix=VI %s
+; RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck --check-prefix=EG %s
+; RUN: llc < %s -march=amdgcn -mcpu=verde -verify-machineinstrs | FileCheck -check-prefix=GCN -check-prefix=SI %s
+; XUN: llc < %s -march=amdgcn -mcpu=tonga -verify-machineinstrs | FileCheck -check-prefix=GCN -check-prefix=VI %s
+
+declare i32 @llvm.r600.read.tidig.x() #0
+
;EG: {{^}}shl_v2i32:
;EG: LSHL {{\*? *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
@@ -178,3 +181,32 @@ define void @shl_v4i64(<4 x i64> addrspace(1)* %out, <4 x i64> addrspace(1)* %in
store <4 x i64> %result, <4 x i64> addrspace(1)* %out
ret void
}
+
+; Make sure load width gets reduced to i32 load.
+; GCN-LABEL: {{^}}s_shl_32_i64:
+; GCN-DAG: s_load_dword [[LO_A:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0xb{{$}}
+; GCN-DAG: s_mov_b32 s[[SLO:[0-9]+]], 0{{$}}
+; GCN-DAG: v_mov_b32_e32 v[[VLO:[0-9]+]], s[[SLO]]
+; GCN-DAG: v_mov_b32_e32 v[[VHI:[0-9]+]], [[LO_A]]
+; GCN: buffer_store_dwordx2 v{{\[}}[[VLO]]:[[VHI]]{{\]}}
+define void @s_shl_32_i64(i64 addrspace(1)* %out, i64 %a) {
+ %result = shl i64 %a, 32
+ store i64 %result, i64 addrspace(1)* %out
+ ret void
+}
+
+; GCN-LABEL: {{^}}v_shl_32_i64:
+; GCN-DAG: buffer_load_dword v[[LO_A:[0-9]+]],
+; GCN-DAG: v_mov_b32_e32 v[[VLO:[0-9]+]], 0{{$}}
+; GCN: buffer_store_dwordx2 v{{\[}}[[VLO]]:[[LO_A]]{{\]}}
+define void @v_shl_32_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %in) {
+ %tid = call i32 @llvm.r600.read.tidig.x() #0
+ %gep.in = getelementptr i64, i64 addrspace(1)* %in, i32 %tid
+ %gep.out = getelementptr i64, i64 addrspace(1)* %out, i32 %tid
+ %a = load i64, i64 addrspace(1)* %gep.in
+ %result = shl i64 %a, 32
+ store i64 %result, i64 addrspace(1)* %gep.out
+ ret void
+}
+
+attributes #0 = { nounwind readnone }
diff --git a/test/CodeGen/AMDGPU/sint_to_fp.f64.ll b/test/CodeGen/AMDGPU/sint_to_fp.f64.ll
index da4e91db3a38..0db7cdc171b5 100644
--- a/test/CodeGen/AMDGPU/sint_to_fp.f64.ll
+++ b/test/CodeGen/AMDGPU/sint_to_fp.f64.ll
@@ -12,11 +12,11 @@ define void @sint_to_fp_i32_to_f64(double addrspace(1)* %out, i32 %in) {
; FIXME: select on 0, 0
; SI-LABEL: {{^}}sint_to_fp_i1_f64:
-; SI: v_cmp_eq_i32_e64 [[CMP:s\[[0-9]+:[0-9]\]]],
+; SI: v_cmp_eq_i32_e64 vcc,
; We can't fold the SGPRs into v_cndmask_b32_e64, because it already
-; uses an SGPR for [[CMP]]
-; SI: v_cndmask_b32_e64 v{{[0-9]+}}, 0, v{{[0-9]+}}, [[CMP]]
-; SI: v_cndmask_b32_e64 v{{[0-9]+}}, 0, 0, [[CMP]]
+; uses an SGPR (implicit vcc).
+; SI: v_cndmask_b32_e32 v{{[0-9]+}}, 0, v{{[0-9]+}}
+; SI: v_cndmask_b32_e64 v{{[0-9]+}}, 0, 0, vcc
; SI: buffer_store_dwordx2
; SI: s_endpgm
define void @sint_to_fp_i1_f64(double addrspace(1)* %out, i32 %in) {
diff --git a/test/CodeGen/AMDGPU/srl.ll b/test/CodeGen/AMDGPU/srl.ll
index 4904d7fa1bd0..0dad91e709d9 100644
--- a/test/CodeGen/AMDGPU/srl.ll
+++ b/test/CodeGen/AMDGPU/srl.ll
@@ -1,7 +1,9 @@
-; RUN: llc -march=amdgcn -mcpu=verde -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
-; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=VI -check-prefix=FUNC %s
+; RUN: llc -march=amdgcn -mcpu=verde -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=GCN -check-prefix=FUNC %s
+; XUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=VI -check-prefix=GCN -check-prefix=FUNC %s
; RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s
+declare i32 @llvm.r600.read.tidig.x() #0
+
; FUNC-LABEL: {{^}}lshr_i32:
; SI: v_lshrrev_b32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}}
; VI: v_lshrrev_b32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}}
@@ -184,3 +186,32 @@ define void @lshr_v4i64(<4 x i64> addrspace(1)* %out, <4 x i64> addrspace(1)* %i
store <4 x i64> %result, <4 x i64> addrspace(1)* %out
ret void
}
+
+; Make sure load width gets reduced to i32 load.
+; GCN-LABEL: {{^}}s_lshr_32_i64:
+; GCN-DAG: s_load_dword [[HI_A:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0xc{{$}}
+; GCN-DAG: s_mov_b32 s[[SHI:[0-9]+]], 0{{$}}
+; GCN-DAG: v_mov_b32_e32 v[[VHI:[0-9]+]], s[[SHI]]
+; GCN-DAG: v_mov_b32_e32 v[[VLO:[0-9]+]], [[HI_A]]
+; GCN: buffer_store_dwordx2 v{{\[}}[[VLO]]:[[VHI]]{{\]}}
+define void @s_lshr_32_i64(i64 addrspace(1)* %out, i64 %a) {
+ %result = lshr i64 %a, 32
+ store i64 %result, i64 addrspace(1)* %out
+ ret void
+}
+
+; GCN-LABEL: {{^}}v_lshr_32_i64:
+; GCN-DAG: buffer_load_dword v[[HI_A:[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4
+; GCN-DAG: v_mov_b32_e32 v[[VHI:[0-9]+]], 0{{$}}
+; GCN: buffer_store_dwordx2 v{{\[}}[[HI_A]]:[[VHI]]{{\]}}
+define void @v_lshr_32_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %in) {
+ %tid = call i32 @llvm.r600.read.tidig.x() #0
+ %gep.in = getelementptr i64, i64 addrspace(1)* %in, i32 %tid
+ %gep.out = getelementptr i64, i64 addrspace(1)* %out, i32 %tid
+ %a = load i64, i64 addrspace(1)* %gep.in
+ %result = lshr i64 %a, 32
+ store i64 %result, i64 addrspace(1)* %gep.out
+ ret void
+}
+
+attributes #0 = { nounwind readnone }
diff --git a/test/CodeGen/AMDGPU/uint_to_fp.f64.ll b/test/CodeGen/AMDGPU/uint_to_fp.f64.ll
index dfec8eb15cb7..6f608df5e9f5 100644
--- a/test/CodeGen/AMDGPU/uint_to_fp.f64.ll
+++ b/test/CodeGen/AMDGPU/uint_to_fp.f64.ll
@@ -72,11 +72,11 @@ define void @s_uint_to_fp_v4i32_to_v4f64(<4 x double> addrspace(1)* %out, <4 x i
; FIXME: select on 0, 0
; SI-LABEL: {{^}}uint_to_fp_i1_to_f64:
-; SI: v_cmp_eq_i32_e64 [[CMP:s\[[0-9]+:[0-9]\]]],
-; We can't fold the SGPRs into v_cndmask_b32_e64, because it already
-; uses an SGPR for [[CMP]]
-; SI: v_cndmask_b32_e64 v{{[0-9]+}}, 0, v{{[0-9]+}}, [[CMP]]
-; SI: v_cndmask_b32_e64 v{{[0-9]+}}, 0, 0, [[CMP]]
+; SI: v_cmp_eq_i32_e64 vcc
+; We can't fold the SGPRs into v_cndmask_b32_e32, because it already
+; uses an SGPR (implicit vcc).
+; SI: v_cndmask_b32_e32 v{{[0-9]+}}, 0, v{{[0-9]+}}
+; SI: v_cndmask_b32_e64 v{{[0-9]+}}, 0, 0, vcc
; SI: buffer_store_dwordx2
; SI: s_endpgm
define void @uint_to_fp_i1_to_f64(double addrspace(1)* %out, i32 %in) {
diff --git a/test/CodeGen/AMDGPU/v_mac.ll b/test/CodeGen/AMDGPU/v_mac.ll
new file mode 100644
index 000000000000..a4eaec3403c9
--- /dev/null
+++ b/test/CodeGen/AMDGPU/v_mac.ll
@@ -0,0 +1,155 @@
+; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=GCN %s
+; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=VI -check-prefix=GCN %s
+
+; GCN-LABEL: {{^}}mac_vvv:
+; GCN: buffer_load_dword [[A:v[0-9]+]], s[{{[0-9]+:[0-9]+}}], 0{{$}}
+; GCN: buffer_load_dword [[B:v[0-9]+]], s[{{[0-9]+:[0-9]+}}], 0 offset:4
+; GCN: buffer_load_dword [[C:v[0-9]+]], s[{{[0-9]+:[0-9]+}}], 0 offset:8
+; GCN: v_mac_f32_e32 [[C]], [[B]], [[A]]
+; GCN: buffer_store_dword [[C]]
+define void @mac_vvv(float addrspace(1)* %out, float addrspace(1)* %in) {
+entry:
+ %b_ptr = getelementptr float, float addrspace(1)* %in, i32 1
+ %c_ptr = getelementptr float, float addrspace(1)* %in, i32 2
+
+ %a = load float, float addrspace(1)* %in
+ %b = load float, float addrspace(1)* %b_ptr
+ %c = load float, float addrspace(1)* %c_ptr
+
+ %tmp0 = fmul float %a, %b
+ %tmp1 = fadd float %tmp0, %c
+ store float %tmp1, float addrspace(1)* %out
+ ret void
+}
+
+; GCN-LABEL: {{^}}mad_inline_sgpr_inline:
+; GCN-NOT: v_mac_f32
+; GCN: v_mad_f32 v{{[0-9]}}, 0.5, s{{[0-9]+}}, 0.5
+define void @mad_inline_sgpr_inline(float addrspace(1)* %out, float %in) {
+entry:
+ %tmp0 = fmul float 0.5, %in
+ %tmp1 = fadd float %tmp0, 0.5
+ store float %tmp1, float addrspace(1)* %out
+ ret void
+}
+
+; GCN-LABEL: {{^}}mad_vvs:
+; GCN-NOT: v_mac_f32
+; GCN: v_mad_f32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, s{{[0-9]+}}
+define void @mad_vvs(float addrspace(1)* %out, float addrspace(1)* %in, float %c) {
+entry:
+ %b_ptr = getelementptr float, float addrspace(1)* %in, i32 1
+
+ %a = load float, float addrspace(1)* %in
+ %b = load float, float addrspace(1)* %b_ptr
+
+ %tmp0 = fmul float %a, %b
+ %tmp1 = fadd float %tmp0, %c
+ store float %tmp1, float addrspace(1)* %out
+ ret void
+}
+
+; GCN-LABEL: {{^}}mac_ssv:
+; GCN: v_mac_f32_e64 v{{[0-9]+}}, s{{[0-9]+}}, s{{[0-9]+}}
+define void @mac_ssv(float addrspace(1)* %out, float addrspace(1)* %in, float %a) {
+entry:
+ %c = load float, float addrspace(1)* %in
+
+ %tmp0 = fmul float %a, %a
+ %tmp1 = fadd float %tmp0, %c
+ store float %tmp1, float addrspace(1)* %out
+ ret void
+}
+
+; GCN-LABEL: {{^}}mac_mad_same_add:
+; GCN: v_mad_f32 v{{[0-9]}}, v{{[0-9]+}}, v{{[0-9]+}}, [[ADD:v[0-9]+]]
+; GCN: v_mac_f32_e32 [[ADD]], v{{[0-9]+}}, v{{[0-9]+}}
+define void @mac_mad_same_add(float addrspace(1)* %out, float addrspace(1)* %in) {
+entry:
+ %b_ptr = getelementptr float, float addrspace(1)* %in, i32 1
+ %c_ptr = getelementptr float, float addrspace(1)* %in, i32 2
+ %d_ptr = getelementptr float, float addrspace(1)* %in, i32 3
+ %e_ptr = getelementptr float, float addrspace(1)* %in, i32 4
+
+ %a = load float, float addrspace(1)* %in
+ %b = load float, float addrspace(1)* %b_ptr
+ %c = load float, float addrspace(1)* %c_ptr
+ %d = load float, float addrspace(1)* %d_ptr
+ %e = load float, float addrspace(1)* %e_ptr
+
+ %tmp0 = fmul float %a, %b
+ %tmp1 = fadd float %tmp0, %c
+
+ %tmp2 = fmul float %d, %e
+ %tmp3 = fadd float %tmp2, %c
+
+ %out1 = getelementptr float, float addrspace(1)* %out, i32 1
+ store float %tmp1, float addrspace(1)* %out
+ store float %tmp3, float addrspace(1)* %out1
+ ret void
+}
+
+; There is no advantage to using v_mac when one of the operands is negated
+; and v_mad accepts more operand types.
+
+; GCN-LABEL: {{^}}mad_neg_src0:
+; GCN-NOT: v_mac_f32
+; GCN: v_mad_f32 v{{[0-9]+}}, -v{{[0-9]+}}, v{{[0-9]+}}, v{{[-0-9]}}
+define void @mad_neg_src0(float addrspace(1)* %out, float addrspace(1)* %in) #0 {
+entry:
+ %b_ptr = getelementptr float, float addrspace(1)* %in, i32 1
+ %c_ptr = getelementptr float, float addrspace(1)* %in, i32 2
+
+ %a = load float, float addrspace(1)* %in
+ %b = load float, float addrspace(1)* %b_ptr
+ %c = load float, float addrspace(1)* %c_ptr
+
+ %neg_a = fsub float 0.0, %a
+ %tmp0 = fmul float %neg_a, %b
+ %tmp1 = fadd float %tmp0, %c
+
+ store float %tmp1, float addrspace(1)* %out
+ ret void
+}
+
+; GCN-LABEL: {{^}}mad_neg_src1:
+; GCN-NOT: v_mac_f32
+; GCN: v_mad_f32 v{{[0-9]+}}, -v{{[0-9]+}}, v{{[0-9]+}}, v{{[-0-9]}}
+define void @mad_neg_src1(float addrspace(1)* %out, float addrspace(1)* %in) #0 {
+entry:
+ %b_ptr = getelementptr float, float addrspace(1)* %in, i32 1
+ %c_ptr = getelementptr float, float addrspace(1)* %in, i32 2
+
+ %a = load float, float addrspace(1)* %in
+ %b = load float, float addrspace(1)* %b_ptr
+ %c = load float, float addrspace(1)* %c_ptr
+
+ %neg_b = fsub float 0.0, %b
+ %tmp0 = fmul float %a, %neg_b
+ %tmp1 = fadd float %tmp0, %c
+
+ store float %tmp1, float addrspace(1)* %out
+ ret void
+}
+
+; GCN-LABEL: {{^}}mad_neg_src2:
+; GCN-NOT: v_mac
+; GCN: v_mad_f32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, -v{{[-0-9]}}
+define void @mad_neg_src2(float addrspace(1)* %out, float addrspace(1)* %in) #0 {
+entry:
+ %b_ptr = getelementptr float, float addrspace(1)* %in, i32 1
+ %c_ptr = getelementptr float, float addrspace(1)* %in, i32 2
+
+ %a = load float, float addrspace(1)* %in
+ %b = load float, float addrspace(1)* %b_ptr
+ %c = load float, float addrspace(1)* %c_ptr
+
+ %neg_c = fsub float 0.0, %c
+ %tmp0 = fmul float %a, %b
+ %tmp1 = fadd float %tmp0, %neg_c
+
+ store float %tmp1, float addrspace(1)* %out
+ ret void
+}
+
+attributes #0 = { "true" "unsafe-fp-math"="true" }
diff --git a/test/CodeGen/AMDGPU/vselect.ll b/test/CodeGen/AMDGPU/vselect.ll
index a3014b03d2b3..dc1f1ea11b01 100644
--- a/test/CodeGen/AMDGPU/vselect.ll
+++ b/test/CodeGen/AMDGPU/vselect.ll
@@ -1,14 +1,14 @@
-;RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck --check-prefix=EG %s
-;RUN: llc < %s -march=amdgcn -mcpu=SI -verify-machineinstrs | FileCheck --check-prefix=SI %s
-;RUN: llc < %s -march=amdgcn -mcpu=tonga -verify-machineinstrs | FileCheck --check-prefix=SI %s
+;RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck --check-prefix=EG --check-prefix=FUNC %s
+;RUN: llc < %s -march=amdgcn -mcpu=SI -verify-machineinstrs | FileCheck --check-prefix=SI --check-prefix=FUNC %s
+;RUN: llc < %s -march=amdgcn -mcpu=tonga -verify-machineinstrs | FileCheck --check-prefix=SI --check-prefix=VI --check-prefix=FUNC %s
+
+;FUNC-LABEL: {{^}}test_select_v2i32:
-;EG: {{^}}test_select_v2i32:
;EG: CNDE_INT {{\** *}}T{{[0-9]+\.[XYZW], PV\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
;EG: CNDE_INT {{\** *}}T{{[0-9]+\.[XYZW], PV\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
-;SI: {{^}}test_select_v2i32:
-;SI: v_cndmask_b32_e64
;SI: v_cndmask_b32_e64
+;SI: v_cndmask_b32_e32
define void @test_select_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> addrspace(1)* %in0, <2 x i32> addrspace(1)* %in1) {
entry:
@@ -20,13 +20,13 @@ entry:
ret void
}
-;EG: {{^}}test_select_v2f32:
+;FUNC-LABEL: {{^}}test_select_v2f32:
+
;EG: CNDE_INT {{\** *}}T{{[0-9]+\.[XYZW], PV\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
;EG: CNDE_INT {{\** *}}T{{[0-9]+\.[XYZW], PV\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
-;SI: {{^}}test_select_v2f32:
-;SI: v_cndmask_b32_e64
;SI: v_cndmask_b32_e64
+;SI: v_cndmask_b32_e32
define void @test_select_v2f32(<2 x float> addrspace(1)* %out, <2 x float> addrspace(1)* %in0, <2 x float> addrspace(1)* %in1) {
entry:
@@ -38,17 +38,19 @@ entry:
ret void
}
-;EG: {{^}}test_select_v4i32:
+;FUNC-LABEL: {{^}}test_select_v4i32:
+
;EG: CNDE_INT {{\** *}}T{{[0-9]+\.[XYZW], PV\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
;EG: CNDE_INT {{\** *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
;EG: CNDE_INT {{\** *}}T{{[0-9]+\.[XYZW], PV\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
;EG: CNDE_INT {{\** *}}T{{[0-9]+\.[XYZW], PV\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
-;SI: {{^}}test_select_v4i32:
-;SI: v_cndmask_b32_e64
-;SI: v_cndmask_b32_e64
-;SI: v_cndmask_b32_e64
-;SI: v_cndmask_b32_e64
+; FIXME: The shrinking does not happen on tonga
+
+;SI: v_cndmask_b32
+;SI: v_cndmask_b32
+;SI: v_cndmask_b32
+;SI: v_cndmask_b32
define void @test_select_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %in0, <4 x i32> addrspace(1)* %in1) {
entry:
@@ -60,7 +62,7 @@ entry:
ret void
}
-;EG: {{^}}test_select_v4f32:
+;FUNC-LABEL: {{^}}test_select_v4f32:
;EG: CNDE_INT {{\** *}}T{{[0-9]+\.[XYZW], PV\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
;EG: CNDE_INT {{\** *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
;EG: CNDE_INT {{\** *}}T{{[0-9]+\.[XYZW], PV\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
diff --git a/test/CodeGen/AMDGPU/xor.ll b/test/CodeGen/AMDGPU/xor.ll
index 089db59eabc7..ddb920af29d8 100644
--- a/test/CodeGen/AMDGPU/xor.ll
+++ b/test/CodeGen/AMDGPU/xor.ll
@@ -42,8 +42,8 @@ define void @xor_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %in
; SI-DAG: v_cmp_le_f32_e32 [[CMP0:vcc]], 0, {{v[0-9]+}}
; SI-DAG: v_cmp_le_f32_e64 [[CMP1:s\[[0-9]+:[0-9]+\]]], 1.0, {{v[0-9]+}}
-; SI: s_xor_b64 [[XOR:s\[[0-9]+:[0-9]+\]]], [[CMP0]], [[CMP1]]
-; SI: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], {{v[0-9]+}}, {{v[0-9]+}}, [[XOR]]
+; SI: s_xor_b64 [[XOR:vcc]], [[CMP0]], [[CMP1]]
+; SI: v_cndmask_b32_e32 [[RESULT:v[0-9]+]], {{v[0-9]+}}, {{v[0-9]+}}
; SI: buffer_store_dword [[RESULT]]
; SI: s_endpgm
define void @xor_i1(float addrspace(1)* %out, float addrspace(1)* %in0, float addrspace(1)* %in1) {
diff --git a/test/CodeGen/ARM/2013-05-02-AAPCS-ByVal-Structs-C4-C5-VFP.ll b/test/CodeGen/ARM/2013-05-02-AAPCS-ByVal-Structs-C4-C5-VFP.ll
index c93d2a2d34fb..ac5b6f9c9708 100644
--- a/test/CodeGen/ARM/2013-05-02-AAPCS-ByVal-Structs-C4-C5-VFP.ll
+++ b/test/CodeGen/ARM/2013-05-02-AAPCS-ByVal-Structs-C4-C5-VFP.ll
@@ -25,8 +25,7 @@ entry:
;CHECK: push {r7, lr}
;CHECK: sub sp, #4
;CHECK: add r0, sp, #12
- ;CHECK: str r2, [sp, #16]
- ;CHECK: str r1, [sp, #12]
+ ;CHECK: strd r1, r2, [sp, #12]
;CHECK: bl fooUseStruct
call void @fooUseStruct(%st_t* %p1)
ret void
diff --git a/test/CodeGen/ARM/2013-05-13-AAPCS-byval-padding2.ll b/test/CodeGen/ARM/2013-05-13-AAPCS-byval-padding2.ll
index 438b021a040b..d3aa2331d45f 100644
--- a/test/CodeGen/ARM/2013-05-13-AAPCS-byval-padding2.ll
+++ b/test/CodeGen/ARM/2013-05-13-AAPCS-byval-padding2.ll
@@ -9,8 +9,8 @@ define void @foo(%struct4bytes* byval %p0, ; --> R0
) {
;CHECK: sub sp, sp, #16
;CHECK: push {r11, lr}
-;CHECK: add r11, sp, #8
-;CHECK: stm r11, {r0, r1, r2, r3}
+;CHECK: add r12, sp, #8
+;CHECK: stm r12, {r0, r1, r2, r3}
;CHECK: add r0, sp, #12
;CHECK: bl useInt
;CHECK: pop {r11, lr}
diff --git a/test/CodeGen/ARM/Windows/hard-float.ll b/test/CodeGen/ARM/Windows/hard-float.ll
index f7b7ec273ce8..1ce02813dfc2 100644
--- a/test/CodeGen/ARM/Windows/hard-float.ll
+++ b/test/CodeGen/ARM/Windows/hard-float.ll
@@ -1,4 +1,8 @@
-; RUN: llc -mtriple=thumbv7-windows-itanium -mcpu=cortex-a9 -o - %s | FileCheck %s
+; RUN: llc -mtriple=thumbv7-windows-itanium -mcpu=cortex-a9 -o - %s \
+; RUN: | FileCheck %s -check-prefix CHECK-WIN
+
+; RUN: llc -mtriple=thumbv7-windows-gnu -mcpu=cortex-a9 -o - %s \
+; RUN: | FileCheck %s -check-prefix CHECK-GNU
define float @function(float %f, float %g) nounwind {
entry:
@@ -6,5 +10,7 @@ entry:
ret float %h
}
-; CHECK: vadd.f32 s0, s0, s1
+; CHECK-WIN: vadd.f32 s0, s0, s1
+
+; CHECK-GNU: vadd.f32 s0, s0, s1
diff --git a/test/CodeGen/ARM/Windows/long-calls.ll b/test/CodeGen/ARM/Windows/long-calls.ll
index 21c95fac91c5..4e5bdce146f0 100644
--- a/test/CodeGen/ARM/Windows/long-calls.ll
+++ b/test/CodeGen/ARM/Windows/long-calls.ll
@@ -1,4 +1,4 @@
-; RUN: llc -mtriple=thumbv7-windows -mcpu=cortex-a9 -arm-long-calls -o - %s \
+; RUN: llc -mtriple=thumbv7-windows -mcpu=cortex-a9 -mattr=+long-calls -o - %s \
; RUN: | FileCheck %s
declare arm_aapcs_vfpcc void @callee()
diff --git a/test/CodeGen/ARM/Windows/no-arm-mode.ll b/test/CodeGen/ARM/Windows/no-arm-mode.ll
index 6db031fc9169..30353640a4cc 100644
--- a/test/CodeGen/ARM/Windows/no-arm-mode.ll
+++ b/test/CodeGen/ARM/Windows/no-arm-mode.ll
@@ -1,5 +1,10 @@
; RUN: not llc -mtriple=armv7-windows-itanium -mcpu=cortex-a9 -o /dev/null %s 2>&1 \
-; RUN: | FileCheck %s
+; RUN: | FileCheck %s -check-prefix CHECK-WIN
-; CHECK: does not support ARM mode execution
+; RUN: not llc -mtriple=armv7-windows-gnu -mcpu=cortex-a9 -o /dev/null %s 2>&1 \
+; RUN: | FileCheck %s -check-prefix CHECK-GNU
+
+; CHECK-WIN: does not support ARM mode execution
+
+; CHECK-GNU: does not support ARM mode execution
diff --git a/test/CodeGen/ARM/Windows/pic.ll b/test/CodeGen/ARM/Windows/pic.ll
index 9ef7c35c5530..df4c400035a3 100644
--- a/test/CodeGen/ARM/Windows/pic.ll
+++ b/test/CodeGen/ARM/Windows/pic.ll
@@ -1,5 +1,8 @@
; RUN: llc -mtriple thumbv7-windows-itanium -relocation-model pic -filetype asm -o - %s \
-; RUN: | FileCheck %s
+; RUN: | FileCheck %s -check-prefix CHECK-WIN
+
+; RUN: llc -mtriple thumbv7-windows-gnu -relocation-model pic -filetype asm -o - %s \
+; RUN: | FileCheck %s -check-prefix CHECK-GNU
@external = external global i8
@@ -9,8 +12,12 @@ entry:
ret i8 %0
}
-; CHECK-LABEL: return_external
-; CHECK: movw r0, :lower16:external
-; CHECK: movt r0, :upper16:external
-; CHECK: ldrb r0, [r0]
+; CHECK-WIN-LABEL: return_external
+; CHECK-WIN: movw r0, :lower16:external
+; CHECK-WIN: movt r0, :upper16:external
+; CHECK-WIN: ldrb r0, [r0]
+; CHECK-GNU-LABEL: return_external
+; CHECK-GNU: movw r0, :lower16:external
+; CHECK-GNU: movt r0, :upper16:external
+; CHECK-GNU: ldrb r0, [r0]
diff --git a/test/CodeGen/ARM/Windows/structors.ll b/test/CodeGen/ARM/Windows/structors.ll
index 874b5bf35b81..eff1c7f4b384 100644
--- a/test/CodeGen/ARM/Windows/structors.ll
+++ b/test/CodeGen/ARM/Windows/structors.ll
@@ -1,4 +1,8 @@
-; RUN: llc -mtriple thumbv7-windows-itanium -o - %s | FileCheck %s
+; RUN: llc -mtriple thumbv7-windows-itanium -o - %s \
+; RUN: | FileCheck %s -check-prefix CHECK-WIN
+
+; RUN: llc -mtriple thumbv7-windows-gnu -o - %s \
+; RUN: | FileCheck %s -check-prefix CHECK-GNU
@llvm.global_ctors = appending global [1 x { i32, void ()*, i8* }] [{ i32, void ()*, i8* } { i32 65535, void ()* @function, i8* null }]
@@ -7,6 +11,8 @@ entry:
ret void
}
-; CHECK: .section .CRT$XCU,"dr"
-; CHECK: .long function
+; CHECK-WIN: .section .CRT$XCU,"dr"
+; CHECK-WIN: .long function
+; CHECK-GNU: .section .ctors,"dw"
+; CHECK-GNU: .long function
diff --git a/test/CodeGen/ARM/Windows/trivial-gnu-object.ll b/test/CodeGen/ARM/Windows/trivial-gnu-object.ll
new file mode 100644
index 000000000000..a242f39601cb
--- /dev/null
+++ b/test/CodeGen/ARM/Windows/trivial-gnu-object.ll
@@ -0,0 +1,10 @@
+; RUN: llc -mtriple=thumbv7-windows-itanium -filetype=obj -o - %s | llvm-objdump -d - | FileCheck %s
+; RUN: llc -mtriple=thumbv7-windows-gnu -filetype=obj -o - %s | llvm-objdump -d - | FileCheck %s
+
+define void @foo() {
+; CHECK: file format COFF-ARM
+
+; CHECK-LABEL: foo:
+; CHECK: bx lr
+ ret void
+}
diff --git a/test/CodeGen/ARM/arm-returnaddr.ll b/test/CodeGen/ARM/arm-returnaddr.ll
index 4266572b077f..26f8c67bb15a 100644
--- a/test/CodeGen/ARM/arm-returnaddr.ll
+++ b/test/CodeGen/ARM/arm-returnaddr.ll
@@ -8,7 +8,6 @@
define i8* @rt0(i32 %x) nounwind readnone {
entry:
; CHECK-LABEL: rt0:
-; CHECK: {r7, lr}
; CHECK: mov r0, lr
%0 = tail call i8* @llvm.returnaddress(i32 0)
ret i8* %0
@@ -17,10 +16,9 @@ entry:
define i8* @rt2() nounwind readnone {
entry:
; CHECK-LABEL: rt2:
-; CHECK: {r7, lr}
; CHECK: ldr r[[R0:[0-9]+]], [r7]
-; CHECK: ldr r0, [r0]
-; CHECK: ldr r0, [r0, #4]
+; CHECK: ldr r0, [r[[R0]]]
+; CHECK: ldr r0, [r[[R0]], #4]
%0 = tail call i8* @llvm.returnaddress(i32 2)
ret i8* %0
}
diff --git a/test/CodeGen/ARM/byval-align.ll b/test/CodeGen/ARM/byval-align.ll
index a26b5a795756..8a506280dd57 100644
--- a/test/CodeGen/ARM/byval-align.ll
+++ b/test/CodeGen/ARM/byval-align.ll
@@ -28,8 +28,7 @@ define i32 @test_align8(i8*, [4 x i32]* byval align 8 %b) {
; CHECK: push {r4, r7, lr}
; CHECK: add r7, sp, #4
-; CHECK-DAG: str r2, [r7, #8]
-; CHECK-DAG: str r3, [r7, #12]
+; CHECK: strd r2, r3, [r7, #8]
; CHECK: ldr r0, [r7, #8]
diff --git a/test/CodeGen/ARM/cttz.ll b/test/CodeGen/ARM/cttz.ll
new file mode 100644
index 000000000000..dacfca505931
--- /dev/null
+++ b/test/CodeGen/ARM/cttz.ll
@@ -0,0 +1,90 @@
+; RUN: llc < %s -mtriple arm-eabi -mattr=+v6t2 | FileCheck %s
+; RUN: llc < %s -mtriple arm-eabi -mattr=+v6t2 -mattr=+neon | FileCheck %s
+
+; This test checks the @llvm.cttz.* intrinsics for integers.
+
+declare i8 @llvm.cttz.i8(i8, i1)
+declare i16 @llvm.cttz.i16(i16, i1)
+declare i32 @llvm.cttz.i32(i32, i1)
+declare i64 @llvm.cttz.i64(i64, i1)
+
+;------------------------------------------------------------------------------
+
+define i8 @test_i8(i8 %a) {
+; CHECK-LABEL: test_i8:
+; CHECK: orr [[REG:r[0-9]+]], [[REG]], #256
+; CHECK: rbit
+; CHECK: clz
+ %tmp = call i8 @llvm.cttz.i8(i8 %a, i1 false)
+ ret i8 %tmp
+}
+
+define i16 @test_i16(i16 %a) {
+; CHECK-LABEL: test_i16:
+; CHECK: orr [[REG:r[0-9]+]], [[REG]], #65536
+; CHECK: rbit
+; CHECK: clz
+ %tmp = call i16 @llvm.cttz.i16(i16 %a, i1 false)
+ ret i16 %tmp
+}
+
+define i32 @test_i32(i32 %a) {
+; CHECK-LABEL: test_i32:
+; CHECK: rbit
+; CHECK: clz
+ %tmp = call i32 @llvm.cttz.i32(i32 %a, i1 false)
+ ret i32 %tmp
+}
+
+define i64 @test_i64(i64 %a) {
+; CHECK-LABEL: test_i64:
+; CHECK: rbit
+; CHECK: rbit
+; CHECK: cmp
+; CHECK: clz
+; CHECK: add
+; CHECK: clzne
+ %tmp = call i64 @llvm.cttz.i64(i64 %a, i1 false)
+ ret i64 %tmp
+}
+
+;------------------------------------------------------------------------------
+
+define i8 @test_i8_zero_undef(i8 %a) {
+; CHECK-LABEL: test_i8_zero_undef:
+; CHECK-NOT: orr
+; CHECK: rbit
+; CHECK: clz
+ %tmp = call i8 @llvm.cttz.i8(i8 %a, i1 true)
+ ret i8 %tmp
+}
+
+define i16 @test_i16_zero_undef(i16 %a) {
+; CHECK-LABEL: test_i16_zero_undef:
+; CHECK-NOT: orr
+; CHECK: rbit
+; CHECK: clz
+ %tmp = call i16 @llvm.cttz.i16(i16 %a, i1 true)
+ ret i16 %tmp
+}
+
+
+define i32 @test_i32_zero_undef(i32 %a) {
+; CHECK-LABEL: test_i32_zero_undef:
+; CHECK: rbit
+; CHECK: clz
+ %tmp = call i32 @llvm.cttz.i32(i32 %a, i1 true)
+ ret i32 %tmp
+}
+
+define i64 @test_i64_zero_undef(i64 %a) {
+; CHECK-LABEL: test_i64_zero_undef:
+; CHECK: rbit
+; CHECK: rbit
+; CHECK: cmp
+; CHECK: clz
+; CHECK: add
+; CHECK: clzne
+ %tmp = call i64 @llvm.cttz.i64(i64 %a, i1 true)
+ ret i64 %tmp
+}
diff --git a/test/CodeGen/ARM/cttz_vector.ll b/test/CodeGen/ARM/cttz_vector.ll
new file mode 100644
index 000000000000..9480d75db47a
--- /dev/null
+++ b/test/CodeGen/ARM/cttz_vector.ll
@@ -0,0 +1,383 @@
+; RUN: llc < %s -mtriple armv7-linux-gnueabihf -mattr=+neon | FileCheck %s
+
+; This test checks the @llvm.cttz.* intrinsics for vectors.
+
+declare <1 x i8> @llvm.cttz.v1i8(<1 x i8>, i1)
+declare <2 x i8> @llvm.cttz.v2i8(<2 x i8>, i1)
+declare <4 x i8> @llvm.cttz.v4i8(<4 x i8>, i1)
+declare <8 x i8> @llvm.cttz.v8i8(<8 x i8>, i1)
+declare <16 x i8> @llvm.cttz.v16i8(<16 x i8>, i1)
+
+declare <1 x i16> @llvm.cttz.v1i16(<1 x i16>, i1)
+declare <2 x i16> @llvm.cttz.v2i16(<2 x i16>, i1)
+declare <4 x i16> @llvm.cttz.v4i16(<4 x i16>, i1)
+declare <8 x i16> @llvm.cttz.v8i16(<8 x i16>, i1)
+
+declare <1 x i32> @llvm.cttz.v1i32(<1 x i32>, i1)
+declare <2 x i32> @llvm.cttz.v2i32(<2 x i32>, i1)
+declare <4 x i32> @llvm.cttz.v4i32(<4 x i32>, i1)
+
+declare <1 x i64> @llvm.cttz.v1i64(<1 x i64>, i1)
+declare <2 x i64> @llvm.cttz.v2i64(<2 x i64>, i1)
+
+;------------------------------------------------------------------------------
+
+define void @test_v1i8(<1 x i8>* %p) {
+; CHECK-LABEL: test_v1i8
+ %a = load <1 x i8>, <1 x i8>* %p
+ %tmp = call <1 x i8> @llvm.cttz.v1i8(<1 x i8> %a, i1 false)
+ store <1 x i8> %tmp, <1 x i8>* %p
+ ret void
+}
+
+define void @test_v2i8(<2 x i8>* %p) {
+; CHECK-LABEL: test_v2i8:
+ %a = load <2 x i8>, <2 x i8>* %p
+ %tmp = call <2 x i8> @llvm.cttz.v2i8(<2 x i8> %a, i1 false)
+ store <2 x i8> %tmp, <2 x i8>* %p
+ ret void
+}
+
+define void @test_v4i8(<4 x i8>* %p) {
+; CHECK-LABEL: test_v4i8:
+ %a = load <4 x i8>, <4 x i8>* %p
+ %tmp = call <4 x i8> @llvm.cttz.v4i8(<4 x i8> %a, i1 false)
+ store <4 x i8> %tmp, <4 x i8>* %p
+ ret void
+}
+
+define void @test_v8i8(<8 x i8>* %p) {
+; CHECK-LABEL: test_v8i8:
+; CHECK: vldr [[D1:d[0-9]+]], [r0]
+; CHECK: vmov.i8 [[D2:d[0-9]+]], #0x1
+; CHECK: vneg.s8 [[D3:d[0-9]+]], [[D1]]
+; CHECK: vand [[D1]], [[D1]], [[D3]]
+; CHECK: vsub.i8 [[D1]], [[D1]], [[D2]]
+; CHECK: vcnt.8 [[D1]], [[D1]]
+; CHECK: vstr [[D1]], [r0]
+ %a = load <8 x i8>, <8 x i8>* %p
+ %tmp = call <8 x i8> @llvm.cttz.v8i8(<8 x i8> %a, i1 false)
+ store <8 x i8> %tmp, <8 x i8>* %p
+ ret void
+}
+
+define void @test_v16i8(<16 x i8>* %p) {
+; CHECK-LABEL: test_v16i8:
+; CHECK: vld1.64 {[[D1:d[0-9]+]], [[D2:d[0-9]+]]}, [r0]
+; CHECK: vmov.i8 [[Q2:q[0-9]+]], #0x1
+; CHECK: vneg.s8 [[Q3:q[0-9]+]], [[Q1:q[0-9]+]]
+; CHECK: vand [[Q1]], [[Q1]], [[Q3]]
+; CHECK: vsub.i8 [[Q1]], [[Q1]], [[Q2]]
+; CHECK: vcnt.8 [[Q1]], [[Q1]]
+; CHECK: vst1.64 {[[D1]], [[D2]]}, [r0]
+ %a = load <16 x i8>, <16 x i8>* %p
+ %tmp = call <16 x i8> @llvm.cttz.v16i8(<16 x i8> %a, i1 false)
+ store <16 x i8> %tmp, <16 x i8>* %p
+ ret void
+}
+
+define void @test_v1i16(<1 x i16>* %p) {
+; CHECK-LABEL: test_v1i16:
+ %a = load <1 x i16>, <1 x i16>* %p
+ %tmp = call <1 x i16> @llvm.cttz.v1i16(<1 x i16> %a, i1 false)
+ store <1 x i16> %tmp, <1 x i16>* %p
+ ret void
+}
+
+define void @test_v2i16(<2 x i16>* %p) {
+; CHECK-LABEL: test_v2i16:
+ %a = load <2 x i16>, <2 x i16>* %p
+ %tmp = call <2 x i16> @llvm.cttz.v2i16(<2 x i16> %a, i1 false)
+ store <2 x i16> %tmp, <2 x i16>* %p
+ ret void
+}
+
+define void @test_v4i16(<4 x i16>* %p) {
+; CHECK-LABEL: test_v4i16:
+; CHECK: vldr [[D1:d[0-9]+]], [r0]
+; CHECK: vmov.i16 [[D2:d[0-9]+]], #0x1
+; CHECK: vneg.s16 [[D3:d[0-9]+]], [[D1]]
+; CHECK: vand [[D1]], [[D1]], [[D3]]
+; CHECK: vsub.i16 [[D1]], [[D1]], [[D2]]
+; CHECK: vcnt.8 [[D1]], [[D1]]
+; CHECK: vpaddl.u8 [[D1]], [[D1]]
+; CHECK: vstr [[D1]], [r0]
+ %a = load <4 x i16>, <4 x i16>* %p
+ %tmp = call <4 x i16> @llvm.cttz.v4i16(<4 x i16> %a, i1 false)
+ store <4 x i16> %tmp, <4 x i16>* %p
+ ret void
+}
+
+define void @test_v8i16(<8 x i16>* %p) {
+; CHECK-LABEL: test_v8i16:
+; CHECK: vld1.64 {[[D1:d[0-9]+]], [[D2:d[0-9]+]]}, [r0]
+; CHECK: vmov.i16 [[Q2:q[0-9]+]], #0x1
+; CHECK: vneg.s16 [[Q3:q[0-9]+]], [[Q1:q[0-9]+]]
+; CHECK: vand [[Q1]], [[Q1]], [[Q3]]
+; CHECK: vsub.i16 [[Q1]], [[Q1]], [[Q2]]
+; CHECK: vcnt.8 [[Q1]], [[Q1]]
+; CHECK: vpaddl.u8 [[Q1]], [[Q1]]
+; CHECK: vst1.64 {[[D1]], [[D2]]}, [r0]
+ %a = load <8 x i16>, <8 x i16>* %p
+ %tmp = call <8 x i16> @llvm.cttz.v8i16(<8 x i16> %a, i1 false)
+ store <8 x i16> %tmp, <8 x i16>* %p
+ ret void
+}
+
+define void @test_v1i32(<1 x i32>* %p) {
+; CHECK-LABEL: test_v1i32:
+ %a = load <1 x i32>, <1 x i32>* %p
+ %tmp = call <1 x i32> @llvm.cttz.v1i32(<1 x i32> %a, i1 false)
+ store <1 x i32> %tmp, <1 x i32>* %p
+ ret void
+}
+
+define void @test_v2i32(<2 x i32>* %p) {
+; CHECK-LABEL: test_v2i32:
+; CHECK: vldr [[D1:d[0-9]+]], [r0]
+; CHECK: vmov.i32 [[D2:d[0-9]+]], #0x1
+; CHECK: vneg.s32 [[D3:d[0-9]+]], [[D1]]
+; CHECK: vand [[D1]], [[D1]], [[D3]]
+; CHECK: vsub.i32 [[D1]], [[D1]], [[D2]]
+; CHECK: vcnt.8 [[D1]], [[D1]]
+; CHECK: vpaddl.u8 [[D1]], [[D1]]
+; CHECK: vpaddl.u16 [[D1]], [[D1]]
+; CHECK: vstr [[D1]], [r0]
+ %a = load <2 x i32>, <2 x i32>* %p
+ %tmp = call <2 x i32> @llvm.cttz.v2i32(<2 x i32> %a, i1 false)
+ store <2 x i32> %tmp, <2 x i32>* %p
+ ret void
+}
+
+define void @test_v4i32(<4 x i32>* %p) {
+; CHECK-LABEL: test_v4i32:
+; CHECK: vld1.64 {[[D1:d[0-9]+]], [[D2:d[0-9]+]]}, [r0]
+; CHECK: vmov.i32 [[Q2:q[0-9]+]], #0x1
+; CHECK: vneg.s32 [[Q3:q[0-9]+]], [[Q1:q[0-9]+]]
+; CHECK: vand [[Q1]], [[Q1]], [[Q3]]
+; CHECK: vsub.i32 [[Q1]], [[Q1]], [[Q2]]
+; CHECK: vcnt.8 [[Q1]], [[Q1]]
+; CHECK: vpaddl.u8 [[Q1]], [[Q1]]
+; CHECK: vpaddl.u16 [[Q1]], [[Q1]]
+; CHECK: vst1.64 {[[D1]], [[D2]]}, [r0]
+ %a = load <4 x i32>, <4 x i32>* %p
+ %tmp = call <4 x i32> @llvm.cttz.v4i32(<4 x i32> %a, i1 false)
+ store <4 x i32> %tmp, <4 x i32>* %p
+ ret void
+}
+
+define void @test_v1i64(<1 x i64>* %p) {
+; CHECK-LABEL: test_v1i64:
+; CHECK: vldr [[D1:d[0-9]+]], [r0]
+; CHECK: vmov.i32 [[D2:d[0-9]+]], #0x0
+; CHECK: vmov.i64 [[D3:d[0-9]+]], #0xffffffffffffffff
+; CHECK: vsub.i64 [[D2]], [[D2]], [[D1]]
+; CHECK: vand [[D1]], [[D1]], [[D2]]
+; CHECK: vadd.i64 [[D1]], [[D1]], [[D3]]
+; CHECK: vcnt.8 [[D1]], [[D1]]
+; CHECK: vpaddl.u8 [[D1]], [[D1]]
+; CHECK: vpaddl.u16 [[D1]], [[D1]]
+; CHECK: vpaddl.u32 [[D1]], [[D1]]
+; CHECK: vstr [[D1]], [r0]
+ %a = load <1 x i64>, <1 x i64>* %p
+ %tmp = call <1 x i64> @llvm.cttz.v1i64(<1 x i64> %a, i1 false)
+ store <1 x i64> %tmp, <1 x i64>* %p
+ ret void
+}
+
+define void @test_v2i64(<2 x i64>* %p) {
+; CHECK-LABEL: test_v2i64:
+; CHECK: vld1.64 {[[D1:d[0-9]+]], [[D2:d[0-9]+]]}, [r0]
+; CHECK: vmov.i32 [[Q2:q[0-9]+]], #0x0
+; CHECK: vmov.i64 [[Q3:q[0-9]+]], #0xffffffffffffffff
+; CHECK: vsub.i64 [[Q2]], [[Q2]], [[Q1:q[0-9]+]]
+; CHECK: vand [[Q1]], [[Q1]], [[Q2]]
+; CHECK: vadd.i64 [[Q1]], [[Q1]], [[Q3]]
+; CHECK: vcnt.8 [[Q1]], [[Q1]]
+; CHECK: vpaddl.u8 [[Q1]], [[Q1]]
+; CHECK: vpaddl.u16 [[Q1]], [[Q1]]
+; CHECK: vpaddl.u32 [[Q1]], [[Q1]]
+; CHECK: vst1.64 {[[D1]], [[D2]]}, [r0]
+ %a = load <2 x i64>, <2 x i64>* %p
+ %tmp = call <2 x i64> @llvm.cttz.v2i64(<2 x i64> %a, i1 false)
+ store <2 x i64> %tmp, <2 x i64>* %p
+ ret void
+}
+
+;------------------------------------------------------------------------------
+
+define void @test_v1i8_zero_undef(<1 x i8>* %p) {
+; CHECK-LABEL: test_v1i8_zero_undef
+ %a = load <1 x i8>, <1 x i8>* %p
+ %tmp = call <1 x i8> @llvm.cttz.v1i8(<1 x i8> %a, i1 true)
+ store <1 x i8> %tmp, <1 x i8>* %p
+ ret void
+}
+
+define void @test_v2i8_zero_undef(<2 x i8>* %p) {
+; CHECK-LABEL: test_v2i8_zero_undef:
+ %a = load <2 x i8>, <2 x i8>* %p
+ %tmp = call <2 x i8> @llvm.cttz.v2i8(<2 x i8> %a, i1 true)
+ store <2 x i8> %tmp, <2 x i8>* %p
+ ret void
+}
+
+define void @test_v4i8_zero_undef(<4 x i8>* %p) {
+; CHECK-LABEL: test_v4i8_zero_undef:
+ %a = load <4 x i8>, <4 x i8>* %p
+ %tmp = call <4 x i8> @llvm.cttz.v4i8(<4 x i8> %a, i1 true)
+ store <4 x i8> %tmp, <4 x i8>* %p
+ ret void
+}
+
+define void @test_v8i8_zero_undef(<8 x i8>* %p) {
+; CHECK-LABEL: test_v8i8_zero_undef:
+; CHECK: vldr [[D1:d[0-9]+]], [r0]
+; CHECK: vmov.i8 [[D2:d[0-9]+]], #0x1
+; CHECK: vneg.s8 [[D3:d[0-9]+]], [[D1]]
+; CHECK: vand [[D1]], [[D1]], [[D3]]
+; CHECK: vsub.i8 [[D1]], [[D1]], [[D2]]
+; CHECK: vcnt.8 [[D1]], [[D1]]
+; CHECK: vstr [[D1]], [r0]
+ %a = load <8 x i8>, <8 x i8>* %p
+ %tmp = call <8 x i8> @llvm.cttz.v8i8(<8 x i8> %a, i1 true)
+ store <8 x i8> %tmp, <8 x i8>* %p
+ ret void
+}
+
+define void @test_v16i8_zero_undef(<16 x i8>* %p) {
+; CHECK-LABEL: test_v16i8_zero_undef:
+; CHECK: vld1.64 {[[D1:d[0-9]+]], [[D2:d[0-9]+]]}, [r0]
+; CHECK: vmov.i8 [[Q2:q[0-9]+]], #0x1
+; CHECK: vneg.s8 [[Q3:q[0-9]+]], [[Q1:q[0-9]+]]
+; CHECK: vand [[Q1]], [[Q1]], [[Q3]]
+; CHECK: vsub.i8 [[Q1]], [[Q1]], [[Q2]]
+; CHECK: vcnt.8 [[Q1]], [[Q1]]
+; CHECK: vst1.64 {[[D1]], [[D2]]}, [r0]
+ %a = load <16 x i8>, <16 x i8>* %p
+ %tmp = call <16 x i8> @llvm.cttz.v16i8(<16 x i8> %a, i1 true)
+ store <16 x i8> %tmp, <16 x i8>* %p
+ ret void
+}
+
+define void @test_v1i16_zero_undef(<1 x i16>* %p) {
+; CHECK-LABEL: test_v1i16_zero_undef:
+ %a = load <1 x i16>, <1 x i16>* %p
+ %tmp = call <1 x i16> @llvm.cttz.v1i16(<1 x i16> %a, i1 true)
+ store <1 x i16> %tmp, <1 x i16>* %p
+ ret void
+}
+
+define void @test_v2i16_zero_undef(<2 x i16>* %p) {
+; CHECK-LABEL: test_v2i16_zero_undef:
+ %a = load <2 x i16>, <2 x i16>* %p
+ %tmp = call <2 x i16> @llvm.cttz.v2i16(<2 x i16> %a, i1 true)
+ store <2 x i16> %tmp, <2 x i16>* %p
+ ret void
+}
+
+define void @test_v4i16_zero_undef(<4 x i16>* %p) {
+; CHECK-LABEL: test_v4i16_zero_undef:
+; CHECK: vldr [[D1:d[0-9]+]], [r0]
+; CHECK: vneg.s16 [[D2:d[0-9]+]], [[D1]]
+; CHECK: vand [[D1]], [[D1]], [[D2]]
+; CHECK: vmov.i16 [[D3:d[0-9]+]], #0xf
+; CHECK: vclz.i16 [[D1]], [[D1]]
+; CHECK: vsub.i16 [[D1]], [[D3]], [[D1]]
+; CHECK: vstr [[D1]], [r0]
+ %a = load <4 x i16>, <4 x i16>* %p
+ %tmp = call <4 x i16> @llvm.cttz.v4i16(<4 x i16> %a, i1 true)
+ store <4 x i16> %tmp, <4 x i16>* %p
+ ret void
+}
+
+define void @test_v8i16_zero_undef(<8 x i16>* %p) {
+; CHECK-LABEL: test_v8i16_zero_undef:
+; CHECK: vld1.64 {[[D1:d[0-9]+]], [[D2:d[0-9]+]]}, [r0]
+; CHECK: vneg.s16 [[Q2:q[0-9]+]], [[Q1:q[0-9]+]]
+; CHECK: vand [[Q1]], [[Q1]], [[Q2]]
+; CHECK: vmov.i16 [[Q3:q[0-9]+]], #0xf
+; CHECK: vclz.i16 [[Q1]], [[Q1]]
+; CHECK: vsub.i16 [[Q1]], [[Q3]], [[Q1]]
+; CHECK: vst1.64 {[[D1]], [[D2]]}, [r0]
+ %a = load <8 x i16>, <8 x i16>* %p
+ %tmp = call <8 x i16> @llvm.cttz.v8i16(<8 x i16> %a, i1 true)
+ store <8 x i16> %tmp, <8 x i16>* %p
+ ret void
+}
+
+define void @test_v1i32_zero_undef(<1 x i32>* %p) {
+; CHECK-LABEL: test_v1i32_zero_undef:
+ %a = load <1 x i32>, <1 x i32>* %p
+ %tmp = call <1 x i32> @llvm.cttz.v1i32(<1 x i32> %a, i1 true)
+ store <1 x i32> %tmp, <1 x i32>* %p
+ ret void
+}
+
+define void @test_v2i32_zero_undef(<2 x i32>* %p) {
+; CHECK-LABEL: test_v2i32_zero_undef:
+; CHECK: vldr [[D1:d[0-9]+]], [r0]
+; CHECK: vneg.s32 [[D2:d[0-9]+]], [[D1]]
+; CHECK: vand [[D1]], [[D1]], [[D2]]
+; CHECK: vmov.i32 [[D3:d[0-9]+]], #0x1f
+; CHECK: vclz.i32 [[D1]], [[D1]]
+; CHECK: vsub.i32 [[D1]], [[D3]], [[D1]]
+; CHECK: vstr [[D1]], [r0]
+ %a = load <2 x i32>, <2 x i32>* %p
+ %tmp = call <2 x i32> @llvm.cttz.v2i32(<2 x i32> %a, i1 true)
+ store <2 x i32> %tmp, <2 x i32>* %p
+ ret void
+}
+
+define void @test_v4i32_zero_undef(<4 x i32>* %p) {
+; CHECK-LABEL: test_v4i32_zero_undef:
+; CHECK: vld1.64 {[[D1:d[0-9]+]], [[D2:d[0-9]+]]}, [r0]
+; CHECK: vneg.s32 [[Q2:q[0-9]+]], [[Q1:q[0-9]+]]
+; CHECK: vand [[Q1]], [[Q1]], [[Q2]]
+; CHECK: vmov.i32 [[Q3:q[0-9]+]], #0x1f
+; CHECK: vclz.i32 [[Q1]], [[Q1]]
+; CHECK: vsub.i32 [[Q1]], [[Q3]], [[Q1]]
+; CHECK: vst1.64 {[[D1]], [[D2]]}, [r0]
+ %a = load <4 x i32>, <4 x i32>* %p
+ %tmp = call <4 x i32> @llvm.cttz.v4i32(<4 x i32> %a, i1 true)
+ store <4 x i32> %tmp, <4 x i32>* %p
+ ret void
+}
+
+define void @test_v1i64_zero_undef(<1 x i64>* %p) {
+; CHECK-LABEL: test_v1i64_zero_undef:
+; CHECK: vldr [[D1:d[0-9]+]], [r0]
+; CHECK: vmov.i32 [[D2:d[0-9]+]], #0x0
+; CHECK: vmov.i64 [[D3:d[0-9]+]], #0xffffffffffffffff
+; CHECK: vsub.i64 [[D2]], [[D2]], [[D1]]
+; CHECK: vand [[D1]], [[D1]], [[D2]]
+; CHECK: vadd.i64 [[D1]], [[D1]], [[D3]]
+; CHECK: vcnt.8 [[D1]], [[D1]]
+; CHECK: vpaddl.u8 [[D1]], [[D1]]
+; CHECK: vpaddl.u16 [[D1]], [[D1]]
+; CHECK: vpaddl.u32 [[D1]], [[D1]]
+; CHECK: vstr [[D1]], [r0]
+ %a = load <1 x i64>, <1 x i64>* %p
+ %tmp = call <1 x i64> @llvm.cttz.v1i64(<1 x i64> %a, i1 true)
+ store <1 x i64> %tmp, <1 x i64>* %p
+ ret void
+}
+
+define void @test_v2i64_zero_undef(<2 x i64>* %p) {
+; CHECK-LABEL: test_v2i64_zero_undef:
+; CHECK: vld1.64 {[[D1:d[0-9]+]], [[D2:d[0-9]+]]}, [r0]
+; CHECK: vmov.i32 [[Q2:q[0-9]+]], #0x0
+; CHECK: vmov.i64 [[Q3:q[0-9]+]], #0xffffffffffffffff
+; CHECK: vsub.i64 [[Q2]], [[Q2]], [[Q1:q[0-9]+]]
+; CHECK: vand [[Q1]], [[Q1]], [[Q2]]
+; CHECK: vadd.i64 [[Q1]], [[Q1]], [[Q3]]
+; CHECK: vcnt.8 [[Q1]], [[Q1]]
+; CHECK: vpaddl.u8 [[Q1]], [[Q1]]
+; CHECK: vpaddl.u16 [[Q1]], [[Q1]]
+; CHECK: vpaddl.u32 [[Q1]], [[Q1]]
+; CHECK: vst1.64 {[[D1]], [[D2]]}, [r0]
+ %a = load <2 x i64>, <2 x i64>* %p
+ %tmp = call <2 x i64> @llvm.cttz.v2i64(<2 x i64> %a, i1 true)
+ store <2 x i64> %tmp, <2 x i64>* %p
+ ret void
+}
diff --git a/test/CodeGen/ARM/ctz.ll b/test/CodeGen/ARM/ctz.ll
deleted file mode 100644
index 2d88b0351cf3..000000000000
--- a/test/CodeGen/ARM/ctz.ll
+++ /dev/null
@@ -1,11 +0,0 @@
-; RUN: llc -mtriple=arm-eabi -mattr=+v6t2 %s -o - | FileCheck %s
-
-declare i32 @llvm.cttz.i32(i32, i1)
-
-define i32 @f1(i32 %a) {
-; CHECK-LABEL: f1:
-; CHECK: rbit
-; CHECK: clz
- %tmp = call i32 @llvm.cttz.i32( i32 %a, i1 true )
- ret i32 %tmp
-}
diff --git a/test/CodeGen/ARM/fast-isel-call.ll b/test/CodeGen/ARM/fast-isel-call.ll
index bd170f30d979..e382e78a9950 100644
--- a/test/CodeGen/ARM/fast-isel-call.ll
+++ b/test/CodeGen/ARM/fast-isel-call.ll
@@ -1,9 +1,9 @@
; RUN: llc < %s -O0 -verify-machineinstrs -fast-isel-abort=1 -relocation-model=dynamic-no-pic -mtriple=armv7-apple-ios | FileCheck %s --check-prefix=ARM
; RUN: llc < %s -O0 -verify-machineinstrs -fast-isel-abort=1 -relocation-model=dynamic-no-pic -mtriple=armv7-linux-gnueabi | FileCheck %s --check-prefix=ARM
; RUN: llc < %s -O0 -verify-machineinstrs -fast-isel-abort=1 -relocation-model=dynamic-no-pic -mtriple=thumbv7-apple-ios | FileCheck %s --check-prefix=THUMB
-; RUN: llc < %s -O0 -verify-machineinstrs -fast-isel-abort=1 -relocation-model=dynamic-no-pic -mtriple=armv7-apple-ios -arm-long-calls | FileCheck %s --check-prefix=ARM-LONG
-; RUN: llc < %s -O0 -verify-machineinstrs -fast-isel-abort=1 -relocation-model=dynamic-no-pic -mtriple=armv7-linux-gnueabi -arm-long-calls | FileCheck %s --check-prefix=ARM-LONG
-; RUN: llc < %s -O0 -verify-machineinstrs -fast-isel-abort=1 -relocation-model=dynamic-no-pic -mtriple=thumbv7-apple-ios -arm-long-calls | FileCheck %s --check-prefix=THUMB-LONG
+; RUN: llc < %s -O0 -verify-machineinstrs -fast-isel-abort=1 -relocation-model=dynamic-no-pic -mtriple=armv7-apple-ios -mattr=+long-calls | FileCheck %s --check-prefix=ARM-LONG
+; RUN: llc < %s -O0 -verify-machineinstrs -fast-isel-abort=1 -relocation-model=dynamic-no-pic -mtriple=armv7-linux-gnueabi -mattr=+long-calls | FileCheck %s --check-prefix=ARM-LONG
+; RUN: llc < %s -O0 -verify-machineinstrs -fast-isel-abort=1 -relocation-model=dynamic-no-pic -mtriple=thumbv7-apple-ios -mattr=+long-calls | FileCheck %s --check-prefix=THUMB-LONG
; RUN: llc < %s -O0 -verify-machineinstrs -fast-isel-abort=1 -relocation-model=dynamic-no-pic -mtriple=armv7-apple-ios -mattr=-vfp2 | FileCheck %s --check-prefix=ARM-NOVFP
; RUN: llc < %s -O0 -verify-machineinstrs -fast-isel-abort=1 -relocation-model=dynamic-no-pic -mtriple=armv7-linux-gnueabi -mattr=-vfp2 | FileCheck %s --check-prefix=ARM-NOVFP
; RUN: llc < %s -O0 -verify-machineinstrs -fast-isel-abort=1 -relocation-model=dynamic-no-pic -mtriple=thumbv7-apple-ios -mattr=-vfp2 | FileCheck %s --check-prefix=THUMB-NOVFP
diff --git a/test/CodeGen/ARM/fast-isel-intrinsic.ll b/test/CodeGen/ARM/fast-isel-intrinsic.ll
index 6b434b74ca79..1c7ff6879386 100644
--- a/test/CodeGen/ARM/fast-isel-intrinsic.ll
+++ b/test/CodeGen/ARM/fast-isel-intrinsic.ll
@@ -1,9 +1,9 @@
; RUN: llc < %s -O0 -fast-isel-abort=1 -relocation-model=dynamic-no-pic -mtriple=armv7-apple-ios -verify-machineinstrs | FileCheck %s --check-prefix=ARM
; RUN: llc < %s -O0 -fast-isel-abort=1 -relocation-model=dynamic-no-pic -mtriple=armv7-linux-gnueabi -verify-machineinstrs | FileCheck %s --check-prefix=ARM
; RUN: llc < %s -O0 -fast-isel-abort=1 -relocation-model=dynamic-no-pic -mtriple=thumbv7-apple-ios -verify-machineinstrs | FileCheck %s --check-prefix=THUMB
-; RUN: llc < %s -O0 -fast-isel-abort=1 -relocation-model=dynamic-no-pic -mtriple=armv7-apple-ios -arm-long-calls -verify-machineinstrs | FileCheck %s --check-prefix=ARM-LONG
-; RUN: llc < %s -O0 -fast-isel-abort=1 -relocation-model=dynamic-no-pic -mtriple=armv7-linux-gnueabi -arm-long-calls -verify-machineinstrs | FileCheck %s --check-prefix=ARM-LONG
-; RUN: llc < %s -O0 -fast-isel-abort=1 -relocation-model=dynamic-no-pic -mtriple=thumbv7-apple-ios -arm-long-calls -verify-machineinstrs | FileCheck %s --check-prefix=THUMB-LONG
+; RUN: llc < %s -O0 -fast-isel-abort=1 -relocation-model=dynamic-no-pic -mtriple=armv7-apple-ios -mattr=+long-calls -verify-machineinstrs | FileCheck %s --check-prefix=ARM-LONG
+; RUN: llc < %s -O0 -fast-isel-abort=1 -relocation-model=dynamic-no-pic -mtriple=armv7-linux-gnueabi -mattr=+long-calls -verify-machineinstrs | FileCheck %s --check-prefix=ARM-LONG
+; RUN: llc < %s -O0 -fast-isel-abort=1 -relocation-model=dynamic-no-pic -mtriple=thumbv7-apple-ios -mattr=+long-calls -verify-machineinstrs | FileCheck %s --check-prefix=THUMB-LONG
; Note that some of these tests assume that relocations are either
; movw/movt or constant pool loads. Different platforms will select
diff --git a/test/CodeGen/ARM/fast-isel-static.ll b/test/CodeGen/ARM/fast-isel-static.ll
index c3980cb51f67..200387cf8926 100644
--- a/test/CodeGen/ARM/fast-isel-static.ll
+++ b/test/CodeGen/ARM/fast-isel-static.ll
@@ -1,5 +1,5 @@
-; RUN: llc < %s -mtriple=thumbv7-apple-ios -O0 -verify-machineinstrs -fast-isel-abort=1 -relocation-model=static -arm-long-calls | FileCheck -check-prefix=CHECK-LONG %s
-; RUN: llc < %s -mtriple=armv7-linux-gnueabi -O0 -verify-machineinstrs -fast-isel-abort=1 -relocation-model=static -arm-long-calls | FileCheck -check-prefix=CHECK-LONG %s
+; RUN: llc < %s -mtriple=thumbv7-apple-ios -O0 -verify-machineinstrs -fast-isel-abort=1 -relocation-model=static -mattr=+long-calls | FileCheck -check-prefix=CHECK-LONG %s
+; RUN: llc < %s -mtriple=armv7-linux-gnueabi -O0 -verify-machineinstrs -fast-isel-abort=1 -relocation-model=static -mattr=+long-calls | FileCheck -check-prefix=CHECK-LONG %s
; RUN: llc < %s -mtriple=thumbv7-apple-ios -O0 -verify-machineinstrs -fast-isel-abort=1 -relocation-model=static | FileCheck -check-prefix=CHECK-NORM %s
; RUN: llc < %s -mtriple=armv7-linux-gnueabi -O0 -verify-machineinstrs -fast-isel-abort=1 -relocation-model=static | FileCheck -check-prefix=CHECK-NORM %s
diff --git a/test/CodeGen/ARM/ldrd.ll b/test/CodeGen/ARM/ldrd.ll
index f3e13671ac37..56cdcaedf900 100644
--- a/test/CodeGen/ARM/ldrd.ll
+++ b/test/CodeGen/ARM/ldrd.ll
@@ -3,6 +3,7 @@
; rdar://6949835
; RUN: llc < %s -mtriple=thumbv7-apple-ios -mcpu=cortex-a8 -regalloc=basic | FileCheck %s -check-prefix=BASIC -check-prefix=CHECK
; RUN: llc < %s -mtriple=thumbv7-apple-ios -mcpu=cortex-a8 -regalloc=greedy | FileCheck %s -check-prefix=GREEDY -check-prefix=CHECK
+; RUN: llc < %s -mtriple=thumbv7-apple-ios -mcpu=swift | FileCheck %s -check-prefix=SWIFT -check-prefix=CHECK
; Magic ARM pair hints work best with linearscan / fast.
@@ -110,5 +111,73 @@ entry:
ret void
}
+; CHECK-LABEL: strd_spill_ldrd_reload:
+; A8: strd r1, r0, [sp, #-8]!
+; M3: strd r1, r0, [sp, #-8]!
+; BASIC: strd r1, r0, [sp, #-8]!
+; GREEDY: strd r0, r1, [sp, #-8]!
+; CHECK: @ InlineAsm Start
+; CHECK: @ InlineAsm End
+; A8: ldrd r2, r1, [sp]
+; M3: ldrd r2, r1, [sp]
+; BASIC: ldrd r2, r1, [sp]
+; GREEDY: ldrd r1, r2, [sp]
+; CHECK: bl{{x?}} _extfunc
+define void @strd_spill_ldrd_reload(i32 %v0, i32 %v1) {
+ ; force %v0 and %v1 to be spilled
+ call void asm sideeffect "", "~{r0},~{r1},~{r2},~{r3},~{r4},~{r5},~{r6},~{r7},~{r8},~{r9},~{r10},~{r11},~{r12},~{lr}"()
+ ; force the reloaded %v0, %v1 into different registers
+ call void @extfunc(i32 0, i32 %v0, i32 %v1, i32 7)
+ ret void
+}
+
+declare void @extfunc2(i32*, i32, i32)
+
+; CHECK-LABEL: ldrd_postupdate_dec:
+; CHECK: ldrd r1, r2, [r0], #-8
+; CHECK-NEXT: bl{{x?}} _extfunc
+define void @ldrd_postupdate_dec(i32* %p0) {
+ %p0.1 = getelementptr i32, i32* %p0, i32 1
+ %v0 = load i32, i32* %p0
+ %v1 = load i32, i32* %p0.1
+ %p1 = getelementptr i32, i32* %p0, i32 -2
+ call void @extfunc2(i32* %p1, i32 %v0, i32 %v1)
+ ret void
+}
+
+; CHECK-LABEL: ldrd_postupdate_inc:
+; CHECK: ldrd r1, r2, [r0], #8
+; CHECK-NEXT: bl{{x?}} _extfunc
+define void @ldrd_postupdate_inc(i32* %p0) {
+ %p0.1 = getelementptr i32, i32* %p0, i32 1
+ %v0 = load i32, i32* %p0
+ %v1 = load i32, i32* %p0.1
+ %p1 = getelementptr i32, i32* %p0, i32 2
+ call void @extfunc2(i32* %p1, i32 %v0, i32 %v1)
+ ret void
+}
+
+; CHECK-LABEL: strd_postupdate_dec:
+; CHECK: strd r1, r2, [r0], #-8
+; CHECK-NEXT: bx lr
+define i32* @strd_postupdate_dec(i32* %p0, i32 %v0, i32 %v1) {
+ %p0.1 = getelementptr i32, i32* %p0, i32 1
+ store i32 %v0, i32* %p0
+ store i32 %v1, i32* %p0.1
+ %p1 = getelementptr i32, i32* %p0, i32 -2
+ ret i32* %p1
+}
+
+; CHECK-LABEL: strd_postupdate_inc:
+; CHECK: strd r1, r2, [r0], #8
+; CHECK-NEXT: bx lr
+define i32* @strd_postupdate_inc(i32* %p0, i32 %v0, i32 %v1) {
+ %p0.1 = getelementptr i32, i32* %p0, i32 1
+ store i32 %v0, i32* %p0
+ store i32 %v1, i32* %p0.1
+ %p1 = getelementptr i32, i32* %p0, i32 2
+ ret i32* %p1
+}
+
declare void @llvm.lifetime.start(i64, i8* nocapture) nounwind
declare void @llvm.lifetime.end(i64, i8* nocapture) nounwind
diff --git a/test/CodeGen/ARM/memset-inline.ll b/test/CodeGen/ARM/memset-inline.ll
index 191db1e20a25..f6f8d5623509 100644
--- a/test/CodeGen/ARM/memset-inline.ll
+++ b/test/CodeGen/ARM/memset-inline.ll
@@ -4,8 +4,7 @@ define void @t1(i8* nocapture %c) nounwind optsize {
entry:
; CHECK-LABEL: t1:
; CHECK: movs r1, #0
-; CHECK: str r1, [r0]
-; CHECK: str r1, [r0, #4]
+; CHECK: strd r1, r1, [r0]
; CHECK: str r1, [r0, #8]
call void @llvm.memset.p0i8.i64(i8* %c, i8 0, i64 12, i32 8, i1 false)
ret void
diff --git a/test/CodeGen/ARM/nest-register.ll b/test/CodeGen/ARM/nest-register.ll
new file mode 100644
index 000000000000..6b8c3dc47db1
--- /dev/null
+++ b/test/CodeGen/ARM/nest-register.ll
@@ -0,0 +1,21 @@
+; RUN: llc -mtriple=arm-eabi %s -o - | FileCheck %s
+
+; Tests that the 'nest' parameter attribute causes the relevant parameter to be
+; passed in the right register.
+
+define i8* @nest_receiver(i8* nest %arg) nounwind {
+; CHECK-LABEL: nest_receiver:
+; CHECK: @ BB#0:
+; CHECK-NEXT: mov r0, r12
+; CHECK-NEXT: mov pc, lr
+ ret i8* %arg
+}
+
+define i8* @nest_caller(i8* %arg) nounwind {
+; CHECK-LABEL: nest_caller:
+; CHECK: mov r12, r0
+; CHECK-NEXT: bl nest_receiver
+; CHECK: mov pc, lr
+ %result = call i8* @nest_receiver(i8* nest %arg)
+ ret i8* %result
+}
diff --git a/test/CodeGen/ARM/subtarget-features-long-calls.ll b/test/CodeGen/ARM/subtarget-features-long-calls.ll
new file mode 100644
index 000000000000..430ae3d13307
--- /dev/null
+++ b/test/CodeGen/ARM/subtarget-features-long-calls.ll
@@ -0,0 +1,49 @@
+; RUN: llc -march thumb -mcpu=cortex-a8 -relocation-model=static %s -o - | FileCheck -check-prefix=NO-OPTION %s
+; RUN: llc -march thumb -mcpu=cortex-a8 -relocation-model=static %s -o - -mattr=+long-calls | FileCheck -check-prefix=LONGCALL %s
+; RUN: llc -march thumb -mcpu=cortex-a8 -relocation-model=static %s -o - -mattr=-long-calls | FileCheck -check-prefix=NO-LONGCALL %s
+; RUN: llc -march thumb -mcpu=cortex-a8 -relocation-model=static %s -o - -O0 | FileCheck -check-prefix=NO-OPTION %s
+; RUN: llc -march thumb -mcpu=cortex-a8 -relocation-model=static %s -o - -O0 -mattr=+long-calls | FileCheck -check-prefix=LONGCALL %s
+; RUN: llc -march thumb -mcpu=cortex-a8 -relocation-model=static %s -o - -O0 -mattr=-long-calls | FileCheck -check-prefix=NO-LONGCALL %s
+
+; NO-OPTION-LABEL: {{_?}}caller0
+; NO-OPTION: ldr [[R0:r[0-9]+]], [[L0:.*]]
+; NO-OPTION: blx [[R0]]
+; NO-OPTION: [[L0]]:
+; NO-OPTION: .long {{_?}}callee0
+
+; LONGCALL-LABEL: {{_?}}caller0
+; LONGCALL: ldr [[R0:r[0-9]+]], [[L0:.*]]
+; LONGCALL: blx [[R0]]
+; LONGCALL: [[L0]]:
+; LONGCALL: .long {{_?}}callee0
+
+; NO-LONGCALL-LABEL: {{_?}}caller0
+; NO-LONGCALL: bl {{_?}}callee0
+
+define i32 @caller0() #0 {
+entry:
+ tail call void @callee0()
+ ret i32 0
+}
+
+; NO-OPTION-LABEL: {{_?}}caller1
+; NO-OPTION: bl {{_?}}callee0
+
+; LONGCALL-LABEL: {{_?}}caller1
+; LONGCALL: ldr [[R0:r[0-9]+]], [[L0:.*]]
+; LONGCALL: blx [[R0]]
+; LONGCALL: [[L0]]:
+; LONGCALL: .long {{_?}}callee0
+
+; NO-LONGCALL-LABEL: {{_?}}caller1
+; NO-LONGCALL: bl {{_?}}callee0
+
+define i32 @caller1() {
+entry:
+ tail call void @callee0()
+ ret i32 0
+}
+
+declare void @callee0()
+
+attributes #0 = { "target-features"="+long-calls" }
diff --git a/test/CodeGen/ARM/wrong-t2stmia-size-opt.ll b/test/CodeGen/ARM/wrong-t2stmia-size-opt.ll
index 96c5fb8961ef..fe335df7a1ad 100644
--- a/test/CodeGen/ARM/wrong-t2stmia-size-opt.ll
+++ b/test/CodeGen/ARM/wrong-t2stmia-size-opt.ll
@@ -5,16 +5,20 @@ target triple = "thumbv7--linux-gnueabi"
declare i8* @llvm.returnaddress(i32)
-define i32* @wrong-t2stmia-size-reduction(i32* %addr, i32 %val0) minsize {
+define i32* @wrong-t2stmia-size-reduction(i32* %addr, i32 %val0, i32 %val1) minsize {
store i32 %val0, i32* %addr
%addr1 = getelementptr i32, i32* %addr, i32 1
+ %addr2 = getelementptr i32, i32* %addr, i32 2
%lr = call i8* @llvm.returnaddress(i32 0)
%lr32 = ptrtoint i8* %lr to i32
- store i32 %lr32, i32* %addr1
- %addr2 = getelementptr i32, i32* %addr1, i32 1
- ret i32* %addr2
+ store i32 %val1, i32* %addr1
+ store i32 %lr32, i32* %addr2
+
+ %addr3 = getelementptr i32, i32* %addr, i32 3
+ ret i32* %addr3
}
-; Check that stm writes two registers. The bug caused one of registers (LR,
+; Check that stm writes three registers. The bug caused one of the registers (LR,
; which is invalid for the Thumb1 form of the STMIA instruction) to be dropped.
-; CHECK: stm{{[^,]*}}, {{{.*,.*}}}
+; CHECK-LABEL: wrong-t2stmia-size-reduction:
+; CHECK: stm{{[^,]*}}, {{{.*,.*,.*}}}
diff --git a/test/CodeGen/Generic/run-pass.ll b/test/CodeGen/Generic/run-pass.ll
new file mode 100644
index 000000000000..55d62ec18648
--- /dev/null
+++ b/test/CodeGen/Generic/run-pass.ll
@@ -0,0 +1,7 @@
+; RUN: llc < %s -debug-pass=Structure -run-pass=gc-lowering -o /dev/null 2>&1 | FileCheck %s
+
+; CHECK: -gc-lowering
+; CHECK: FunctionPass Manager
+; CHECK-NEXT: Lower Garbage Collection Instructions
+; CHECK-NEXT: Machine Function Analysis
+; CHECK-NEXT: MIR Printing Pass
diff --git a/test/CodeGen/Hexagon/Atomics.ll b/test/CodeGen/Hexagon/Atomics.ll
new file mode 100644
index 000000000000..bbac5d73c868
--- /dev/null
+++ b/test/CodeGen/Hexagon/Atomics.ll
@@ -0,0 +1,71 @@
+; RUN: llc < %s -march=hexagon
+
+@si = common global i32 0, align 4
+@sll = common global i64 0, align 8
+
+define void @test_op_ignore() nounwind {
+entry:
+ %t00 = atomicrmw add i32* @si, i32 1 monotonic
+ %t01 = atomicrmw add i64* @sll, i64 1 monotonic
+ %t10 = atomicrmw sub i32* @si, i32 1 monotonic
+ %t11 = atomicrmw sub i64* @sll, i64 1 monotonic
+ %t20 = atomicrmw or i32* @si, i32 1 monotonic
+ %t21 = atomicrmw or i64* @sll, i64 1 monotonic
+ %t30 = atomicrmw xor i32* @si, i32 1 monotonic
+ %t31 = atomicrmw xor i64* @sll, i64 1 monotonic
+ %t40 = atomicrmw and i32* @si, i32 1 monotonic
+ %t41 = atomicrmw and i64* @sll, i64 1 monotonic
+ %t50 = atomicrmw nand i32* @si, i32 1 monotonic
+ %t51 = atomicrmw nand i64* @sll, i64 1 monotonic
+ br label %return
+
+return: ; preds = %entry
+ ret void
+}
+
+define void @test_fetch_and_op() nounwind {
+entry:
+ %t00 = atomicrmw add i32* @si, i32 11 monotonic
+ store i32 %t00, i32* @si, align 4
+ %t01 = atomicrmw add i64* @sll, i64 11 monotonic
+ store i64 %t01, i64* @sll, align 8
+ %t10 = atomicrmw sub i32* @si, i32 11 monotonic
+ store i32 %t10, i32* @si, align 4
+ %t11 = atomicrmw sub i64* @sll, i64 11 monotonic
+ store i64 %t11, i64* @sll, align 8
+ %t20 = atomicrmw or i32* @si, i32 11 monotonic
+ store i32 %t20, i32* @si, align 4
+ %t21 = atomicrmw or i64* @sll, i64 11 monotonic
+ store i64 %t21, i64* @sll, align 8
+ %t30 = atomicrmw xor i32* @si, i32 11 monotonic
+ store i32 %t30, i32* @si, align 4
+ %t31 = atomicrmw xor i64* @sll, i64 11 monotonic
+ store i64 %t31, i64* @sll, align 8
+ %t40 = atomicrmw and i32* @si, i32 11 monotonic
+ store i32 %t40, i32* @si, align 4
+ %t41 = atomicrmw and i64* @sll, i64 11 monotonic
+ store i64 %t41, i64* @sll, align 8
+ %t50 = atomicrmw nand i32* @si, i32 11 monotonic
+ store i32 %t50, i32* @si, align 4
+ %t51 = atomicrmw nand i64* @sll, i64 11 monotonic
+ store i64 %t51, i64* @sll, align 8
+ br label %return
+
+return: ; preds = %entry
+ ret void
+}
+
+define void @test_lock() nounwind {
+entry:
+ %t00 = atomicrmw xchg i32* @si, i32 1 monotonic
+ store i32 %t00, i32* @si, align 4
+ %t01 = atomicrmw xchg i64* @sll, i64 1 monotonic
+ store i64 %t01, i64* @sll, align 8
+ fence seq_cst
+ store volatile i32 0, i32* @si, align 4
+ store volatile i64 0, i64* @sll, align 8
+ br label %return
+
+return: ; preds = %entry
+ ret void
+}
diff --git a/test/CodeGen/Hexagon/common-gep-basic.ll b/test/CodeGen/Hexagon/common-gep-basic.ll
new file mode 100644
index 000000000000..317bf868d0f8
--- /dev/null
+++ b/test/CodeGen/Hexagon/common-gep-basic.ll
@@ -0,0 +1,37 @@
+; RUN: llc -O2 -march=hexagon < %s | FileCheck %s
+; CHECK: mpyi
+; CHECK-NOT: mpyi
+; The mpyis from the two GEPs should be commoned out.
+
+target datalayout = "e-m:e-p:32:32-i64:64-a:0-v32:32-n16:32"
+target triple = "hexagon-unknown--elf"
+
+%struct.s_t = type { %struct.anon, i32 }
+%struct.anon = type { i32, [5 x i32] }
+
+@g = common global [100 x %struct.s_t] zeroinitializer, align 8
+
+; Function Attrs: nounwind
+define void @foo(i32 %x) #0 {
+entry:
+ %cmp = icmp slt i32 %x, 90
+ br i1 %cmp, label %if.then, label %if.else
+
+if.then: ; preds = %entry
+ %arrayidx1 = getelementptr inbounds [100 x %struct.s_t], [100 x %struct.s_t]* @g, i32 0, i32 %x, i32 0, i32 1, i32 2
+ tail call void @bar(i32* %arrayidx1) #0
+ br label %if.end
+
+if.else: ; preds = %entry
+ %arrayidx5 = getelementptr inbounds [100 x %struct.s_t], [100 x %struct.s_t]* @g, i32 0, i32 %x, i32 0, i32 1, i32 3
+ tail call void @bar(i32* %arrayidx5) #0
+ br label %if.end
+
+if.end: ; preds = %if.else, %if.then
+ ret void
+}
+
+declare void @bar(i32*) #0
+
+attributes #0 = { nounwind }
+
diff --git a/test/CodeGen/Hexagon/common-gep-icm.ll b/test/CodeGen/Hexagon/common-gep-icm.ll
new file mode 100644
index 000000000000..bc5719dfe1d0
--- /dev/null
+++ b/test/CodeGen/Hexagon/common-gep-icm.ll
@@ -0,0 +1,76 @@
+; RUN: llc -O2 -march=hexagon < %s | FileCheck %s
+; Rely on the comments generated by llc. Make sure there are no add/addasl
+; instructions in while.body13 (before the loads).
+; CHECK: while.body13
+; CHECK-NOT: add
+; CHECK: memw
+
+%struct.1 = type { i32, i32 }
+%struct.2 = type { [24 x i32], [24 x i32], [24 x i32], [24 x i32], [24 x i32], [24 x i32], [24 x i32], [24 x i32], [24 x i32], [24 x i32], [24 x i32], [24 x i32], [24 x i32], [24 x i32], [24 x i32], [24 x i32], [24 x i32], [3 x i32], [24 x i32], [8 x %struct.1], [5 x i32] }
+
+@A1 = global i64 zeroinitializer
+@A2 = global i64 zeroinitializer
+@B1 = global i32 zeroinitializer
+@B2 = global i32 zeroinitializer
+@C1 = global i8 zeroinitializer
+
+declare i32 @llvm.hexagon.S2.cl0(i32) nounwind readnone
+declare i32 @llvm.hexagon.S2.setbit.r(i32, i32) nounwind readnone
+declare i64 @llvm.hexagon.M2.vmpy2s.s0(i32, i32) nounwind readnone
+declare i64 @llvm.hexagon.M2.vmac2s.s0(i64, i32, i32) nounwind readnone
+declare i64 @llvm.hexagon.A2.vaddws(i64, i64) nounwind readnone
+declare i64 @llvm.hexagon.A2.vsubws(i64, i64) nounwind readnone
+declare i32 @llvm.hexagon.A4.modwrapu(i32, i32) nounwind readnone
+
+define void @foo(i32 %n) nounwind {
+entry:
+ br label %while.body
+
+while.body:
+ %count = phi i32 [ 0, %entry ], [ %next, %while.end ]
+ %idx = phi i32 [ 0, %entry ], [ %15, %while.end ]
+ %0 = load i32, i32* @B1, align 4
+ %1 = load i32, i32* @B2, align 8
+ %2 = and i32 %1, %0
+ br label %while.body13
+
+while.body13: ; preds = %while.body, %if.end
+ %3 = phi i64 [ %13, %if.end ], [ 0, %while.body ]
+ %4 = phi i64 [ %14, %if.end ], [ 0, %while.body ]
+ %m = phi i32 [ %6, %if.end ], [ %2, %while.body ]
+ %5 = tail call i32 @llvm.hexagon.S2.cl0(i32 %m)
+ %6 = tail call i32 @llvm.hexagon.S2.setbit.r(i32 %m, i32 %5)
+ %cgep85 = getelementptr [10 x %struct.2], [10 x %struct.2]* inttoptr (i32 -121502345 to [10 x %struct.2]*), i32 0, i32 %idx
+ %cgep90 = getelementptr %struct.2, %struct.2* %cgep85, i32 0, i32 12, i32 %5
+ %7 = load i32, i32* %cgep90, align 4
+ %8 = tail call i64 @llvm.hexagon.M2.vmpy2s.s0(i32 %7, i32 %7)
+ %cgep91 = getelementptr %struct.2, %struct.2* %cgep85, i32 0, i32 13, i32 %5
+ %9 = load i32, i32* %cgep91, align 4
+ %10 = tail call i64 @llvm.hexagon.M2.vmac2s.s0(i64 %8, i32 %9, i32 %9)
+ %11 = load i8, i8* @C1, align 1
+ %and24 = and i8 %11, 1
+ %cmp = icmp eq i8 %and24, 0
+ br i1 %cmp, label %if.then, label %if.end
+
+if.then: ; preds = %while.body13
+ %12 = tail call i64 @llvm.hexagon.A2.vaddws(i64 %3, i64 %10)
+ store i64 %12, i64* @A1, align 8
+ br label %if.end
+
+if.end: ; preds = %if.then, %while.body13
+ %13 = phi i64 [ %12, %if.then ], [ %3, %while.body13 ]
+ %14 = tail call i64 @llvm.hexagon.A2.vsubws(i64 %4, i64 %10)
+ %tobool12 = icmp eq i32 %6, 0
+ br i1 %tobool12, label %while.end, label %while.body13
+
+while.end:
+ %add40 = add i32 %idx, 1
+ %15 = tail call i32 @llvm.hexagon.A4.modwrapu(i32 %add40, i32 10) nounwind
+ %next = add i32 %count, 1
+ %cc = icmp eq i32 %next, %n
+ br i1 %cc, label %end, label %while.body
+
+end:
+ store i64 %10, i64* @A2, align 8
+ ret void
+}
diff --git a/test/CodeGen/Hexagon/extract-basic.ll b/test/CodeGen/Hexagon/extract-basic.ll
new file mode 100644
index 000000000000..c75125cedd35
--- /dev/null
+++ b/test/CodeGen/Hexagon/extract-basic.ll
@@ -0,0 +1,76 @@
+; RUN: llc -O2 -march=hexagon < %s | FileCheck %s
+
+; CHECK-DAG: extractu(r{{[0-9]*}}, #3, #4)
+; CHECK-DAG: extractu(r{{[0-9]*}}, #8, #7)
+; CHECK-DAG: extractu(r{{[0-9]*}}, #8, #16)
+
+; C source:
+; typedef struct {
+; unsigned x1:3;
+; unsigned x2:7;
+; unsigned x3:8;
+; unsigned x4:12;
+; unsigned x5:2;
+; } structx_t;
+;
+; typedef struct {
+; unsigned y1:4;
+; unsigned y2:3;
+; unsigned y3:9;
+; unsigned y4:8;
+; unsigned y5:8;
+; } structy_t;
+;
+; void foo(structx_t *px, structy_t *py) {
+; px->x1 = py->y1;
+; px->x2 = py->y2;
+; px->x3 = py->y3;
+; px->x4 = py->y4;
+; px->x5 = py->y5;
+; }
+
+target datalayout = "e-p:32:32:32-i64:64:64-i32:32:32-i16:16:16-i1:32:32-f64:64:64-f32:32:32-v64:64:64-v32:32:32-a0:0-n16:32"
+target triple = "hexagon"
+
+%struct.structx_t = type { i8, i8, i8, i8 }
+%struct.structy_t = type { i8, i8, i8, i8 }
+
+define void @foo(%struct.structx_t* nocapture %px, %struct.structy_t* nocapture %py) nounwind {
+entry:
+ %0 = bitcast %struct.structy_t* %py to i32*
+ %1 = load i32, i32* %0, align 4
+ %bf.value = and i32 %1, 7
+ %2 = bitcast %struct.structx_t* %px to i32*
+ %3 = load i32, i32* %2, align 4
+ %4 = and i32 %3, -8
+ %5 = or i32 %4, %bf.value
+ store i32 %5, i32* %2, align 4
+ %6 = load i32, i32* %0, align 4
+ %7 = lshr i32 %6, 4
+ %bf.clear1 = shl nuw nsw i32 %7, 3
+ %8 = and i32 %bf.clear1, 56
+ %9 = and i32 %5, -1017
+ %10 = or i32 %8, %9
+ store i32 %10, i32* %2, align 4
+ %11 = load i32, i32* %0, align 4
+ %12 = lshr i32 %11, 7
+ %bf.value4 = shl i32 %12, 10
+ %13 = and i32 %bf.value4, 261120
+ %14 = and i32 %10, -262081
+ %15 = or i32 %14, %13
+ store i32 %15, i32* %2, align 4
+ %16 = load i32, i32* %0, align 4
+ %17 = lshr i32 %16, 16
+ %bf.clear5 = shl i32 %17, 18
+ %18 = and i32 %bf.clear5, 66846720
+ %19 = and i32 %15, -1073480641
+ %20 = or i32 %19, %18
+ store i32 %20, i32* %2, align 4
+ %21 = load i32, i32* %0, align 4
+ %22 = lshr i32 %21, 24
+ %23 = shl i32 %22, 30
+ %24 = and i32 %20, 67107903
+ %25 = or i32 %24, %23
+ store i32 %25, i32* %2, align 4
+ ret void
+}
diff --git a/test/CodeGen/Hexagon/fusedandshift.ll b/test/CodeGen/Hexagon/fusedandshift.ll
index 59a1e1d84fcc..414574aec401 100644
--- a/test/CodeGen/Hexagon/fusedandshift.ll
+++ b/test/CodeGen/Hexagon/fusedandshift.ll
@@ -1,5 +1,6 @@
-; RUN: llc -march=hexagon -mcpu=hexagonv4 < %s | FileCheck %s
+; RUN: llc -march=hexagon -hexagon-extract=0 < %s | FileCheck %s
; Check that we generate a fused logical-and-with-shift instruction.
+; Disable "extract" generation, since it may eliminate the and/lsr.
; CHECK: r{{[0-9]+}} = and(#15, lsr(r{{[0-9]+}}, #{{[0-9]+}})
diff --git a/test/CodeGen/Hexagon/insert-basic.ll b/test/CodeGen/Hexagon/insert-basic.ll
new file mode 100644
index 000000000000..e941c063d9ed
--- /dev/null
+++ b/test/CodeGen/Hexagon/insert-basic.ll
@@ -0,0 +1,66 @@
+; RUN: llc -O2 -march=hexagon < %s | FileCheck %s
+; CHECK-DAG: insert(r{{[0-9]*}}, #17, #0)
+; CHECK-DAG: insert(r{{[0-9]*}}, #18, #0)
+; CHECK-DAG: insert(r{{[0-9]*}}, #22, #0)
+; CHECK-DAG: insert(r{{[0-9]*}}, #12, #0)
+
+; C source:
+; typedef struct {
+; unsigned x1:23;
+; unsigned x2:17;
+; unsigned x3:18;
+; unsigned x4:22;
+; unsigned x5:12;
+; } structx_t;
+;
+; void foo(structx_t *px, int y1, int y2, int y3, int y4, int y5) {
+; px->x1 = y1;
+; px->x2 = y2;
+; px->x3 = y3;
+; px->x4 = y4;
+; px->x5 = y5;
+; }
+
+target datalayout = "e-p:32:32:32-i64:64:64-i32:32:32-i16:16:16-i1:32:32-f64:64:64-f32:32:32-v64:64:64-v32:32:32-a0:0-n16:32"
+target triple = "hexagon"
+
+%struct.structx_t = type { [3 x i8], i8, [3 x i8], i8, [3 x i8], i8, [3 x i8], i8, [2 x i8], [2 x i8] }
+
+define void @foo(%struct.structx_t* nocapture %px, i32 %y1, i32 %y2, i32 %y3, i32 %y4, i32 %y5) nounwind {
+entry:
+ %bf.value = and i32 %y1, 8388607
+ %0 = bitcast %struct.structx_t* %px to i32*
+ %1 = load i32, i32* %0, align 4
+ %2 = and i32 %1, -8388608
+ %3 = or i32 %2, %bf.value
+ store i32 %3, i32* %0, align 4
+ %bf.value1 = and i32 %y2, 131071
+ %bf.field.offs = getelementptr %struct.structx_t, %struct.structx_t* %px, i32 0, i32 0, i32 4
+ %4 = bitcast i8* %bf.field.offs to i32*
+ %5 = load i32, i32* %4, align 4
+ %6 = and i32 %5, -131072
+ %7 = or i32 %6, %bf.value1
+ store i32 %7, i32* %4, align 4
+ %bf.value2 = and i32 %y3, 262143
+ %bf.field.offs3 = getelementptr %struct.structx_t, %struct.structx_t* %px, i32 0, i32 0, i32 8
+ %8 = bitcast i8* %bf.field.offs3 to i32*
+ %9 = load i32, i32* %8, align 4
+ %10 = and i32 %9, -262144
+ %11 = or i32 %10, %bf.value2
+ store i32 %11, i32* %8, align 4
+ %bf.value4 = and i32 %y4, 4194303
+ %bf.field.offs5 = getelementptr %struct.structx_t, %struct.structx_t* %px, i32 0, i32 0, i32 12
+ %12 = bitcast i8* %bf.field.offs5 to i32*
+ %13 = load i32, i32* %12, align 4
+ %14 = and i32 %13, -4194304
+ %15 = or i32 %14, %bf.value4
+ store i32 %15, i32* %12, align 4
+ %bf.value6 = and i32 %y5, 4095
+ %bf.field.offs7 = getelementptr %struct.structx_t, %struct.structx_t* %px, i32 0, i32 0, i32 16
+ %16 = bitcast i8* %bf.field.offs7 to i32*
+ %17 = load i32, i32* %16, align 4
+ %18 = and i32 %17, -4096
+ %19 = or i32 %18, %bf.value6
+ store i32 %19, i32* %16, align 4
+ ret void
+}
diff --git a/test/CodeGen/Hexagon/predicate-logical.ll b/test/CodeGen/Hexagon/predicate-logical.ll
new file mode 100644
index 000000000000..be2bcb03d6a1
--- /dev/null
+++ b/test/CodeGen/Hexagon/predicate-logical.ll
@@ -0,0 +1,30 @@
+; RUN: llc -O2 -march=hexagon < %s | FileCheck %s
+; CHECK: p{{[0-9]}} = or(p{{[0-9]}}, and(p{{[0-9]}}, p{{[0-9]}}))
+
+target triple = "hexagon"
+
+define i32 @foo(i64* nocapture %p, i64* nocapture %q) nounwind readonly {
+entry:
+ %incdec.ptr = getelementptr inbounds i64, i64* %p, i32 1
+ %0 = load i64, i64* %p, align 8, !tbaa !0
+ %incdec.ptr1 = getelementptr inbounds i64, i64* %q, i32 1
+ %1 = load i64, i64* %q, align 8, !tbaa !0
+ %2 = tail call i32 @llvm.hexagon.A2.vcmpwgtu(i64 %0, i64 %1)
+ %incdec.ptr2 = getelementptr inbounds i64, i64* %p, i32 2
+ %3 = load i64, i64* %incdec.ptr, align 8, !tbaa !0
+ %incdec.ptr3 = getelementptr inbounds i64, i64* %q, i32 2
+ %4 = load i64, i64* %incdec.ptr1, align 8, !tbaa !0
+ %5 = tail call i32 @llvm.hexagon.A2.vcmpwgtu(i64 %3, i64 %4)
+ %6 = load i64, i64* %incdec.ptr2, align 8, !tbaa !0
+ %7 = load i64, i64* %incdec.ptr3, align 8, !tbaa !0
+ %8 = tail call i32 @llvm.hexagon.A2.vcmpwgtu(i64 %6, i64 %7)
+ %and = and i32 %5, %2
+ %or = or i32 %8, %and
+ ret i32 %or
+}
+
+declare i32 @llvm.hexagon.A2.vcmpwgtu(i64, i64) nounwind readnone
+
+!0 = !{!"long long", !1}
+!1 = !{!"omnipotent char", !2}
+!2 = !{!"Simple C/C++ TBAA"}
diff --git a/test/CodeGen/Hexagon/predicate-rcmp.ll b/test/CodeGen/Hexagon/predicate-rcmp.ll
new file mode 100644
index 000000000000..45daa88d7161
--- /dev/null
+++ b/test/CodeGen/Hexagon/predicate-rcmp.ll
@@ -0,0 +1,19 @@
+; RUN: llc -O2 -march=hexagon < %s | FileCheck %s
+; CHECK: cmp.eq(r{{[0-9]+}}, #0)
+; Check that the result of the builtin is not stored directly, i.e. that
+; there is an instruction that converts it to {0,1} from {0,-1}. Right now
+; the instruction is "r4 = !cmp.eq(r0, #0)".
+
+@var = common global i32 0, align 4
+declare i32 @llvm.hexagon.C2.cmpgtup(i64,i64) nounwind
+
+define void @foo(i64 %a98, i64 %a100) nounwind {
+entry:
+ %a101 = tail call i32 @llvm.hexagon.C2.cmpgtup(i64 %a98, i64 %a100)
+ %tobool250 = icmp eq i32 %a101, 0
+ %a102 = zext i1 %tobool250 to i8
+ %detected.0 = xor i8 %a102, 1
+ %conv253 = zext i8 %detected.0 to i32
+ store i32 %conv253, i32* @var, align 4
+ ret void
+}
diff --git a/test/CodeGen/MIR/X86/basic-block-liveins.mir b/test/CodeGen/MIR/X86/basic-block-liveins.mir
new file mode 100644
index 000000000000..d749a0524422
--- /dev/null
+++ b/test/CodeGen/MIR/X86/basic-block-liveins.mir
@@ -0,0 +1,25 @@
+# RUN: llc -march=x86-64 -start-after branch-folder -stop-after branch-folder -o /dev/null %s | FileCheck %s
+# This test ensures that the MIR parser parses basic block liveins correctly.
+
+--- |
+
+ define i32 @test(i32 %a, i32 %b) {
+ body:
+ %c = add i32 %a, %b
+ ret i32 %c
+ }
+
+...
+---
+name: test
+body:
+ # CHECK: name: body
+ # CHECK: liveins: [ '%edi', '%esi' ]
+ # CHECK-NEXT: instructions:
+ - id: 0
+ name: body
+ liveins: [ '%edi', '%esi' ]
+ instructions:
+ - '%eax = LEA64_32r killed %rdi, 1, killed %rsi, 0, _'
+ - 'RETQ %eax'
+...
diff --git a/test/CodeGen/MIR/X86/dead-register-flag.mir b/test/CodeGen/MIR/X86/dead-register-flag.mir
new file mode 100644
index 000000000000..988b554659cb
--- /dev/null
+++ b/test/CodeGen/MIR/X86/dead-register-flag.mir
@@ -0,0 +1,26 @@
+# RUN: llc -march=x86-64 -start-after branch-folder -stop-after branch-folder -o /dev/null %s | FileCheck %s
+# This test ensures that the MIR parser parses the 'dead' register flags
+# correctly.
+
+--- |
+
+ define i32 @foo(i32 %a) #0 {
+ body:
+ %c = mul i32 %a, 11
+ ret i32 %c
+ }
+
+ attributes #0 = { "no-frame-pointer-elim"="false" }
+
+...
+---
+name: foo
+body:
+ # CHECK: name: body
+ - id: 0
+ name: body
+ instructions:
+ # CHECK: - '%eax = IMUL32rri8 %edi, 11, implicit-def dead %eflags'
+ - '%eax = IMUL32rri8 %edi, 11, implicit-def dead %eflags'
+ - 'RETQ %eax'
+...
diff --git a/test/CodeGen/MIR/X86/expected-different-implicit-operand.mir b/test/CodeGen/MIR/X86/expected-different-implicit-operand.mir
new file mode 100644
index 000000000000..c5f5aaca34e0
--- /dev/null
+++ b/test/CodeGen/MIR/X86/expected-different-implicit-operand.mir
@@ -0,0 +1,38 @@
+# RUN: not llc -march=x86-64 -start-after branch-folder -stop-after branch-folder -o /dev/null %s 2>&1 | FileCheck %s
+
+--- |
+
+ define i32 @foo(i32* %p) {
+ entry:
+ %a = load i32, i32* %p
+ %0 = icmp sle i32 %a, 10
+ br i1 %0, label %less, label %exit
+
+ less:
+ ret i32 0
+
+ exit:
+ ret i32 %a
+ }
+
+
+...
+---
+name: foo
+body:
+ - id: 0
+ name: entry
+ instructions:
+ - '%eax = MOV32rm %rdi, 1, _, 0, _'
+ - 'CMP32ri8 %eax, 10, implicit-def %eflags'
+# CHECK: [[@LINE+1]]:26: expected an implicit register operand 'implicit %eflags'
+ - 'JG_1 %bb.2.exit, implicit %eax'
+ - id: 1
+ name: less
+ instructions:
+ - '%eax = MOV32r0 implicit-def %eflags'
+ - id: 2
+ name: exit
+ instructions:
+ - 'RETQ %eax'
+...
diff --git a/test/CodeGen/MIR/X86/expected-different-implicit-register-flag.mir b/test/CodeGen/MIR/X86/expected-different-implicit-register-flag.mir
new file mode 100644
index 000000000000..ecf3a122bf66
--- /dev/null
+++ b/test/CodeGen/MIR/X86/expected-different-implicit-register-flag.mir
@@ -0,0 +1,38 @@
+# RUN: not llc -march=x86-64 -start-after branch-folder -stop-after branch-folder -o /dev/null %s 2>&1 | FileCheck %s
+
+--- |
+
+ define i32 @foo(i32* %p) {
+ entry:
+ %a = load i32, i32* %p
+ %0 = icmp sle i32 %a, 10
+ br i1 %0, label %less, label %exit
+
+ less:
+ ret i32 0
+
+ exit:
+ ret i32 %a
+ }
+
+
+...
+---
+name: foo
+body:
+ - id: 0
+ name: entry
+ instructions:
+ - '%eax = MOV32rm %rdi, 1, _, 0, _'
+ - 'CMP32ri8 %eax, 10, implicit-def %eflags'
+# CHECK: [[@LINE+1]]:26: expected an implicit register operand 'implicit %eflags'
+ - 'JG_1 %bb.2.exit, implicit-def %eflags'
+ - id: 1
+ name: less
+ instructions:
+ - '%eax = MOV32r0 implicit-def %eflags'
+ - id: 2
+ name: exit
+ instructions:
+ - 'RETQ %eax'
+...
diff --git a/test/CodeGen/MIR/X86/expected-named-register-livein.mir b/test/CodeGen/MIR/X86/expected-named-register-livein.mir
new file mode 100644
index 000000000000..1fbe881c8c70
--- /dev/null
+++ b/test/CodeGen/MIR/X86/expected-named-register-livein.mir
@@ -0,0 +1,21 @@
+# RUN: not llc -march=x86-64 -start-after branch-folder -stop-after branch-folder -o /dev/null %s 2>&1 | FileCheck %s
+
+--- |
+
+ define i32 @test(i32 %a) {
+ body:
+ ret i32 %a
+ }
+
+...
+---
+name: test
+body:
+ - id: 0
+ name: body
+ # CHECK: [[@LINE+1]]:21: expected a named register
+ liveins: [ '%0' ]
+ instructions:
+ - '%eax = COPY %edi'
+ - 'RETQ %eax'
+...
diff --git a/test/CodeGen/MIR/X86/expected-number-after-bb.mir b/test/CodeGen/MIR/X86/expected-number-after-bb.mir
index f4248a76be46..5343a847fbb9 100644
--- a/test/CodeGen/MIR/X86/expected-number-after-bb.mir
+++ b/test/CodeGen/MIR/X86/expected-number-after-bb.mir
@@ -23,13 +23,13 @@ body:
name: entry
instructions:
- '%eax = MOV32rm %rdi, 1, _, 0, _'
- - 'CMP32ri8 %eax, 10'
+ - 'CMP32ri8 %eax, 10, implicit-def %eflags'
# CHECK: [[@LINE+1]]:18: expected a number after '%bb.'
- - 'JG_1 %bb.nah'
+ - 'JG_1 %bb.nah, implicit %eflags'
- id: 1
name: yes
instructions:
- - '%eax = MOV32r0'
+ - '%eax = MOV32r0 implicit-def %eflags'
- id: 2
name: nah
instructions:
diff --git a/test/CodeGen/MIR/X86/expected-register-after-flags.mir b/test/CodeGen/MIR/X86/expected-register-after-flags.mir
new file mode 100644
index 000000000000..111f5496a378
--- /dev/null
+++ b/test/CodeGen/MIR/X86/expected-register-after-flags.mir
@@ -0,0 +1,22 @@
+# RUN: not llc -march=x86-64 -start-after branch-folder -stop-after branch-folder -o /dev/null %s 2>&1 | FileCheck %s
+# This test ensures that an error is reported when a register operand doesn't
+# follow register flags.
+
+--- |
+
+ define i32 @foo() {
+ entry:
+ ret i32 0
+ }
+
+...
+---
+name: foo
+body:
+ - id: 0
+ name: entry
+ instructions:
+ # CHECK: [[@LINE+1]]:37: expected a register after register flags
+ - '%eax = MOV32r0 implicit-def 2'
+ - 'RETQ %eax'
+...
diff --git a/test/CodeGen/MIR/X86/expected-subregister-after-colon.mir b/test/CodeGen/MIR/X86/expected-subregister-after-colon.mir
new file mode 100644
index 000000000000..c891a115a180
--- /dev/null
+++ b/test/CodeGen/MIR/X86/expected-subregister-after-colon.mir
@@ -0,0 +1,29 @@
+# RUN: not llc -march=x86-64 -start-after machine-sink -stop-after machine-sink -o /dev/null %s 2>&1 | FileCheck %s
+
+--- |
+
+ define zeroext i1 @t(i1 %c) {
+ entry:
+ ret i1 %c
+ }
+
+...
+---
+name: t
+isSSA: true
+tracksRegLiveness: true
+registers:
+ - { id: 0, class: gr32 }
+ - { id: 1, class: gr8 }
+ - { id: 2, class: gr8 }
+body:
+ - name: entry
+ id: 0
+ instructions:
+ - '%0 = COPY %edi'
+ # CHECK: [[@LINE+1]]:25: expected a subregister index after ':'
+ - '%1 = COPY %0 : 42'
+ - '%2 = AND8ri %1, 1, implicit-def %eflags'
+ - '%al = COPY %2'
+ - 'RETQ %al'
+...
diff --git a/test/CodeGen/MIR/X86/fixed-stack-objects.mir b/test/CodeGen/MIR/X86/fixed-stack-objects.mir
new file mode 100644
index 000000000000..dcbe6f73a6d0
--- /dev/null
+++ b/test/CodeGen/MIR/X86/fixed-stack-objects.mir
@@ -0,0 +1,35 @@
+# RUN: llc -march=x86 -start-after branch-folder -stop-after branch-folder -o /dev/null %s | FileCheck %s
+# This test ensures that the MIR parser parses fixed stack objects correctly.
+
+--- |
+
+ define i32 @test(i32 %a) #0 {
+ entry:
+ %b = alloca i32
+ store i32 %a, i32* %b
+ %c = load i32, i32* %b
+ ret i32 %c
+ }
+
+ attributes #0 = { "no-frame-pointer-elim"="false" }
+
+...
+---
+name: test
+frameInfo:
+ stackSize: 4
+ maxAlignment: 4
+# CHECK: fixedStack:
+# CHECK-NEXT: - { id: 0, offset: 0, size: 4, alignment: 4, isImmutable: true, isAliased: false }
+fixedStack:
+ - { id: 0, offset: 0, size: 4, alignment: 4, isImmutable: true, isAliased: false }
+stack:
+ - { id: 0, offset: -8, size: 4, alignment: 4 }
+body:
+ - id: 0
+ name: entry
+ instructions:
+ - '%eax = MOV32rm %esp, 1, _, 8, _'
+ - 'MOV32mr %esp, 1, _, 0, _, %eax'
+ - 'RETL %eax'
+...
diff --git a/test/CodeGen/MIR/X86/global-value-operands.mir b/test/CodeGen/MIR/X86/global-value-operands.mir
index 4aa88fe96ceb..3ea729b00554 100644
--- a/test/CodeGen/MIR/X86/global-value-operands.mir
+++ b/test/CodeGen/MIR/X86/global-value-operands.mir
@@ -31,7 +31,7 @@ body:
# CHECK: - '%rax = MOV64rm %rip, 1, _, @G, _'
- '%rax = MOV64rm %rip, 1, _, @G, _'
- '%eax = MOV32rm %rax, 1, _, 0, _'
- - '%eax = INC32r %eax'
+ - '%eax = INC32r %eax, implicit-def %eflags'
- 'RETQ %eax'
...
---
@@ -44,6 +44,6 @@ body:
# CHECK: - '%rax = MOV64rm %rip, 1, _, @0, _'
- '%rax = MOV64rm %rip, 1, _, @0, _'
- '%eax = MOV32rm %rax, 1, _, 0, _'
- - '%eax = INC32r %eax'
+ - '%eax = INC32r %eax, implicit-def %eflags'
- 'RETQ %eax'
...
diff --git a/test/CodeGen/MIR/X86/implicit-register-flag.mir b/test/CodeGen/MIR/X86/implicit-register-flag.mir
new file mode 100644
index 000000000000..9c6882d27bdc
--- /dev/null
+++ b/test/CodeGen/MIR/X86/implicit-register-flag.mir
@@ -0,0 +1,41 @@
+# RUN: llc -march=x86-64 -start-after branch-folder -stop-after branch-folder -o /dev/null %s | FileCheck %s
+# This test ensures that the MIR parser parses the 'implicit' and 'implicit-def'
+# register flags correctly.
+
+--- |
+
+ define i32 @foo(i32 %a) {
+ entry:
+ %0 = icmp sle i32 %a, 10
+ br i1 %0, label %less, label %exit
+
+ less:
+ ret i32 0
+
+ exit:
+ ret i32 %a
+ }
+
+...
+---
+name: foo
+body:
+ - id: 0
+ name: entry
+ instructions:
+ # CHECK: - 'CMP32ri8 %edi, 10, implicit-def %eflags'
+ # CHECK-NEXT: - 'JG_1 %bb.2.exit, implicit %eflags'
+ - 'CMP32ri8 %edi, 10, implicit-def %eflags'
+ - 'JG_1 %bb.2.exit, implicit %eflags'
+ - id: 1
+ name: less
+ instructions:
+ # CHECK: - '%eax = MOV32r0 implicit-def %eflags'
+ - '%eax = MOV32r0 implicit-def %eflags'
+ - 'RETQ %eax'
+ - id: 2
+ name: exit
+ instructions:
+ - '%eax = COPY %edi'
+ - 'RETQ %eax'
+...
diff --git a/test/CodeGen/MIR/X86/killed-register-flag.mir b/test/CodeGen/MIR/X86/killed-register-flag.mir
new file mode 100644
index 000000000000..d654a9d2fa56
--- /dev/null
+++ b/test/CodeGen/MIR/X86/killed-register-flag.mir
@@ -0,0 +1,42 @@
+# RUN: llc -march=x86-64 -start-after branch-folder -stop-after branch-folder -o /dev/null %s | FileCheck %s
+# This test ensures that the MIR parser parses the 'killed' register flags
+# correctly.
+
+--- |
+
+ define i32 @foo(i32 %a) {
+ entry:
+ %0 = icmp sle i32 %a, 10
+ br i1 %0, label %less, label %exit
+
+ less:
+ ret i32 0
+
+ exit:
+ ret i32 %a
+ }
+
+...
+---
+name: foo
+body:
+ - id: 0
+ name: entry
+ instructions:
+ - 'CMP32ri8 %edi, 10, implicit-def %eflags'
+ - 'JG_1 %bb.2.exit, implicit %eflags'
+ - id: 1
+ name: less
+ instructions:
+ # CHECK: - '%eax = MOV32r0
+ # CHECK-NEXT: - 'RETQ killed %eax
+ - '%eax = MOV32r0 implicit-def %eflags'
+ - 'RETQ killed %eax'
+ - id: 2
+ name: exit
+ instructions:
+ # CHECK: - '%eax = COPY killed %edi
+ # CHECK-NEXT: - 'RETQ killed %eax
+ - '%eax = COPY killed %edi'
+ - 'RETQ killed %eax'
+...
diff --git a/test/CodeGen/MIR/X86/large-index-number-error.mir b/test/CodeGen/MIR/X86/large-index-number-error.mir
index 61a5bdfe2edb..fdb25c907f52 100644
--- a/test/CodeGen/MIR/X86/large-index-number-error.mir
+++ b/test/CodeGen/MIR/X86/large-index-number-error.mir
@@ -23,12 +23,12 @@ body:
name: entry
instructions:
- '%eax = MOV32rm %rdi, 1, _, 0, _'
- - 'CMP32ri8 %eax, 10'
+ - 'CMP32ri8 %eax, 10, implicit-def %eflags'
# CHECK: [[@LINE+1]]:14: expected 32-bit integer (too large)
- - 'JG_1 %bb.123456789123456'
+ - 'JG_1 %bb.123456789123456, implicit %eflags'
- id: 1
instructions:
- - '%eax = MOV32r0'
+ - '%eax = MOV32r0 implicit-def %eflags'
- id: 2
instructions:
- 'RETQ %eax'
diff --git a/test/CodeGen/MIR/X86/machine-basic-block-operands.mir b/test/CodeGen/MIR/X86/machine-basic-block-operands.mir
index 9d1bd0bd58ad..607acb5f273e 100644
--- a/test/CodeGen/MIR/X86/machine-basic-block-operands.mir
+++ b/test/CodeGen/MIR/X86/machine-basic-block-operands.mir
@@ -41,13 +41,13 @@ body:
- '%eax = MOV32rm %rdi, 1, _, 0, _'
# CHECK: - 'CMP32ri8 %eax, 10
# CHECK-NEXT: - 'JG_1 %bb.2.exit
- - 'CMP32ri8 %eax, 10'
- - 'JG_1 %bb.2.exit'
+ - 'CMP32ri8 %eax, 10, implicit-def %eflags'
+ - 'JG_1 %bb.2.exit, implicit %eflags'
# CHECK: name: less
- id: 1
name: less
instructions:
- - '%eax = MOV32r0'
+ - '%eax = MOV32r0 implicit-def %eflags'
- id: 2
name: exit
instructions:
@@ -64,11 +64,11 @@ body:
- '%eax = MOV32rm %rdi, 1, _, 0, _'
# CHECK: - 'CMP32ri8 %eax, 10
# CHECK-NEXT: - 'JG_1 %bb.2
- - 'CMP32ri8 %eax, 10'
- - 'JG_1 %bb.3'
+ - 'CMP32ri8 %eax, 10, implicit-def %eflags'
+ - 'JG_1 %bb.3, implicit %eflags'
- id: 1
instructions:
- - '%eax = MOV32r0'
+ - '%eax = MOV32r0 implicit-def %eflags'
- id: 3
instructions:
- 'RETQ %eax'
diff --git a/test/CodeGen/MIR/X86/machine-instructions.mir b/test/CodeGen/MIR/X86/machine-instructions.mir
index b743198cf270..08f3d76486b1 100644
--- a/test/CodeGen/MIR/X86/machine-instructions.mir
+++ b/test/CodeGen/MIR/X86/machine-instructions.mir
@@ -18,8 +18,8 @@ body:
- id: 0
name: entry
instructions:
- # CHECK: - IMUL32rri8
+ # CHECK: - MOV32rr
# CHECK-NEXT: - RETQ
- - IMUL32rri8
+ - MOV32rr
- ' RETQ '
...
diff --git a/test/CodeGen/MIR/X86/missing-implicit-operand.mir b/test/CodeGen/MIR/X86/missing-implicit-operand.mir
new file mode 100644
index 000000000000..4d2cd03f4a3d
--- /dev/null
+++ b/test/CodeGen/MIR/X86/missing-implicit-operand.mir
@@ -0,0 +1,40 @@
+# RUN: not llc -march=x86-64 -start-after branch-folder -stop-after branch-folder -o /dev/null %s 2>&1 | FileCheck %s
+# This test ensures that the MIR parser reports an error when an instruction
+# is missing one of its implicit register operands.
+
+--- |
+
+ define i32 @foo(i32* %p) {
+ entry:
+ %a = load i32, i32* %p
+ %0 = icmp sle i32 %a, 10
+ br i1 %0, label %less, label %exit
+
+ less:
+ ret i32 0
+
+ exit:
+ ret i32 %a
+ }
+
+
+...
+---
+name: foo
+body:
+ - id: 0
+ name: entry
+ instructions:
+ - '%eax = MOV32rm %rdi, 1, _, 0, _'
+ - 'CMP32ri8 %eax, 10, implicit-def %eflags'
+# CHECK: [[@LINE+1]]:24: missing implicit register operand 'implicit %eflags'
+ - 'JG_1 %bb.2.exit'
+ - id: 1
+ name: less
+ instructions:
+ - '%eax = MOV32r0 implicit-def %eflags'
+ - id: 2
+ name: exit
+ instructions:
+ - 'RETQ %eax'
+...
diff --git a/test/CodeGen/MIR/X86/named-registers.mir b/test/CodeGen/MIR/X86/named-registers.mir
index 5defb8489e1e..91ed48568678 100644
--- a/test/CodeGen/MIR/X86/named-registers.mir
+++ b/test/CodeGen/MIR/X86/named-registers.mir
@@ -18,6 +18,6 @@ body:
instructions:
# CHECK: - '%eax = MOV32r0
# CHECK-NEXT: - 'RETQ %eax
- - '%eax = MOV32r0'
+ - '%eax = MOV32r0 implicit-def %eflags'
- 'RETQ %eax'
...
diff --git a/test/CodeGen/MIR/X86/register-mask-operands.mir b/test/CodeGen/MIR/X86/register-mask-operands.mir
index ecaedeae4dbd..f4136598ff5c 100644
--- a/test/CodeGen/MIR/X86/register-mask-operands.mir
+++ b/test/CodeGen/MIR/X86/register-mask-operands.mir
@@ -24,7 +24,7 @@ body:
- id: 0
name: body
instructions:
- - '%eax = IMUL32rri8 %edi, 11'
+ - '%eax = IMUL32rri8 %edi, 11, implicit-def %eflags'
- 'RETQ %eax'
...
---
@@ -35,9 +35,9 @@ body:
name: entry
instructions:
# CHECK: - 'PUSH64r %rax
- # CHECK-NEXT: - 'CALL64pcrel32 @compute, csr_64, %rsp, %edi, %rsp, %eax'
- - 'PUSH64r %rax'
- - 'CALL64pcrel32 @compute, csr_64, %rsp, %edi, %rsp, %eax'
- - '%rdx = POP64r'
+ # CHECK-NEXT: - 'CALL64pcrel32 @compute, csr_64, implicit %rsp, implicit %edi, implicit-def %rsp, implicit-def %eax'
+ - 'PUSH64r %rax, implicit-def %rsp, implicit %rsp'
+ - 'CALL64pcrel32 @compute, csr_64, implicit %rsp, implicit %edi, implicit-def %rsp, implicit-def %eax'
+ - '%rdx = POP64r implicit-def %rsp, implicit %rsp'
- 'RETQ %eax'
...
diff --git a/test/CodeGen/MIR/X86/spill-slot-fixed-stack-object-aliased.mir b/test/CodeGen/MIR/X86/spill-slot-fixed-stack-object-aliased.mir
new file mode 100644
index 000000000000..67f4bd21cd05
--- /dev/null
+++ b/test/CodeGen/MIR/X86/spill-slot-fixed-stack-object-aliased.mir
@@ -0,0 +1,32 @@
+# RUN: not llc -march=x86-64 -start-after branch-folder -stop-after branch-folder -o /dev/null %s 2>&1 | FileCheck %s
+
+--- |
+
+ define i32 @test(i32 %a) #0 {
+ entry:
+ %b = alloca i32
+ store i32 %a, i32* %b
+ %c = load i32, i32* %b
+ ret i32 %c
+ }
+
+ attributes #0 = { "no-frame-pointer-elim"="false" }
+
+...
+---
+name: test
+frameInfo:
+ maxAlignment: 4
+fixedStack:
+ # CHECK: [[@LINE+1]]:63: unknown key 'isAliased'
+ - { id: 0, type: spill-slot, offset: 0, size: 4, isAliased: true }
+stack:
+ - { id: 0, offset: -12, size: 4, alignment: 4 }
+body:
+ - id: 0
+ name: entry
+ instructions:
+ - 'MOV32mr %rsp, 1, _, -4, _, %edi'
+ - '%eax = COPY %edi'
+ - 'RETQ %eax'
+...
diff --git a/test/CodeGen/MIR/X86/spill-slot-fixed-stack-object-immutable.mir b/test/CodeGen/MIR/X86/spill-slot-fixed-stack-object-immutable.mir
new file mode 100644
index 000000000000..1e1b0fdcc8dc
--- /dev/null
+++ b/test/CodeGen/MIR/X86/spill-slot-fixed-stack-object-immutable.mir
@@ -0,0 +1,32 @@
+# RUN: not llc -march=x86-64 -start-after branch-folder -stop-after branch-folder -o /dev/null %s 2>&1 | FileCheck %s
+
+--- |
+
+ define i32 @test(i32 %a) #0 {
+ entry:
+ %b = alloca i32
+ store i32 %a, i32* %b
+ %c = load i32, i32* %b
+ ret i32 %c
+ }
+
+ attributes #0 = { "no-frame-pointer-elim"="false" }
+
+...
+---
+name: test
+frameInfo:
+ maxAlignment: 4
+fixedStack:
+ # CHECK: [[@LINE+1]]:65: unknown key 'isImmutable'
+ - { id: 0, type: spill-slot, offset: 0, size: 4, isImmutable: true }
+stack:
+ - { id: 0, offset: -12, size: 4, alignment: 4 }
+body:
+ - id: 0
+ name: entry
+ instructions:
+ - 'MOV32mr %rsp, 1, _, -4, _, %edi'
+ - '%eax = COPY %edi'
+ - 'RETQ %eax'
+...
diff --git a/test/CodeGen/MIR/X86/spill-slot-fixed-stack-objects.mir b/test/CodeGen/MIR/X86/spill-slot-fixed-stack-objects.mir
new file mode 100644
index 000000000000..f771f796ec34
--- /dev/null
+++ b/test/CodeGen/MIR/X86/spill-slot-fixed-stack-objects.mir
@@ -0,0 +1,34 @@
+# RUN: llc -march=x86-64 -start-after branch-folder -stop-after branch-folder -o /dev/null %s | FileCheck %s
+# This test ensures that the MIR parser parses fixed stack objects correctly.
+
+--- |
+
+ define i32 @test(i32 %a) #0 {
+ entry:
+ %b = alloca i32
+ store i32 %a, i32* %b
+ %c = load i32, i32* %b
+ ret i32 %c
+ }
+
+ attributes #0 = { "no-frame-pointer-elim"="false" }
+
+...
+---
+name: test
+frameInfo:
+ maxAlignment: 4
+# CHECK: fixedStack:
+# CHECK-NEXT: - { id: 0, type: spill-slot, offset: 0, size: 4, alignment: 4 }
+fixedStack:
+ - { id: 0, type: spill-slot, offset: 0, size: 4, alignment: 4 }
+stack:
+ - { id: 0, offset: -12, size: 4, alignment: 4 }
+body:
+ - id: 0
+ name: entry
+ instructions:
+ - 'MOV32mr %rsp, 1, _, -4, _, %edi'
+ - '%eax = COPY %edi'
+ - 'RETQ %eax'
+...
diff --git a/test/CodeGen/MIR/X86/stack-objects.mir b/test/CodeGen/MIR/X86/stack-objects.mir
new file mode 100644
index 000000000000..14ed4b74f96f
--- /dev/null
+++ b/test/CodeGen/MIR/X86/stack-objects.mir
@@ -0,0 +1,39 @@
+# RUN: llc -march=x86-64 -start-after branch-folder -stop-after branch-folder -o /dev/null %s | FileCheck %s
+# This test ensures that the MIR parser parses stack objects correctly.
+
+--- |
+
+ define i32 @test(i32 %a) #0 {
+ entry:
+ %b = alloca i32
+ %x = alloca i64
+ store i32 %a, i32* %b
+ store i64 2, i64* %x
+ %c = load i32, i32* %b
+ ret i32 %c
+ }
+
+ attributes #0 = { "no-frame-pointer-elim"="false" }
+
+...
+---
+name: test
+frameInfo:
+ maxAlignment: 8
+# CHECK: stack:
+# CHECK-NEXT: - { id: 0, offset: -12, size: 4, alignment: 4 }
+# CHECK-NEXT: - { id: 1, offset: -24, size: 8, alignment: 8 }
+# CHECK-NEXT: - { id: 2, type: spill-slot, offset: -32, size: 4, alignment: 4 }
+stack:
+ - { id: 0, offset: -12, size: 4, alignment: 4 }
+ - { id: 1, offset: -24, size: 8, alignment: 8 }
+ - { id: 2, type: spill-slot, offset: -32, size: 4, alignment: 4 }
+body:
+ - id: 0
+ name: entry
+ instructions:
+ - 'MOV32mr %rsp, 1, _, -4, _, %edi'
+ - 'MOV64mi32 %rsp, 1, _, -16, _, 2'
+ - '%eax = MOV32rm %rsp, 1, _, -4, _'
+ - 'RETQ %eax'
+...
diff --git a/test/CodeGen/MIR/X86/subregister-operands.mir b/test/CodeGen/MIR/X86/subregister-operands.mir
new file mode 100644
index 000000000000..5e46fab4b058
--- /dev/null
+++ b/test/CodeGen/MIR/X86/subregister-operands.mir
@@ -0,0 +1,33 @@
+# RUN: llc -march=x86-64 -start-after machine-sink -stop-after machine-sink -o /dev/null %s | FileCheck %s
+# This test ensures that the MIR parser parses subregisters in register operands
+# correctly.
+
+--- |
+
+ define zeroext i1 @t(i1 %c) {
+ entry:
+ ret i1 %c
+ }
+
+...
+---
+name: t
+isSSA: true
+tracksRegLiveness: true
+registers:
+ - { id: 0, class: gr32 }
+ - { id: 1, class: gr8 }
+ - { id: 2, class: gr8 }
+body:
+ - name: entry
+ id: 0
+ instructions:
+ # CHECK: %0 = COPY %edi
+ # CHECK-NEXT: %1 = COPY %0:sub_8bit
+ - '%0 = COPY %edi'
+ - '%1 = COPY %0:sub_8bit'
+ - '%2 = AND8ri %1, 1, implicit-def %eflags'
+ - '%al = COPY %2'
+ - 'RETQ %al'
+...
+
diff --git a/test/CodeGen/MIR/X86/undef-register-flag.mir b/test/CodeGen/MIR/X86/undef-register-flag.mir
new file mode 100644
index 000000000000..83b9e10a80d1
--- /dev/null
+++ b/test/CodeGen/MIR/X86/undef-register-flag.mir
@@ -0,0 +1,42 @@
+# RUN: llc -march=x86-64 -start-after branch-folder -stop-after branch-folder -o /dev/null %s | FileCheck %s
+# This test ensures that the MIR parser parses the 'undef' register flags
+# correctly.
+
+--- |
+
+ define i32 @compute(i32 %a) #0 {
+ body:
+ %c = mul i32 %a, 11
+ ret i32 %c
+ }
+
+ define i32 @foo(i32 %a) #0 {
+ entry:
+ %b = call i32 @compute(i32 %a)
+ ret i32 %b
+ }
+
+ attributes #0 = { "no-frame-pointer-elim"="false" }
+
+...
+---
+name: compute
+body:
+ - id: 0
+ name: body
+ instructions:
+ - '%eax = IMUL32rri8 %edi, 11, implicit-def %eflags'
+ - 'RETQ %eax'
+...
+---
+name: foo
+body:
+ - id: 0
+ name: entry
+ instructions:
+ # CHECK: - 'PUSH64r undef %rax
+ - 'PUSH64r undef %rax, implicit-def %rsp, implicit %rsp'
+ - 'CALL64pcrel32 @compute, csr_64, implicit %rsp, implicit %edi, implicit-def %rsp, implicit-def %eax'
+ - '%rdx = POP64r implicit-def %rsp, implicit %rsp'
+ - 'RETQ %eax'
+...
diff --git a/test/CodeGen/MIR/X86/undefined-register-class.mir b/test/CodeGen/MIR/X86/undefined-register-class.mir
new file mode 100644
index 000000000000..a14d2303a7d8
--- /dev/null
+++ b/test/CodeGen/MIR/X86/undefined-register-class.mir
@@ -0,0 +1,26 @@
+# RUN: not llc -march=x86-64 -start-after machine-sink -stop-after machine-sink -o /dev/null %s 2>&1 | FileCheck %s
+# This test ensures that the MIR parser reports an error when it encounters an
+# unknown register class.
+
+--- |
+
+ define i32 @test(i32 %a) {
+ entry:
+ ret i32 %a
+ }
+
+...
+---
+name: test
+isSSA: true
+tracksRegLiveness: true
+registers:
+ # CHECK: [[@LINE+1]]:20: use of undefined register class 'gr3200'
+ - {id: 0, class: 'gr3200'}
+body:
+ - id: 0
+ name: entry
+ instructions:
+ - 'RETQ %eax'
+...
+
diff --git a/test/CodeGen/MIR/X86/undefined-virtual-register.mir b/test/CodeGen/MIR/X86/undefined-virtual-register.mir
new file mode 100644
index 000000000000..12370c80caf9
--- /dev/null
+++ b/test/CodeGen/MIR/X86/undefined-virtual-register.mir
@@ -0,0 +1,28 @@
+# RUN: not llc -march=x86-64 -start-after machine-sink -stop-after machine-sink -o /dev/null %s 2>&1 | FileCheck %s
+# This test ensures that the MIR parser reports an error when parsing a
+# reference to an undefined virtual register.
+
+--- |
+
+ define i32 @test(i32 %a) {
+ entry:
+ ret i32 %a
+ }
+
+...
+---
+name: test
+isSSA: true
+tracksRegLiveness: true
+registers:
+ - { id: 0, class: gr32 }
+body:
+ - id: 0
+ name: entry
+ instructions:
+ - '%0 = COPY %edi'
+ # CHECK: [[@LINE+1]]:22: use of undefined virtual register '%10'
+ - '%eax = COPY %10'
+ - 'RETQ %eax'
+...
+
diff --git a/test/CodeGen/MIR/X86/unknown-machine-basic-block.mir b/test/CodeGen/MIR/X86/unknown-machine-basic-block.mir
index 5bc979a83eaf..a82e9a780f54 100644
--- a/test/CodeGen/MIR/X86/unknown-machine-basic-block.mir
+++ b/test/CodeGen/MIR/X86/unknown-machine-basic-block.mir
@@ -26,12 +26,12 @@ body:
name: entry
instructions:
- '%eax = MOV32rm %rdi, 1, _, 0, _'
- - 'CMP32ri8 %eax, 10'
+ - 'CMP32ri8 %eax, 10, implicit-def %eflags'
# CHECK: [[@LINE+1]]:14: use of undefined machine basic block #4
- - 'JG_1 %bb.4'
+ - 'JG_1 %bb.4, implicit %eflags'
- id: 1
instructions:
- - '%eax = MOV32r0'
+ - '%eax = MOV32r0 implicit-def %eflags'
- id: 2
instructions:
- 'RETQ %eax'
diff --git a/test/CodeGen/MIR/X86/unknown-named-machine-basic-block.mir b/test/CodeGen/MIR/X86/unknown-named-machine-basic-block.mir
index cd8c5402256f..f304113f40b9 100644
--- a/test/CodeGen/MIR/X86/unknown-named-machine-basic-block.mir
+++ b/test/CodeGen/MIR/X86/unknown-named-machine-basic-block.mir
@@ -25,13 +25,13 @@ body:
name: entry
instructions:
- '%eax = MOV32rm %rdi, 1, _, 0, _'
- - 'CMP32ri8 %eax, 10'
+ - 'CMP32ri8 %eax, 10, implicit-def %eflags'
# CHECK: [[@LINE+1]]:14: the name of machine basic block #2 isn't 'hit'
- - 'JG_1 %bb.2.hit'
+ - 'JG_1 %bb.2.hit, implicit %eflags'
- id: 1
name: less
instructions:
- - '%eax = MOV32r0'
+ - '%eax = MOV32r0 implicit-def %eflags'
- id: 2
name: exit
instructions:
diff --git a/test/CodeGen/MIR/X86/unknown-subregister-index.mir b/test/CodeGen/MIR/X86/unknown-subregister-index.mir
new file mode 100644
index 000000000000..50461232b623
--- /dev/null
+++ b/test/CodeGen/MIR/X86/unknown-subregister-index.mir
@@ -0,0 +1,31 @@
+# RUN: not llc -march=x86-64 -start-after machine-sink -stop-after machine-sink -o /dev/null %s 2>&1 | FileCheck %s
+# This test ensures that an error is reported when an unknown subregister index
+# is encountered.
+
+--- |
+
+ define zeroext i1 @t(i1 %c) {
+ entry:
+ ret i1 %c
+ }
+
+...
+---
+name: t
+isSSA: true
+tracksRegLiveness: true
+registers:
+ - { id: 0, class: gr32 }
+ - { id: 1, class: gr8 }
+ - { id: 2, class: gr8 }
+body:
+ - name: entry
+ id: 0
+ instructions:
+ - '%0 = COPY %edi'
+ # CHECK: [[@LINE+1]]:23: use of unknown subregister index 'bit8'
+ - '%1 = COPY %0:bit8'
+ - '%2 = AND8ri %1, 1, implicit-def %eflags'
+ - '%al = COPY %2'
+ - 'RETQ %al'
+...
diff --git a/test/CodeGen/MIR/X86/variable-sized-stack-object-size-error.mir b/test/CodeGen/MIR/X86/variable-sized-stack-object-size-error.mir
new file mode 100644
index 000000000000..8e50c52f5e18
--- /dev/null
+++ b/test/CodeGen/MIR/X86/variable-sized-stack-object-size-error.mir
@@ -0,0 +1,36 @@
+# RUN: not llc -march=x86-64 -start-after branch-folder -stop-after branch-folder -o /dev/null %s 2>&1 | FileCheck %s
+--- |
+
+ define i32 @test(i32 %a) {
+ entry:
+ %b = alloca i32
+ %x = alloca i64
+ %y = alloca i32, i32 %a
+ store i32 %a, i32* %b
+ store i64 2, i64* %x
+ %c = load i32, i32* %b
+ ret i32 %c
+ }
+
+...
+---
+name: test
+frameInfo:
+ stackSize: 24
+ offsetAdjustment: -16
+ maxAlignment: 8
+ adjustsStack: true
+stack:
+ - { id: 0, offset: -20, size: 4, alignment: 4 }
+ - { id: 1, offset: -32, size: 8, alignment: 8 }
+ # CHECK: [[@LINE+1]]:55: unknown key 'size'
+ - { id: 2, type: variable-sized, offset: -32, size: 42, alignment: 1 }
+body:
+ - id: 0
+ name: entry
+ instructions:
+ - 'MOV32mr %rsp, 1, _, -4, _, %edi'
+ - 'MOV64mi32 %rsp, 1, _, -16, _, 2'
+ - '%eax = MOV32rm %rsp, 1, _, -4, _'
+ - 'RETQ %eax'
+...
diff --git a/test/CodeGen/MIR/X86/variable-sized-stack-objects.mir b/test/CodeGen/MIR/X86/variable-sized-stack-objects.mir
new file mode 100644
index 000000000000..4c45742b25a4
--- /dev/null
+++ b/test/CodeGen/MIR/X86/variable-sized-stack-objects.mir
@@ -0,0 +1,42 @@
+# RUN: llc -march=x86-64 -start-after branch-folder -stop-after branch-folder -o /dev/null %s | FileCheck %s
+# This test ensures that the MIR parser parses variable sized stack objects
+# correctly.
+
+--- |
+
+ define i32 @test(i32 %a) {
+ entry:
+ %b = alloca i32
+ %x = alloca i64
+ %y = alloca i32, i32 %a
+ store i32 %a, i32* %b
+ store i64 2, i64* %x
+ %c = load i32, i32* %b
+ ret i32 %c
+ }
+
+...
+---
+name: test
+frameInfo:
+ stackSize: 24
+ offsetAdjustment: -16
+ maxAlignment: 8
+ adjustsStack: true
+# CHECK: stack:
+# CHECK-NEXT: - { id: 0, offset: -20, size: 4, alignment: 4 }
+# CHECK-NEXT: - { id: 1, offset: -32, size: 8, alignment: 8 }
+# CHECK-NEXT: - { id: 2, type: variable-sized, offset: -32, alignment: 1 }
+stack:
+ - { id: 0, offset: -20, size: 4, alignment: 4 }
+ - { id: 1, offset: -32, size: 8, alignment: 8 }
+ - { id: 2, type: variable-sized, offset: -32, alignment: 1 }
+body:
+ - id: 0
+ name: entry
+ instructions:
+ - 'MOV32mr %rsp, 1, _, -4, _, %edi'
+ - 'MOV64mi32 %rsp, 1, _, -16, _, 2'
+ - '%eax = MOV32rm %rsp, 1, _, -4, _'
+ - 'RETQ %eax'
+...
diff --git a/test/CodeGen/MIR/X86/virtual-registers.mir b/test/CodeGen/MIR/X86/virtual-registers.mir
new file mode 100644
index 000000000000..c6d76e6a18c5
--- /dev/null
+++ b/test/CodeGen/MIR/X86/virtual-registers.mir
@@ -0,0 +1,105 @@
+# RUN: llc -march=x86-64 -start-after machine-sink -stop-after machine-sink -o /dev/null %s | FileCheck %s
+# This test ensures that the MIR parser parses virtual register definitions and
+# references correctly.
+
+--- |
+
+ define i32 @bar(i32 %a) {
+ entry:
+ %0 = icmp sle i32 %a, 10
+ br i1 %0, label %less, label %exit
+
+ less:
+ ret i32 0
+
+ exit:
+ ret i32 %a
+ }
+
+ define i32 @foo(i32 %a) {
+ entry:
+ %0 = icmp sle i32 %a, 10
+ br i1 %0, label %less, label %exit
+
+ less:
+ ret i32 0
+
+ exit:
+ ret i32 %a
+ }
+
+...
+---
+name: bar
+isSSA: true
+tracksRegLiveness: true
+# CHECK: registers:
+# CHECK-NEXT: - { id: 0, class: gr32 }
+# CHECK-NEXT: - { id: 1, class: gr32 }
+# CHECK-NEXT: - { id: 2, class: gr32 }
+registers:
+ - { id: 0, class: gr32 }
+ - { id: 1, class: gr32 }
+ - { id: 2, class: gr32 }
+body:
+ - id: 0
+ name: entry
+ # CHECK: %0 = COPY %edi
+ # CHECK-NEXT: %1 = SUB32ri8 %0, 10
+ instructions:
+ - '%0 = COPY %edi'
+ - '%1 = SUB32ri8 %0, 10, implicit-def %eflags'
+ - 'JG_1 %bb.2.exit, implicit %eflags'
+ - 'JMP_1 %bb.1.less'
+ - id: 1
+ name: less
+ # CHECK: %2 = MOV32r0
+ # CHECK-NEXT: %eax = COPY %2
+ instructions:
+ - '%2 = MOV32r0 implicit-def %eflags'
+ - '%eax = COPY %2'
+ - 'RETQ %eax'
+ - id: 2
+ name: exit
+ instructions:
+ - '%eax = COPY %0'
+ - 'RETQ %eax'
+...
+---
+name: foo
+isSSA: true
+tracksRegLiveness: true
+# CHECK: name: foo
+# CHECK: registers:
+# CHECK-NEXT: - { id: 0, class: gr32 }
+# CHECK-NEXT: - { id: 1, class: gr32 }
+# CHECK-NEXT: - { id: 2, class: gr32 }
+registers:
+ - { id: 2, class: gr32 }
+ - { id: 0, class: gr32 }
+ - { id: 10, class: gr32 }
+body:
+ - id: 0
+ name: entry
+ # CHECK: %0 = COPY %edi
+ # CHECK-NEXT: %1 = SUB32ri8 %0, 10
+ instructions:
+ - '%2 = COPY %edi'
+ - '%0 = SUB32ri8 %2, 10, implicit-def %eflags'
+ - 'JG_1 %bb.2.exit, implicit %eflags'
+ - 'JMP_1 %bb.1.less'
+ - id: 1
+ name: less
+ # CHECK: %2 = MOV32r0
+ # CHECK-NEXT: %eax = COPY %2
+ instructions:
+ - '%10 = MOV32r0 implicit-def %eflags'
+ - '%eax = COPY %10'
+ - 'RETQ %eax'
+ - id: 2
+ name: exit
+ # CHECK: %eax = COPY %0
+ instructions:
+ - '%eax = COPY %2'
+ - 'RETQ %eax'
+...
diff --git a/test/CodeGen/MIR/frame-info.mir b/test/CodeGen/MIR/frame-info.mir
new file mode 100644
index 000000000000..c5468f94f33a
--- /dev/null
+++ b/test/CodeGen/MIR/frame-info.mir
@@ -0,0 +1,91 @@
+# RUN: llc -start-after machine-sink -stop-after machine-sink -o /dev/null %s | FileCheck %s
+# This test ensures that the MIR parser parses machine frame info properties
+# correctly.
+
+--- |
+
+ define i32 @test(i32 %a) {
+ entry:
+ %b = alloca i32
+ store i32 %a, i32* %b
+ %c = load i32, i32* %b
+ ret i32 %c
+ }
+
+ define i32 @test2(i32 %a) {
+ entry:
+ %b = alloca i32
+ store i32 %a, i32* %b
+ %c = load i32, i32* %b
+ ret i32 %c
+ }
+
+...
+---
+name: test
+isSSA: true
+tracksRegLiveness: true
+
+# CHECK: frameInfo:
+# CHECK-NEXT: isFrameAddressTaken: false
+# CHECK-NEXT: isReturnAddressTaken: false
+# CHECK-NEXT: hasStackMap: false
+# CHECK-NEXT: hasPatchPoint: false
+# CHECK-NEXT: stackSize: 0
+# CHECK-NEXT: offsetAdjustment: 0
+# Note: max alignment can be target specific when printed.
+# CHECK-NEXT: maxAlignment:
+# CHECK-NEXT: adjustsStack: false
+# CHECK-NEXT: hasCalls: false
+# CHECK-NEXT: maxCallFrameSize: 0
+# CHECK-NEXT: hasOpaqueSPAdjustment: false
+# CHECK-NEXT: hasVAStart: false
+# CHECK-NEXT: hasMustTailInVarArgFunc: false
+# CHECK: body
+frameInfo:
+ maxAlignment: 4
+body:
+ - id: 0
+ name: entry
+...
+---
+name: test2
+isSSA: true
+tracksRegLiveness: true
+
+# CHECK: test2
+# CHECK: frameInfo:
+# CHECK-NEXT: isFrameAddressTaken: true
+# CHECK-NEXT: isReturnAddressTaken: true
+# CHECK-NEXT: hasStackMap: true
+# CHECK-NEXT: hasPatchPoint: true
+# CHECK-NEXT: stackSize: 4
+# CHECK-NEXT: offsetAdjustment: 4
+# Note: max alignment can be target specific when printed.
+# CHECK-NEXT: maxAlignment:
+# CHECK-NEXT: adjustsStack: true
+# CHECK-NEXT: hasCalls: true
+# CHECK-NEXT: maxCallFrameSize: 4
+# CHECK-NEXT: hasOpaqueSPAdjustment: true
+# CHECK-NEXT: hasVAStart: true
+# CHECK-NEXT: hasMustTailInVarArgFunc: true
+# CHECK: body
+frameInfo:
+ isFrameAddressTaken: true
+ isReturnAddressTaken: true
+ hasStackMap: true
+ hasPatchPoint: true
+ stackSize: 4
+ offsetAdjustment: 4
+ maxAlignment: 4
+ adjustsStack: true
+ hasCalls: true
+ maxCallFrameSize: 4
+ hasOpaqueSPAdjustment: true
+ hasVAStart: true
+ hasMustTailInVarArgFunc: true
+body:
+ - id: 0
+ name: entry
+...
+
diff --git a/test/CodeGen/MIR/llvmIR.mir b/test/CodeGen/MIR/llvmIR.mir
index 4d7fde240c5b..3c084ad7d393 100644
--- a/test/CodeGen/MIR/llvmIR.mir
+++ b/test/CodeGen/MIR/llvmIR.mir
@@ -32,4 +32,6 @@
...
---
name: foo
+body:
+ - id: 0
...
diff --git a/test/CodeGen/MIR/llvmIRMissing.mir b/test/CodeGen/MIR/llvmIRMissing.mir
index 83d846ba44c3..80cea5a6fdaa 100644
--- a/test/CodeGen/MIR/llvmIRMissing.mir
+++ b/test/CodeGen/MIR/llvmIRMissing.mir
@@ -4,4 +4,6 @@
---
# CHECK: name: foo
name: foo
+body:
+ - id: 0
...
diff --git a/test/CodeGen/MIR/machine-basic-block-unknown-name.mir b/test/CodeGen/MIR/machine-basic-block-unknown-name.mir
index ed675c5edbc3..df8eee9d2708 100644
--- a/test/CodeGen/MIR/machine-basic-block-unknown-name.mir
+++ b/test/CodeGen/MIR/machine-basic-block-unknown-name.mir
@@ -13,7 +13,7 @@
---
name: foo
body:
- # CHECK: basic block 'entrie' is not defined in the function 'foo'
+ # CHECK: [[@LINE+2]]:18: basic block 'entrie' is not defined in the function 'foo'
- id: 0
name: entrie
...
diff --git a/test/CodeGen/MIR/machine-function-missing-body-error.mir b/test/CodeGen/MIR/machine-function-missing-body-error.mir
new file mode 100644
index 000000000000..0dc7477f6275
--- /dev/null
+++ b/test/CodeGen/MIR/machine-function-missing-body-error.mir
@@ -0,0 +1,15 @@
+# RUN: not llc -start-after branch-folder -stop-after branch-folder -o /dev/null %s 2>&1 | FileCheck %s
+# This test ensures that the MIR parser reports an error when it encounters a
+# machine function with an empty body.
+
+--- |
+
+ define i32 @foo() {
+ ret i32 0
+ }
+
+...
+---
+# CHECK: machine function 'foo' requires at least one machine basic block in its body
+name: foo
+...
diff --git a/test/CodeGen/MIR/machine-function-missing-function.mir b/test/CodeGen/MIR/machine-function-missing-function.mir
index eed4142d6597..424c34aae847 100644
--- a/test/CodeGen/MIR/machine-function-missing-function.mir
+++ b/test/CodeGen/MIR/machine-function-missing-function.mir
@@ -12,8 +12,12 @@
...
---
name: foo
+body:
+ - id: 0
...
---
# CHECK: function 'faa' isn't defined in the provided LLVM IR
name: faa
+body:
+ - id: 0
...
diff --git a/test/CodeGen/MIR/machine-function-missing-name.mir b/test/CodeGen/MIR/machine-function-missing-name.mir
index b16156e54bd1..a868a65d35f2 100644
--- a/test/CodeGen/MIR/machine-function-missing-name.mir
+++ b/test/CodeGen/MIR/machine-function-missing-name.mir
@@ -16,7 +16,11 @@
---
# CHECK: [[@LINE+1]]:1: missing required key 'name'
nme: foo
+body:
+ - id: 0
...
---
name: bar
+body:
+ - id: 0
...
diff --git a/test/CodeGen/MIR/machine-function.mir b/test/CodeGen/MIR/machine-function.mir
index 8f053adc22be..afd10ab02c26 100644
--- a/test/CodeGen/MIR/machine-function.mir
+++ b/test/CodeGen/MIR/machine-function.mir
@@ -27,6 +27,8 @@
# CHECK-NEXT: hasInlineAsm: false
# CHECK: ...
name: foo
+body:
+ - id: 0
...
---
# CHECK: name: bar
@@ -35,6 +37,8 @@ name: foo
# CHECK-NEXT: hasInlineAsm: false
# CHECK: ...
name: bar
+body:
+ - id: 0
...
---
# CHECK: name: func
@@ -44,6 +48,8 @@ name: bar
# CHECK: ...
name: func
alignment: 8
+body:
+ - id: 0
...
---
# CHECK: name: func2
@@ -55,4 +61,6 @@ name: func2
alignment: 16
exposesReturnsTwice: true
hasInlineAsm: true
+body:
+ - id: 0
...
diff --git a/test/CodeGen/MIR/register-info.mir b/test/CodeGen/MIR/register-info.mir
index c01997b46859..9585faa96223 100644
--- a/test/CodeGen/MIR/register-info.mir
+++ b/test/CodeGen/MIR/register-info.mir
@@ -22,6 +22,8 @@
# CHECK-NEXT: tracksSubRegLiveness: false
# CHECK: ...
name: foo
+body:
+ - id: 0
...
---
# CHECK: name: bar
@@ -33,4 +35,6 @@ name: bar
isSSA: false
tracksRegLiveness: true
tracksSubRegLiveness: true
+body:
+ - id: 0
...
diff --git a/test/CodeGen/NVPTX/loop-vectorize.ll b/test/CodeGen/NVPTX/loop-vectorize.ll
new file mode 100644
index 000000000000..1b337441ac96
--- /dev/null
+++ b/test/CodeGen/NVPTX/loop-vectorize.ll
@@ -0,0 +1,39 @@
+; RUN: opt < %s -O3 -S | FileCheck %s
+
+target datalayout = "e-i64:64-v16:16-v32:32-n16:32:64"
+target triple = "nvptx64-nvidia-cuda"
+
+define void @no_vectorization(i32 %n, i32 %a, i32 %b) {
+; CHECK-LABEL: no_vectorization(
+; CHECK-NOT: <4 x i32>
+; CHECK-NOT: <4 x i1>
+entry:
+ %cmp.5 = icmp sgt i32 %n, 0
+ br i1 %cmp.5, label %for.body.preheader, label %for.cond.cleanup
+
+for.body.preheader: ; preds = %entry
+ br label %for.body
+
+for.cond.cleanup.loopexit: ; preds = %for.body
+ br label %for.cond.cleanup
+
+for.cond.cleanup: ; preds = %for.cond.cleanup.loopexit, %entry
+ ret void
+
+for.body: ; preds = %for.body.preheader, %for.body
+ %i.06 = phi i32 [ %inc, %for.body ], [ 0, %for.body.preheader ]
+ %add = add nsw i32 %i.06, %a
+ %mul = mul nsw i32 %add, %b
+ %cmp1 = icmp sgt i32 %mul, -1
+ tail call void @llvm.assume(i1 %cmp1)
+ %inc = add nuw nsw i32 %i.06, 1
+ %exitcond = icmp eq i32 %inc, %n
+ br i1 %exitcond, label %for.cond.cleanup.loopexit, label %for.body
+}
+
+declare void @llvm.assume(i1) #0
+
+attributes #0 = { nounwind }
+
+!nvvm.annotations = !{!0}
+!0 = !{void (i32, i32, i32)* @no_vectorization, !"kernel", i32 1}
diff --git a/test/CodeGen/NVPTX/lower-aggr-copies.ll b/test/CodeGen/NVPTX/lower-aggr-copies.ll
new file mode 100644
index 000000000000..c3adfc4646cf
--- /dev/null
+++ b/test/CodeGen/NVPTX/lower-aggr-copies.ll
@@ -0,0 +1,47 @@
+; RUN: llc < %s -march=nvptx -mcpu=sm_35 | FileCheck %s
+
+; Verify that the NVPTXLowerAggrCopies pass works as expected: calls to
+; llvm.mem* intrinsics get lowered to loops.
+
+declare void @llvm.memcpy.p0i8.p0i8.i64(i8* nocapture, i8* nocapture readonly, i64, i32, i1) #1
+declare void @llvm.memset.p0i8.i64(i8* nocapture, i8, i64, i32, i1) #1
+
+define i8* @memcpy_caller(i8* %dst, i8* %src, i64 %n) #0 {
+entry:
+ tail call void @llvm.memcpy.p0i8.p0i8.i64(i8* %dst, i8* %src, i64 %n, i32 1, i1 false)
+ ret i8* %dst
+; CHECK-LABEL: .visible .func (.param .b32 func_retval0) memcpy_caller
+; CHECK: LBB[[LABEL:[_0-9]+]]:
+; CHECK: ld.u8 %rs[[REG:[0-9]+]]
+; CHECK: st.u8 [%r{{[0-9]+}}], %rs[[REG]]
+; CHECK: add.s64 %rd[[COUNTER:[0-9]+]], %rd[[COUNTER]], 1
+; CHECK-NEXT: setp.lt.u64 %p[[PRED:[0-9]+]], %rd[[COUNTER]], %rd
+; CHECK-NEXT: @%p[[PRED]] bra LBB[[LABEL]]
+}
+
+define i8* @memcpy_volatile_caller(i8* %dst, i8* %src, i64 %n) #0 {
+entry:
+ tail call void @llvm.memcpy.p0i8.p0i8.i64(i8* %dst, i8* %src, i64 %n, i32 1, i1 true)
+ ret i8* %dst
+; CHECK-LABEL: .visible .func (.param .b32 func_retval0) memcpy_volatile_caller
+; CHECK: LBB[[LABEL:[_0-9]+]]:
+; CHECK: ld.volatile.u8 %rs[[REG:[0-9]+]]
+; CHECK: st.volatile.u8 [%r{{[0-9]+}}], %rs[[REG]]
+; CHECK: add.s64 %rd[[COUNTER:[0-9]+]], %rd[[COUNTER]], 1
+; CHECK-NEXT: setp.lt.u64 %p[[PRED:[0-9]+]], %rd[[COUNTER]], %rd
+; CHECK-NEXT: @%p[[PRED]] bra LBB[[LABEL]]
+}
+
+define i8* @memset_caller(i8* %dst, i32 %c, i64 %n) #0 {
+entry:
+ %0 = trunc i32 %c to i8
+ tail call void @llvm.memset.p0i8.i64(i8* %dst, i8 %0, i64 %n, i32 1, i1 false)
+ ret i8* %dst
+; CHECK-LABEL: .visible .func (.param .b32 func_retval0) memset_caller(
+; CHECK: ld.param.u8 %rs[[REG:[0-9]+]]
+; CHECK: LBB[[LABEL:[_0-9]+]]:
+; CHECK: st.u8 [%r{{[0-9]+}}], %rs[[REG]]
+; CHECK: add.s64 %rd[[COUNTER:[0-9]+]], %rd[[COUNTER]], 1
+; CHECK-NEXT: setp.lt.u64 %p[[PRED:[0-9]+]], %rd[[COUNTER]], %rd
+; CHECK-NEXT: @%p[[PRED]] bra LBB[[LABEL]]
+}
diff --git a/test/CodeGen/PowerPC/builtins-ppc-elf2-abi.ll b/test/CodeGen/PowerPC/builtins-ppc-elf2-abi.ll
index 16dc2ccb111d..6013a412924f 100644
--- a/test/CodeGen/PowerPC/builtins-ppc-elf2-abi.ll
+++ b/test/CodeGen/PowerPC/builtins-ppc-elf2-abi.ll
@@ -134,6 +134,36 @@ entry:
; CHECK: xvcmpgtsp {{[0-9]+}}, {{[0-9]+}}, {{[0-9]+}}
}
+; Function Attrs: nounwind
+define <4 x float> @emit_xvresp(<4 x float> %a) {
+entry:
+ %a.addr = alloca <4 x float>, align 16
+ store <4 x float> %a, <4 x float>* %a.addr, align 16
+ %0 = load <4 x float>, <4 x float>* %a.addr, align 16
+ %1 = call <4 x float> @llvm.ppc.vsx.xvresp(<4 x float> %0)
+ ret <4 x float> %1
+; CHECK-LABEL: @emit_xvresp
+; CHECK: xvresp {{[0-9]+}}, {{[0-9]+}}
+}
+
+; Function Attrs: nounwind
+define <2 x double> @emit_xvredp(<2 x double> %a) {
+entry:
+ %a.addr = alloca <2 x double>, align 16
+ store <2 x double> %a, <2 x double>* %a.addr, align 16
+ %0 = load <2 x double>, <2 x double>* %a.addr, align 16
+ %1 = call <2 x double> @llvm.ppc.vsx.xvredp(<2 x double> %0)
+ ret <2 x double> %1
+; CHECK-LABEL: @emit_xvredp
+; CHECK: xvredp {{[0-9]+}}, {{[0-9]+}}
+}
+
+; Function Attrs: nounwind readnone
+declare <4 x float> @llvm.ppc.vsx.xvresp(<4 x float>)
+
+; Function Attrs: nounwind readnone
+declare <2 x double> @llvm.ppc.vsx.xvredp(<2 x double>)
+
; Function Attrs: nounwind readnone
declare <2 x double> @llvm.ceil.v2f64(<2 x double>)
diff --git a/test/CodeGen/PowerPC/ppc-crbits-onoff.ll b/test/CodeGen/PowerPC/ppc-crbits-onoff.ll
index 88648df5fa36..c69f30017d88 100644
--- a/test/CodeGen/PowerPC/ppc-crbits-onoff.ll
+++ b/test/CodeGen/PowerPC/ppc-crbits-onoff.ll
@@ -15,8 +15,8 @@ entry:
; CHECK-DAG: cmplwi {{[0-9]+}}, 3, 0
; CHECK-DAG: li [[REG2:[0-9]+]], 1
; CHECK-DAG: cntlzw [[REG3:[0-9]+]],
-; CHECK: isel 3, 0, [[REG2]]
-; CHECK: and 3, 3, [[REG3]]
+; CHECK: isel [[REG4:[0-9]+]], 0, [[REG2]]
+; CHECK: and 3, [[REG4]], [[REG3]]
; CHECK: blr
}
diff --git a/test/CodeGen/PowerPC/ppc32-nest.ll b/test/CodeGen/PowerPC/ppc32-nest.ll
new file mode 100644
index 000000000000..ed7bbe2b8f37
--- /dev/null
+++ b/test/CodeGen/PowerPC/ppc32-nest.ll
@@ -0,0 +1,26 @@
+; RUN: llc < %s | FileCheck %s
+target datalayout = "E-m:e-p:32:32-i64:64-n32"
+target triple = "powerpc-unknown-linux-gnu"
+
+; Tests that the 'nest' parameter attribute causes the relevant parameter to be
+; passed in the right register (r11 for PPC).
+
+define i8* @nest_receiver(i8* nest %arg) nounwind {
+; CHECK-LABEL: nest_receiver:
+; CHECK: # BB#0:
+; CHECK-NEXT: mr 3, 11
+; CHECK-NEXT: blr
+
+ ret i8* %arg
+}
+
+define i8* @nest_caller(i8* %arg) nounwind {
+; CHECK-LABEL: nest_caller:
+; CHECK: mr 11, 3
+; CHECK-NEXT: bl nest_receiver
+; CHECK: blr
+
+ %result = call i8* @nest_receiver(i8* nest %arg)
+ ret i8* %result
+}
+
diff --git a/test/CodeGen/PowerPC/ppc64-anyregcc.ll b/test/CodeGen/PowerPC/ppc64-anyregcc.ll
index 66f6a2c790c6..ff0768ff47ed 100644
--- a/test/CodeGen/PowerPC/ppc64-anyregcc.ll
+++ b/test/CodeGen/PowerPC/ppc64-anyregcc.ll
@@ -82,7 +82,7 @@ target triple = "powerpc64-unknown-linux-gnu"
; CHECK-NEXT: .long 3
define i64 @test() nounwind ssp uwtable {
entry:
- call anyregcc void (i64, i32, i8*, i32, ...) @llvm.experimental.patchpoint.void(i64 0, i32 24, i8* null, i32 2, i32 1, i32 2, i64 3)
+ call anyregcc void (i64, i32, i8*, i32, ...) @llvm.experimental.patchpoint.void(i64 0, i32 40, i8* null, i32 2, i32 1, i32 2, i64 3)
ret i64 0
}
@@ -104,7 +104,7 @@ entry:
define i64 @property_access1(i8* %obj) nounwind ssp uwtable {
entry:
%f = inttoptr i64 281474417671919 to i8*
- %ret = call anyregcc i64 (i64, i32, i8*, i32, ...) @llvm.experimental.patchpoint.i64(i64 1, i32 24, i8* %f, i32 1, i8* %obj)
+ %ret = call anyregcc i64 (i64, i32, i8*, i32, ...) @llvm.experimental.patchpoint.i64(i64 1, i32 40, i8* %f, i32 1, i8* %obj)
ret i64 %ret
}
@@ -127,7 +127,7 @@ define i64 @property_access2() nounwind ssp uwtable {
entry:
%obj = alloca i64, align 8
%f = inttoptr i64 281474417671919 to i8*
- %ret = call anyregcc i64 (i64, i32, i8*, i32, ...) @llvm.experimental.patchpoint.i64(i64 2, i32 24, i8* %f, i32 1, i64* %obj)
+ %ret = call anyregcc i64 (i64, i32, i8*, i32, ...) @llvm.experimental.patchpoint.i64(i64 2, i32 40, i8* %f, i32 1, i64* %obj)
ret i64 %ret
}
@@ -150,7 +150,7 @@ define i64 @property_access3() nounwind ssp uwtable {
entry:
%obj = alloca i64, align 8
%f = inttoptr i64 281474417671919 to i8*
- %ret = call anyregcc i64 (i64, i32, i8*, i32, ...) @llvm.experimental.patchpoint.i64(i64 3, i32 24, i8* %f, i32 0, i64* %obj)
+ %ret = call anyregcc i64 (i64, i32, i8*, i32, ...) @llvm.experimental.patchpoint.i64(i64 3, i32 40, i8* %f, i32 0, i64* %obj)
ret i64 %ret
}
@@ -232,7 +232,7 @@ entry:
define i64 @anyreg_test1(i8* %a1, i8* %a2, i8* %a3, i8* %a4, i8* %a5, i8* %a6, i8* %a7, i8* %a8, i8* %a9, i8* %a10, i8* %a11, i8* %a12, i8* %a13) nounwind ssp uwtable {
entry:
%f = inttoptr i64 281474417671919 to i8*
- %ret = call anyregcc i64 (i64, i32, i8*, i32, ...) @llvm.experimental.patchpoint.i64(i64 4, i32 24, i8* %f, i32 13, i8* %a1, i8* %a2, i8* %a3, i8* %a4, i8* %a5, i8* %a6, i8* %a7, i8* %a8, i8* %a9, i8* %a10, i8* %a11, i8* %a12, i8* %a13)
+ %ret = call anyregcc i64 (i64, i32, i8*, i32, ...) @llvm.experimental.patchpoint.i64(i64 4, i32 40, i8* %f, i32 13, i8* %a1, i8* %a2, i8* %a3, i8* %a4, i8* %a5, i8* %a6, i8* %a7, i8* %a8, i8* %a9, i8* %a10, i8* %a11, i8* %a12, i8* %a13)
ret i64 %ret
}
@@ -314,7 +314,7 @@ entry:
define i64 @anyreg_test2(i8* %a1, i8* %a2, i8* %a3, i8* %a4, i8* %a5, i8* %a6, i8* %a7, i8* %a8, i8* %a9, i8* %a10, i8* %a11, i8* %a12, i8* %a13) nounwind ssp uwtable {
entry:
%f = inttoptr i64 281474417671919 to i8*
- %ret = call anyregcc i64 (i64, i32, i8*, i32, ...) @llvm.experimental.patchpoint.i64(i64 5, i32 24, i8* %f, i32 8, i8* %a1, i8* %a2, i8* %a3, i8* %a4, i8* %a5, i8* %a6, i8* %a7, i8* %a8, i8* %a9, i8* %a10, i8* %a11, i8* %a12, i8* %a13)
+ %ret = call anyregcc i64 (i64, i32, i8*, i32, ...) @llvm.experimental.patchpoint.i64(i64 5, i32 40, i8* %f, i32 8, i8* %a1, i8* %a2, i8* %a3, i8* %a4, i8* %a5, i8* %a6, i8* %a7, i8* %a8, i8* %a9, i8* %a10, i8* %a11, i8* %a12, i8* %a13)
ret i64 %ret
}
@@ -342,7 +342,7 @@ entry:
; CHECK-NEXT: .long 0
define i64 @patchpoint_spilldef(i64 %p1, i64 %p2, i64 %p3, i64 %p4) {
entry:
- %result = tail call anyregcc i64 (i64, i32, i8*, i32, ...) @llvm.experimental.patchpoint.i64(i64 12, i32 24, i8* inttoptr (i64 0 to i8*), i32 2, i64 %p1, i64 %p2)
+ %result = tail call anyregcc i64 (i64, i32, i8*, i32, ...) @llvm.experimental.patchpoint.i64(i64 12, i32 40, i8* inttoptr (i64 0 to i8*), i32 2, i64 %p1, i64 %p2)
  tail call void asm sideeffect "nop", "~{r0},~{r3},~{r4},~{r5},~{r6},~{r7},~{r8},~{r9},~{r10},~{r11},~{r12},~{r14},~{r15},~{r16},~{r17},~{r18},~{r19},~{r20},~{r21},~{r22},~{r23},~{r24},~{r25},~{r26},~{r27},~{r28},~{r29},~{r30},~{r31}"() nounwind
ret i64 %result
@@ -384,7 +384,7 @@ define i64 @patchpoint_spillargs(i64 %p1, i64 %p2, i64 %p3, i64 %p4) {
entry:
  tail call void asm sideeffect "nop", "~{r0},~{r3},~{r4},~{r5},~{r6},~{r7},~{r8},~{r9},~{r10},~{r11},~{r12},~{r14},~{r15},~{r16},~{r17},~{r18},~{r19},~{r20},~{r21},~{r22},~{r23},~{r24},~{r25},~{r26},~{r27},~{r28},~{r29},~{r30},~{r31}"() nounwind
- %result = tail call anyregcc i64 (i64, i32, i8*, i32, ...) @llvm.experimental.patchpoint.i64(i64 13, i32 24, i8* inttoptr (i64 0 to i8*), i32 2, i64 %p1, i64 %p2, i64 %p3, i64 %p4)
+ %result = tail call anyregcc i64 (i64, i32, i8*, i32, ...) @llvm.experimental.patchpoint.i64(i64 13, i32 40, i8* inttoptr (i64 0 to i8*), i32 2, i64 %p1, i64 %p2, i64 %p3, i64 %p4)
ret i64 %result
}
diff --git a/test/CodeGen/PowerPC/ppc64-fastcc-fast-isel.ll b/test/CodeGen/PowerPC/ppc64-fastcc-fast-isel.ll
index f90519836c25..92d6d556738c 100644
--- a/test/CodeGen/PowerPC/ppc64-fastcc-fast-isel.ll
+++ b/test/CodeGen/PowerPC/ppc64-fastcc-fast-isel.ll
@@ -35,7 +35,7 @@ define fastcc double @f2(i64 %g1, double %f1, i64 %g2, double %f2, i64 %g3, doub
}
define void @cg2(i64 %v) #0 {
- tail call fastcc i64 @g1(i64 0, double 0.0, i64 %v, double 0.0, i64 0, double 0.0, i64 0, double 0.0)
+ call fastcc i64 @g1(i64 0, double 0.0, i64 %v, double 0.0, i64 0, double 0.0, i64 0, double 0.0)
ret void
; CHECK-LABEL: @cg2
@@ -44,11 +44,11 @@ define void @cg2(i64 %v) #0 {
}
define void @cf2(double %v) #0 {
- tail call fastcc i64 @g1(i64 0, double 0.0, i64 0, double %v, i64 0, double 0.0, i64 0, double 0.0)
+ call fastcc i64 @g1(i64 0, double 0.0, i64 0, double %v, i64 0, double 0.0, i64 0, double 0.0)
ret void
; CHECK-LABEL: @cf2
-; CHECK: mr 2, 1
+; CHECK: fmr 2, 1
; CHECK: blr
}
diff --git a/test/CodeGen/PowerPC/ppc64-fastcc.ll b/test/CodeGen/PowerPC/ppc64-fastcc.ll
index bb1365a3b675..69e15d104da8 100644
--- a/test/CodeGen/PowerPC/ppc64-fastcc.ll
+++ b/test/CodeGen/PowerPC/ppc64-fastcc.ll
@@ -521,8 +521,9 @@ define void @cv13(<4 x i32> %v) #0 {
ret void
; CHECK-LABEL: @cv13
-; CHECK: li [[REG1:[0-9]+]], 96
-; CHECK: stvx 2, 1, [[REG1]]
+; CHECK-DAG: li [[REG1:[0-9]+]], 96
+; CHECK-DAG: vor [[REG2:[0-9]+]], 2, 2
+; CHECK: stvx [[REG2]], 1, [[REG1]]
; CHECK: blr
}
@@ -531,8 +532,9 @@ define void @cv14(<4 x i32> %v) #0 {
ret void
; CHECK-LABEL: @cv14
-; CHECK: li [[REG1:[0-9]+]], 128
-; CHECK: stvx 2, 1, [[REG1]]
+; CHECK-DAG: li [[REG1:[0-9]+]], 128
+; CHECK-DAG: vor [[REG2:[0-9]+]], 2, 2
+; CHECK: stvx [[REG2]], 1, [[REG1]]
; CHECK: blr
}
diff --git a/test/CodeGen/PowerPC/ppc64-nest.ll b/test/CodeGen/PowerPC/ppc64-nest.ll
new file mode 100644
index 000000000000..9dd88db2fb5d
--- /dev/null
+++ b/test/CodeGen/PowerPC/ppc64-nest.ll
@@ -0,0 +1,42 @@
+; RUN: llc < %s | FileCheck %s
+target datalayout = "E-m:e-i64:64-n32:64"
+target triple = "powerpc64-unknown-linux-gnu"
+
+; Tests that the 'nest' parameter attribute causes the relevant parameter to be
+; passed in the right register (r11 for PPC).
+
+define i8* @nest_receiver(i8* nest %arg) nounwind {
+; CHECK-LABEL: nest_receiver:
+; CHECK: # BB#0:
+; CHECK-NEXT: mr 3, 11
+; CHECK-NEXT: blr
+
+ ret i8* %arg
+}
+
+define i8* @nest_caller(i8* %arg) nounwind {
+; CHECK-LABEL: nest_caller:
+; CHECK: mr 11, 3
+; CHECK-NEXT: bl nest_receiver
+; CHECK: blr
+
+ %result = call i8* @nest_receiver(i8* nest %arg)
+ ret i8* %result
+}
+
+define void @test_indirect(i32 ()* nocapture %f, i8* %p) {
+entry:
+
+; CHECK-LABEL: test_indirect
+; CHECK-DAG: ld [[DEST:[0-9]+]], 0(3)
+; CHECK-DAG: ld 2, 8(3)
+; CHECK-DAG: mr 11, 4
+; CHECK: mtctr [[DEST]]
+; CHECK: bctrl
+; CHECK: blr
+
+ %callee.knr.cast = bitcast i32 ()* %f to i32 (i8*)*
+ %call = tail call signext i32 %callee.knr.cast(i8* nest %p)
+ ret void
+}
+
diff --git a/test/CodeGen/PowerPC/ppc64-patchpoint.ll b/test/CodeGen/PowerPC/ppc64-patchpoint.ll
index 67b26268a3a3..53b737ae9a0b 100644
--- a/test/CodeGen/PowerPC/ppc64-patchpoint.ll
+++ b/test/CodeGen/PowerPC/ppc64-patchpoint.ll
@@ -15,22 +15,34 @@ entry:
; CHECK-NEXT: rldic 12, 12, 32, 16
; CHECK-NEXT: oris 12, 12, 48879
; CHECK-NEXT: ori 12, 12, 51966
+; CHECK-LE-NEXT: std 2, 24(1)
+; CHECK-BE-NEXT: std 2, 40(1)
+; CHECK-BE-NEXT: ld 2, 8(12)
+; CHECK-BE-NEXT: ld 12, 0(12)
; CHECK-NEXT: mtctr 12
; CHECK-NEXT: bctrl
+; CHECK-LE-NEXT: ld 2, 24(1)
+; CHECK-BE-NEXT: ld 2, 40(1)
; CHECK: li 12, -8531
; CHECK-NEXT: rldic 12, 12, 32, 16
; CHECK-NEXT: oris 12, 12, 48879
; CHECK-NEXT: ori 12, 12, 51967
+; CHECK-LE-NEXT: std 2, 24(1)
+; CHECK-BE-NEXT: std 2, 40(1)
+; CHECK-BE-NEXT: ld 2, 8(12)
+; CHECK-BE-NEXT: ld 12, 0(12)
; CHECK-NEXT: mtctr 12
; CHECK-NEXT: bctrl
+; CHECK-LE-NEXT: ld 2, 24(1)
+; CHECK-BE-NEXT: ld 2, 40(1)
; CHECK: blr
%resolveCall2 = inttoptr i64 244837814094590 to i8*
- %result = tail call i64 (i64, i32, i8*, i32, ...) @llvm.experimental.patchpoint.i64(i64 2, i32 24, i8* %resolveCall2, i32 4, i64 %p1, i64 %p2, i64 %p3, i64 %p4)
+ %result = tail call i64 (i64, i32, i8*, i32, ...) @llvm.experimental.patchpoint.i64(i64 2, i32 40, i8* %resolveCall2, i32 4, i64 %p1, i64 %p2, i64 %p3, i64 %p4)
%resolveCall3 = inttoptr i64 244837814094591 to i8*
- tail call void (i64, i32, i8*, i32, ...) @llvm.experimental.patchpoint.void(i64 3, i32 24, i8* %resolveCall3, i32 2, i64 %p1, i64 %result)
+ tail call void (i64, i32, i8*, i32, ...) @llvm.experimental.patchpoint.void(i64 3, i32 40, i8* %resolveCall3, i32 2, i64 %p1, i64 %result)
ret i64 %result
}
@@ -65,13 +77,13 @@ entry:
%tmp81 = inttoptr i64 %tmp80 to i64*
%tmp82 = load i64, i64* %tmp81, align 8
tail call void (i64, i32, ...) @llvm.experimental.stackmap(i64 14, i32 8, i64 %arg, i64 %tmp2, i64 %tmp10, i64 %tmp82)
- tail call void (i64, i32, i8*, i32, ...) @llvm.experimental.patchpoint.void(i64 15, i32 32, i8* null, i32 3, i64 %arg, i64 %tmp10, i64 %tmp82)
+ tail call void (i64, i32, i8*, i32, ...) @llvm.experimental.patchpoint.void(i64 15, i32 48, i8* null, i32 3, i64 %arg, i64 %tmp10, i64 %tmp82)
%tmp83 = load i64, i64* %tmp33, align 8
%tmp84 = add i64 %tmp83, -24
%tmp85 = inttoptr i64 %tmp84 to i64*
%tmp86 = load i64, i64* %tmp85, align 8
tail call void (i64, i32, ...) @llvm.experimental.stackmap(i64 17, i32 8, i64 %arg, i64 %tmp10, i64 %tmp86)
- tail call void (i64, i32, i8*, i32, ...) @llvm.experimental.patchpoint.void(i64 18, i32 32, i8* null, i32 3, i64 %arg, i64 %tmp10, i64 %tmp86)
+ tail call void (i64, i32, i8*, i32, ...) @llvm.experimental.patchpoint.void(i64 18, i32 48, i8* null, i32 3, i64 %arg, i64 %tmp10, i64 %tmp86)
ret i64 10
}
diff --git a/test/CodeGen/PowerPC/ppc64-stackmap.ll b/test/CodeGen/PowerPC/ppc64-stackmap.ll
index 917fa7422512..a77339f8e475 100644
--- a/test/CodeGen/PowerPC/ppc64-stackmap.ll
+++ b/test/CodeGen/PowerPC/ppc64-stackmap.ll
@@ -112,7 +112,7 @@ target triple = "powerpc64-unknown-linux-gnu"
define void @constantargs() {
entry:
%0 = inttoptr i64 244837814094590 to i8*
- tail call void (i64, i32, i8*, i32, ...) @llvm.experimental.patchpoint.void(i64 1, i32 24, i8* %0, i32 0, i64 65535, i64 65536, i64 4294967295, i64 4294967296)
+ tail call void (i64, i32, i8*, i32, ...) @llvm.experimental.patchpoint.void(i64 1, i32 40, i8* %0, i32 0, i64 65535, i64 65536, i64 4294967295, i64 4294967296)
ret void
}
@@ -160,7 +160,7 @@ entry:
cold:
; OSR patchpoint with 12-byte nop-slide and 2 live vars.
%thunk = inttoptr i64 244837814094590 to i8*
- call void (i64, i32, i8*, i32, ...) @llvm.experimental.patchpoint.void(i64 4, i32 24, i8* %thunk, i32 0, i64 %a, i64 %b)
+ call void (i64, i32, i8*, i32, ...) @llvm.experimental.patchpoint.void(i64 4, i32 40, i8* %thunk, i32 0, i64 %a, i64 %b)
unreachable
ret:
ret void
@@ -176,7 +176,7 @@ ret:
define i64 @propertyRead(i64* %obj) {
entry:
%resolveRead = inttoptr i64 244837814094590 to i8*
- %result = call i64 (i64, i32, i8*, i32, ...) @llvm.experimental.patchpoint.i64(i64 5, i32 24, i8* %resolveRead, i32 1, i64* %obj)
+ %result = call i64 (i64, i32, i8*, i32, ...) @llvm.experimental.patchpoint.i64(i64 5, i32 40, i8* %resolveRead, i32 1, i64* %obj)
%add = add i64 %result, 3
ret i64 %add
}
@@ -196,7 +196,7 @@ entry:
define void @propertyWrite(i64 %dummy1, i64* %obj, i64 %dummy2, i64 %a) {
entry:
%resolveWrite = inttoptr i64 244837814094590 to i8*
- call anyregcc void (i64, i32, i8*, i32, ...) @llvm.experimental.patchpoint.void(i64 6, i32 24, i8* %resolveWrite, i32 2, i64* %obj, i64 %a)
+ call anyregcc void (i64, i32, i8*, i32, ...) @llvm.experimental.patchpoint.void(i64 6, i32 40, i8* %resolveWrite, i32 2, i64* %obj, i64 %a)
ret void
}
@@ -218,7 +218,7 @@ entry:
define void @jsVoidCall(i64 %dummy1, i64* %obj, i64 %arg, i64 %l1, i64 %l2) {
entry:
%resolveCall = inttoptr i64 244837814094590 to i8*
- call void (i64, i32, i8*, i32, ...) @llvm.experimental.patchpoint.void(i64 7, i32 24, i8* %resolveCall, i32 2, i64* %obj, i64 %arg, i64 %l1, i64 %l2)
+ call void (i64, i32, i8*, i32, ...) @llvm.experimental.patchpoint.void(i64 7, i32 40, i8* %resolveCall, i32 2, i64* %obj, i64 %arg, i64 %l1, i64 %l2)
ret void
}
@@ -240,7 +240,7 @@ entry:
define i64 @jsIntCall(i64 %dummy1, i64* %obj, i64 %arg, i64 %l1, i64 %l2) {
entry:
%resolveCall = inttoptr i64 244837814094590 to i8*
- %result = call i64 (i64, i32, i8*, i32, ...) @llvm.experimental.patchpoint.i64(i64 8, i32 24, i8* %resolveCall, i32 2, i64* %obj, i64 %arg, i64 %l1, i64 %l2)
+ %result = call i64 (i64, i32, i8*, i32, ...) @llvm.experimental.patchpoint.i64(i64 8, i32 40, i8* %resolveCall, i32 2, i64* %obj, i64 %arg, i64 %l1, i64 %l2)
%add = add i64 %result, 3
ret i64 %add
}
@@ -260,7 +260,7 @@ entry:
; CHECK-NEXT: .short 31
define void @spilledValue(i64 %arg0, i64 %arg1, i64 %arg2, i64 %arg3, i64 %arg4, i64 %l0, i64 %l1, i64 %l2, i64 %l3, i64 %l4, i64 %l5, i64 %l6, i64 %l7, i64 %l8, i64 %l9, i64 %l10, i64 %l11, i64 %l12, i64 %l13, i64 %l14, i64 %l15, i64 %l16, i64 %l17, i64 %l18, i64 %l19, i64 %l20, i64 %l21, i64 %l22, i64 %l23, i64 %l24, i64 %l25, i64 %l26, i64 %l27) {
entry:
- call void (i64, i32, i8*, i32, ...) @llvm.experimental.patchpoint.void(i64 11, i32 24, i8* null, i32 5, i64 %arg0, i64 %arg1, i64 %arg2, i64 %arg3, i64 %arg4, i64 %l0, i64 %l1, i64 %l2, i64 %l3, i64 %l4, i64 %l5, i64 %l6, i64 %l7, i64 %l8, i64 %l9, i64 %l10, i64 %l11, i64 %l12, i64 %l13, i64 %l14, i64 %l15, i64 %l16, i64 %l17, i64 %l18, i64 %l19, i64 %l20, i64 %l21, i64 %l22, i64 %l23, i64 %l24, i64 %l25, i64 %l26, i64 %l27)
+ call void (i64, i32, i8*, i32, ...) @llvm.experimental.patchpoint.void(i64 11, i32 40, i8* null, i32 5, i64 %arg0, i64 %arg1, i64 %arg2, i64 %arg3, i64 %arg4, i64 %l0, i64 %l1, i64 %l2, i64 %l3, i64 %l4, i64 %l5, i64 %l6, i64 %l7, i64 %l8, i64 %l9, i64 %l10, i64 %l11, i64 %l12, i64 %l13, i64 %l14, i64 %l15, i64 %l16, i64 %l17, i64 %l18, i64 %l19, i64 %l20, i64 %l21, i64 %l22, i64 %l23, i64 %l24, i64 %l25, i64 %l26, i64 %l27)
ret void
}
diff --git a/test/CodeGen/PowerPC/recipest.ll b/test/CodeGen/PowerPC/recipest.ll
index cd77548b281b..41dcb0f5b3fc 100644
--- a/test/CodeGen/PowerPC/recipest.ll
+++ b/test/CodeGen/PowerPC/recipest.ll
@@ -1,4 +1,5 @@
; RUN: llc < %s -mtriple=powerpc64-unknown-linux-gnu -mcpu=pwr7 -enable-unsafe-fp-math -mattr=-vsx | FileCheck %s
+; RUN: llc < %s -mtriple=powerpc64-unknown-linux-gnu -mcpu=pwr7 -enable-unsafe-fp-math -mattr=-vsx -recip=sqrtf:0,sqrtd:0 | FileCheck %s -check-prefix=CHECK-NONR
; RUN: llc < %s -mtriple=powerpc64-unknown-linux-gnu -mcpu=pwr7 -mattr=-vsx | FileCheck -check-prefix=CHECK-SAFE %s
target datalayout = "E-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-f128:128:128-v128:128:128-n32:64"
target triple = "powerpc64-unknown-linux-gnu"
@@ -24,6 +25,13 @@ define double @foo(double %a, double %b) nounwind {
; CHECK-NEXT: fmul
; CHECK: blr
+; CHECK-NONR: @foo
+; CHECK-NONR: frsqrte
+; CHECK-NONR-NOT: fmadd
+; CHECK-NONR: fmul
+; CHECK-NONR-NOT: fmadd
+; CHECK-NONR: blr
+
; CHECK-SAFE: @foo
; CHECK-SAFE: fsqrt
; CHECK-SAFE: fdiv
@@ -90,6 +98,13 @@ define float @goo(float %a, float %b) nounwind {
; CHECK-NEXT: fmuls
; CHECK-NEXT: blr
+; CHECK-NONR: @goo
+; CHECK-NONR: frsqrtes
+; CHECK-NONR-NOT: fmadds
+; CHECK-NONR: fmuls
+; CHECK-NONR-NOT: fmadds
+; CHECK-NONR: blr
+
; CHECK-SAFE: @goo
; CHECK-SAFE: fsqrts
; CHECK-SAFE: fdivs
diff --git a/test/CodeGen/PowerPC/sjlj.ll b/test/CodeGen/PowerPC/sjlj.ll
index 62403e711968..dcbdd69d5d50 100644
--- a/test/CodeGen/PowerPC/sjlj.ll
+++ b/test/CodeGen/PowerPC/sjlj.ll
@@ -18,10 +18,10 @@ entry:
; CHECK: addi [[REG]], [[REG]], env_sigill@toc@l
; CHECK: ld 31, 0([[REG]])
; CHECK: ld [[REG2:[0-9]+]], 8([[REG]])
-; CHECK: ld 1, 16([[REG]])
-; CHECK: mtctr [[REG2]]
-; CHECK: ld 30, 32([[REG]])
-; CHECK: ld 2, 24([[REG]])
+; CHECK-DAG: ld 1, 16([[REG]])
+; CHECK-DAG: mtctr [[REG2]]
+; CHECK-DAG: ld 30, 32([[REG]])
+; CHECK-DAG: ld 2, 24([[REG]])
; CHECK: bctr
return: ; No predecessors!
diff --git a/test/CodeGen/PowerPC/swaps-le-3.ll b/test/CodeGen/PowerPC/swaps-le-3.ll
index 0c1748df9fcd..49b93976d310 100644
--- a/test/CodeGen/PowerPC/swaps-le-3.ll
+++ b/test/CodeGen/PowerPC/swaps-le-3.ll
@@ -17,8 +17,8 @@ entry:
}
; CHECK-LABEL: @test
-; CHECK: xxspltd
-; CHECK: lxvd2x
+; CHECK-DAG: xxspltd
+; CHECK-DAG: lxvd2x
; CHECK: xvadddp
; CHECK: stxvd2x
; CHECK-NOT: xxswapd
diff --git a/test/CodeGen/PowerPC/swaps-le-5.ll b/test/CodeGen/PowerPC/swaps-le-5.ll
new file mode 100644
index 000000000000..5cd739a0efa9
--- /dev/null
+++ b/test/CodeGen/PowerPC/swaps-le-5.ll
@@ -0,0 +1,70 @@
+; RUN: llc -mcpu=pwr8 -mtriple=powerpc64le-unknown-linux-gnu -O3 < %s | FileCheck %s
+
+; These tests verify that VSX swap optimization works for various
+; manipulations of <2 x double> vectors.
+
+@x = global <2 x double> <double 9.970000e+01, double -1.032220e+02>, align 16
+@z = global <2 x double> <double 2.332000e+01, double 3.111111e+01>, align 16
+
+define void @bar0(double %y) {
+entry:
+ %0 = load <2 x double>, <2 x double>* @x, align 16
+ %vecins = insertelement <2 x double> %0, double %y, i32 0
+ store <2 x double> %vecins, <2 x double>* @z, align 16
+ ret void
+}
+
+; CHECK-LABEL: @bar0
+; CHECK-DAG: xxswapd {{[0-9]+}}, 1
+; CHECK-DAG: lxvd2x [[REG1:[0-9]+]]
+; CHECK-DAG: xxspltd [[REG2:[0-9]+]]
+; CHECK: xxpermdi [[REG3:[0-9]+]], [[REG2]], [[REG1]], 1
+; CHECK: stxvd2x [[REG3]]
+
+define void @bar1(double %y) {
+entry:
+ %0 = load <2 x double>, <2 x double>* @x, align 16
+ %vecins = insertelement <2 x double> %0, double %y, i32 1
+ store <2 x double> %vecins, <2 x double>* @z, align 16
+ ret void
+}
+
+; CHECK-LABEL: @bar1
+; CHECK-DAG: xxswapd {{[0-9]+}}, 1
+; CHECK-DAG: lxvd2x [[REG1:[0-9]+]]
+; CHECK-DAG: xxspltd [[REG2:[0-9]+]]
+; CHECK: xxmrghd [[REG3:[0-9]+]], [[REG1]], [[REG2]]
+; CHECK: stxvd2x [[REG3]]
+
+define void @baz0() {
+entry:
+ %0 = load <2 x double>, <2 x double>* @z, align 16
+ %1 = load <2 x double>, <2 x double>* @x, align 16
+ %vecins = shufflevector <2 x double> %0, <2 x double> %1, <2 x i32> <i32 0, i32 2>
+ store <2 x double> %vecins, <2 x double>* @z, align 16
+ ret void
+}
+
+; CHECK-LABEL: @baz0
+; CHECK: lxvd2x
+; CHECK: lxvd2x
+; CHECK: xxmrghd
+; CHECK: stxvd2x
+; CHECK-NOT: xxswapd
+
+define void @baz1() {
+entry:
+ %0 = load <2 x double>, <2 x double>* @z, align 16
+ %1 = load <2 x double>, <2 x double>* @x, align 16
+ %vecins = shufflevector <2 x double> %0, <2 x double> %1, <2 x i32> <i32 3, i32 1>
+ store <2 x double> %vecins, <2 x double>* @z, align 16
+ ret void
+}
+
+; CHECK-LABEL: @baz1
+; CHECK: lxvd2x
+; CHECK: lxvd2x
+; CHECK: xxmrgld
+; CHECK: stxvd2x
+; CHECK-NOT: xxswapd
+
diff --git a/test/CodeGen/PowerPC/tls-store2.ll b/test/CodeGen/PowerPC/tls-store2.ll
index e9aa17e8c0ff..649508637f4e 100644
--- a/test/CodeGen/PowerPC/tls-store2.ll
+++ b/test/CodeGen/PowerPC/tls-store2.ll
@@ -29,6 +29,8 @@ entry:
; CHECK: addi 3, {{[0-9]+}}, __once_call@got@tlsgd@l
; CHECK: bl __tls_get_addr(__once_call@tlsgd)
; CHECK-NEXT: nop
-; CHECK: std {{[0-9]+}}, 0(3)
+; FIXME: We don't really need the copy here either; we could move the store up.
+; CHECK: mr [[REG1:[0-9]+]], 3
+; CHECK: std {{[0-9]+}}, 0([[REG1]])
declare void @__once_call_impl()
diff --git a/test/CodeGen/PowerPC/vsx-elementary-arith.ll b/test/CodeGen/PowerPC/vsx-elementary-arith.ll
index d8f76bb989e7..5416f667aef1 100644
--- a/test/CodeGen/PowerPC/vsx-elementary-arith.ll
+++ b/test/CodeGen/PowerPC/vsx-elementary-arith.ll
@@ -116,5 +116,36 @@ entry:
; CHECK: xssqrtdp {{[0-9]+}}
}
+; Vector forms
+; Function Attrs: nounwind
+define <4 x float> @emit_xvrsqrtesp() {
+entry:
+; CHECK-LABEL: @emit_xvrsqrtesp
+ %vf = alloca <4 x float>, align 16
+ %vfr = alloca <4 x float>, align 16
+ %0 = load <4 x float>, <4 x float>* %vf, align 16
+ %call = call <4 x float> @llvm.ppc.vsx.xvrsqrtesp(<4 x float> %0)
+; CHECK: xvrsqrtesp {{[0-9]+}}, {{[0-9]+}}
+ ret <4 x float> %call
+}
+
+; Function Attrs: nounwind
+define <2 x double> @emit_xvrsqrtedp() {
+entry:
+; CHECK-LABEL: @emit_xvrsqrtedp
+ %vd = alloca <2 x double>, align 16
+ %vdr = alloca <2 x double>, align 16
+ %0 = load <2 x double>, <2 x double>* %vd, align 16
+ %call = call <2 x double> @llvm.ppc.vsx.xvrsqrtedp(<2 x double> %0)
+ ret <2 x double> %call
+; CHECK: xvrsqrtedp {{[0-9]+}}, {{[0-9]+}}
+}
+
; Function Attrs: nounwind
declare double @sqrt(double)
+
+; Function Attrs: nounwind readnone
+declare <4 x float> @llvm.ppc.vsx.xvrsqrtesp(<4 x float>)
+
+; Function Attrs: nounwind readnone
+declare <2 x double> @llvm.ppc.vsx.xvrsqrtedp(<2 x double>)
diff --git a/test/CodeGen/PowerPC/vsx-fma-m.ll b/test/CodeGen/PowerPC/vsx-fma-m.ll
index d85927396e3e..4f556b6b79c2 100644
--- a/test/CodeGen/PowerPC/vsx-fma-m.ll
+++ b/test/CodeGen/PowerPC/vsx-fma-m.ll
@@ -49,12 +49,13 @@ entry:
; CHECK-LABEL: @test2
; CHECK-DAG: li [[C1:[0-9]+]], 8
; CHECK-DAG: li [[C2:[0-9]+]], 16
-; CHECK-DAG: xsmaddmdp 3, 2, 1
-; CHECK-DAG: xsmaddmdp 4, 2, 1
-; CHECK-DAG: xsmaddadp 1, 2, 5
-; CHECK-DAG: stxsdx 3, 0, 8
-; CHECK-DAG: stxsdx 4, 8, [[C1]]
-; CHECK-DAG: stxsdx 1, 8, [[C2]]
+; FIXME: We no longer get this because of copy ordering at the MI level.
+; CHECX-DAG: xsmaddmdp 3, 2, 1
+; CHECX-DAG: xsmaddmdp 4, 2, 1
+; CHECX-DAG: xsmaddadp 1, 2, 5
+; CHECX-DAG: stxsdx 3, 0, 8
+; CHECX-DAG: stxsdx 4, 8, [[C1]]
+; CHECX-DAG: stxsdx 1, 8, [[C2]]
; CHECK: blr
; CHECK-FISL-LABEL: @test2
@@ -213,14 +214,15 @@ entry:
ret void
; CHECK-LABEL: @testv2
-; CHECK-DAG: xvmaddmdp 36, 35, 34
-; CHECK-DAG: xvmaddmdp 37, 35, 34
-; CHECK-DAG: li [[C1:[0-9]+]], 16
-; CHECK-DAG: li [[C2:[0-9]+]], 32
-; CHECK-DAG: xvmaddadp 34, 35, 38
-; CHECK-DAG: stxvd2x 36, 0, 3
-; CHECK-DAG: stxvd2x 37, 3, [[C1:[0-9]+]]
-; CHECK-DAG: stxvd2x 34, 3, [[C2:[0-9]+]]
+; FIXME: We currently don't get this because of copy ordering on the MI level.
+; CHECX-DAG: xvmaddmdp 36, 35, 34
+; CHECX-DAG: xvmaddmdp 37, 35, 34
+; CHECX-DAG: li [[C1:[0-9]+]], 16
+; CHECX-DAG: li [[C2:[0-9]+]], 32
+; CHECX-DAG: xvmaddadp 34, 35, 38
+; CHECX-DAG: stxvd2x 36, 0, 3
+; CHECX-DAG: stxvd2x 37, 3, [[C1:[0-9]+]]
+; CHECX-DAG: stxvd2x 34, 3, [[C2:[0-9]+]]
; CHECK: blr
; CHECK-FISL-LABEL: @testv2
diff --git a/test/CodeGen/PowerPC/vsx-fma-sp.ll b/test/CodeGen/PowerPC/vsx-fma-sp.ll
index 1c3e457f92cb..b4dd2e1627c4 100644
--- a/test/CodeGen/PowerPC/vsx-fma-sp.ll
+++ b/test/CodeGen/PowerPC/vsx-fma-sp.ll
@@ -42,12 +42,13 @@ entry:
; CHECK-LABEL: @test2sp
; CHECK-DAG: li [[C1:[0-9]+]], 4
; CHECK-DAG: li [[C2:[0-9]+]], 8
-; CHECK-DAG: xsmaddmsp 3, 2, 1
-; CHECK-DAG: xsmaddmsp 4, 2, 1
-; CHECK-DAG: xsmaddasp 1, 2, 5
-; CHECK-DAG: stxsspx 3, 0, 8
-; CHECK-DAG: stxsspx 4, 8, [[C1]]
-; CHECK-DAG: stxsspx 1, 8, [[C2]]
+; FIXME: We now miss this because of copy ordering at the MI level.
+; CHECX-DAG: xsmaddmsp 3, 2, 1
+; CHECX-DAG: xsmaddmsp 4, 2, 1
+; CHECX-DAG: xsmaddasp 1, 2, 5
+; CHECX-DAG: stxsspx 3, 0, 8
+; CHECX-DAG: stxsspx 4, 8, [[C1]]
+; CHECX-DAG: stxsspx 1, 8, [[C2]]
; CHECK: blr
; CHECK-FISL-LABEL: @test2sp
diff --git a/test/CodeGen/SPARC/basictest.ll b/test/CodeGen/SPARC/basictest.ll
index 7b540074a35f..3792100b2e63 100644
--- a/test/CodeGen/SPARC/basictest.ll
+++ b/test/CodeGen/SPARC/basictest.ll
@@ -38,7 +38,7 @@ entry:
; CHECK-LABEL: signed_divide:
; CHECK: sra %o0, 31, %o2
-; CHECK: wr %o2, %g0, %y
+; CHECK: wr %g0, %o2, %y
; CHECK: sdiv %o0, %o1, %o0
define i32 @signed_divide(i32 %a, i32 %b) {
%r = sdiv i32 %a, %b
diff --git a/test/CodeGen/SPARC/multiple-div.ll b/test/CodeGen/SPARC/multiple-div.ll
new file mode 100644
index 000000000000..6934f69ac18c
--- /dev/null
+++ b/test/CodeGen/SPARC/multiple-div.ll
@@ -0,0 +1,21 @@
+; RUN: llc < %s -march=sparc | FileCheck %s
+; RUN: llc -O0 < %s -march=sparc | FileCheck %s
+
+;; llc -O0 used to try to spill Y to the stack, which isn't possible,
+;; and then crashed. Additionally, in -O1, it would omit the second
+;; apparently-redundant wr to %y, which is not actually redundant
+;; because the spec says to treat %y as potentially-written by udiv.
+
+; CHECK-LABEL: two_divides:
+; CHECK: wr %g0, %g0, %y
+; CHECK: udiv
+; CHECK: wr %g0, %g0, %y
+; CHECK: udiv
+; CHECK: add
+
+define i32 @two_divides(i32 %a, i32 %b) {
+ %r = udiv i32 %a, %b
+ %r2 = udiv i32 %b, %a
+ %r3 = add i32 %r, %r2
+ ret i32 %r3
+}
diff --git a/test/CodeGen/Thumb2/aapcs.ll b/test/CodeGen/Thumb2/aapcs.ll
index 21af8c119b04..299562fe4c5c 100644
--- a/test/CodeGen/Thumb2/aapcs.ll
+++ b/test/CodeGen/Thumb2/aapcs.ll
@@ -33,8 +33,7 @@ define float @float_on_stack(double %a, double %b, double %c, double %d, double
define double @double_on_stack(double %a, double %b, double %c, double %d, double %e, double %f, double %g, double %h, double %i) {
; CHECK-LABEL: double_on_stack:
-; SOFT: ldr r0, [sp, #48]
-; SOFT: ldr r1, [sp, #52]
+; SOFT: ldrd r0, r1, [sp, #48]
; HARD: vldr d0, [sp]
; CHECK-NEXT: bx lr
ret double %i
@@ -42,8 +41,7 @@ define double @double_on_stack(double %a, double %b, double %c, double %d, doubl
define double @double_not_split(double %a, double %b, double %c, double %d, double %e, double %f, double %g, float %h, double %i) {
; CHECK-LABEL: double_not_split:
-; SOFT: ldr r0, [sp, #48]
-; SOFT: ldr r1, [sp, #52]
+; SOFT: ldrd r0, r1, [sp, #48]
; HARD: vldr d0, [sp]
; CHECK-NEXT: bx lr
ret double %i
diff --git a/test/CodeGen/WebAssembly/lit.local.cfg b/test/CodeGen/WebAssembly/lit.local.cfg
new file mode 100644
index 000000000000..743473517cd0
--- /dev/null
+++ b/test/CodeGen/WebAssembly/lit.local.cfg
@@ -0,0 +1,2 @@
+if not 'WebAssembly' in config.root.targets:
+ config.unsupported = True
diff --git a/test/CodeGen/WinEH/cppeh-alloca-sink.ll b/test/CodeGen/WinEH/cppeh-alloca-sink.ll
index cc6cec9e4d69..f215dca2ddd3 100644
--- a/test/CodeGen/WinEH/cppeh-alloca-sink.ll
+++ b/test/CodeGen/WinEH/cppeh-alloca-sink.ll
@@ -81,7 +81,7 @@ eh.resume: ; preds = %lpad
}
; CHECK-LABEL: define void @sink_alloca_to_catch()
-; CHECK: call void (...) @llvm.frameescape(i32* %only_used_in_catch)
+; CHECK: call void (...) @llvm.localescape(i32* %only_used_in_catch)
declare void @use_catch_var(i32*) #1
@@ -162,14 +162,14 @@ eh.resume: ; preds = %lpad1, %catch.dispa
}
; CHECK-LABEL: define void @dont_sink_alloca_to_catch(i32 %n)
-; CHECK: call void (...) @llvm.frameescape(i32* %live_in_out_catch)
+; CHECK: call void (...) @llvm.localescape(i32* %live_in_out_catch)
; CHECK-LABEL: define internal i8* @sink_alloca_to_catch.catch(i8*, i8*)
-; CHECK: %only_used_in_catch.i8 = call i8* @llvm.framerecover({{.*}}, i32 0)
+; CHECK: %only_used_in_catch.i8 = call i8* @llvm.localrecover({{.*}}, i32 0)
; CHECK: %only_used_in_catch = bitcast
; CHECK-LABEL: define internal i8* @dont_sink_alloca_to_catch.catch(i8*, i8*)
-; CHECK: %live_in_out_catch.i8 = call i8* @llvm.framerecover({{.*}}, i32 0)
+; CHECK: %live_in_out_catch.i8 = call i8* @llvm.localrecover({{.*}}, i32 0)
; CHECK: %live_in_out_catch = bitcast
diff --git a/test/CodeGen/WinEH/cppeh-catch-all-win32.ll b/test/CodeGen/WinEH/cppeh-catch-all-win32.ll
new file mode 100644
index 000000000000..b2e84b90d69f
--- /dev/null
+++ b/test/CodeGen/WinEH/cppeh-catch-all-win32.ll
@@ -0,0 +1,86 @@
+; RUN: opt -winehprepare -S -o - < %s | FileCheck %s
+
+; This test is based on the following code:
+;
+; extern "C" void may_throw();
+; extern "C" void handle_exception();
+; extern "C" void test() {
+; try {
+; may_throw();
+; } catch (...) {
+; handle_exception();
+; }
+; }
+
+target datalayout = "e-m:x-p:32:32-i64:64-f80:32-n8:16:32-a:0:32-S32"
+target triple = "i686-pc-windows-msvc"
+
+; The function entry in this case remains unchanged.
+; CHECK: define void @test()
+; CHECK: entry:
+; CHECK: invoke void @may_throw()
+; CHECK: to label %invoke.cont unwind label %[[LPAD_LABEL:lpad[0-9]*]]
+
+define void @test() #0 personality i8* bitcast (i32 (...)* @__CxxFrameHandler3 to i8*) {
+entry:
+ %exn.slot = alloca i8*
+ %ehselector.slot = alloca i32
+ invoke void @may_throw()
+ to label %invoke.cont unwind label %lpad
+
+invoke.cont: ; preds = %entry
+ br label %try.cont
+
+; CHECK: [[LPAD_LABEL]]:{{[ ]+}}; preds = %entry
+; CHECK: landingpad { i8*, i32 }
+; CHECK-NEXT: catch i8* null
+; CHECK-NEXT: [[RECOVER:\%.+]] = call i8* (...) @llvm.eh.actions(i32 1, i8* null, i32 -1, i8* ()* @test.catch)
+; CHECK-NEXT: indirectbr i8* [[RECOVER]], [label %try.cont]
+
+lpad: ; preds = %entry
+ %0 = landingpad { i8*, i32 }
+ catch i8* null
+ %1 = extractvalue { i8*, i32 } %0, 0
+ store i8* %1, i8** %exn.slot
+ %2 = extractvalue { i8*, i32 } %0, 1
+ store i32 %2, i32* %ehselector.slot
+ br label %catch
+
+; CHECK-NOT: catch:
+; CHECK-NOT: @handle_exception()
+
+catch: ; preds = %lpad
+ %exn = load i8*, i8** %exn.slot
+ call void @llvm.eh.begincatch(i8* %exn, i8* null) #1
+ call void @handle_exception()
+ call void @llvm.eh.endcatch() #1
+ br label %try.cont
+
+try.cont: ; preds = %catch, %invoke.cont
+ ret void
+
+; CHECK: }
+}
+
+; CHECK: define internal i8* @test.catch()
+; CHECK: call i8* @llvm.frameaddress(i32 1)
+; CHECK: call i8* @llvm.x86.seh.recoverfp(i8* bitcast (void ()* @test to i8*), i8* %{{.*}})
+; CHECK: call void @handle_exception()
+; CHECK: ret i8* blockaddress(@test, %try.cont)
+; CHECK: }
+
+
+declare void @may_throw() #0
+
+declare i32 @__CxxFrameHandler3(...)
+
+; Function Attrs: nounwind
+declare void @llvm.eh.begincatch(i8* nocapture, i8* nocapture) #1
+
+declare void @handle_exception() #0
+
+; Function Attrs: nounwind
+declare void @llvm.eh.endcatch() #1
+
+attributes #0 = { "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-realign-stack" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #1 = { nounwind }
diff --git a/test/CodeGen/WinEH/cppeh-catch-and-throw.ll b/test/CodeGen/WinEH/cppeh-catch-and-throw.ll
index 240ca987690d..d604b86deb35 100644
--- a/test/CodeGen/WinEH/cppeh-catch-and-throw.ll
+++ b/test/CodeGen/WinEH/cppeh-catch-and-throw.ll
@@ -45,7 +45,7 @@ $_TI1H = comdat any
; This is just a minimal check to verify that main was handled by WinEHPrepare.
; CHECK: define void @"\01?test@@YAXXZ"()
; CHECK: entry:
-; CHECK: call void (...) @llvm.frameescape
+; CHECK: call void (...) @llvm.localescape
; CHECK: invoke void @_CxxThrowException
; CHECK: }
@@ -105,7 +105,7 @@ unreachable: ; preds = %catch, %entry
;
; CHECK-LABEL: define internal void @"\01?test@@YAXXZ.cleanup"(i8*, i8*)
; CHECK: entry:
-; CHECK: call i8* @llvm.framerecover
+; CHECK: call i8* @llvm.localrecover
; CHECK: call void @"\01??1Obj@@QEAA@XZ"
; CHECK: invoke void @llvm.donothing()
; CHECK: to label %[[SPLIT_LABEL:.+]] unwind label %[[LPAD_LABEL:.+]]
diff --git a/test/CodeGen/WinEH/cppeh-catch-scalar.ll b/test/CodeGen/WinEH/cppeh-catch-scalar.ll
index 172502cf73c8..3b5ab746d63c 100644
--- a/test/CodeGen/WinEH/cppeh-catch-scalar.ll
+++ b/test/CodeGen/WinEH/cppeh-catch-scalar.ll
@@ -24,7 +24,7 @@ target triple = "x86_64-pc-windows-msvc"
; CHECK: define void @_Z4testv()
; CHECK: entry:
; CHECK: [[I_PTR:\%.+]] = alloca i32, align 4
-; CHECK: call void (...) @llvm.frameescape(i32* [[I_PTR]])
+; CHECK: call void (...) @llvm.localescape(i32* [[I_PTR]])
; CHECK: invoke void @_Z9may_throwv()
; CHECK: to label %invoke.cont unwind label %[[LPAD_LABEL:lpad[0-9]*]]
@@ -96,7 +96,7 @@ eh.resume: ; preds = %catch.dispatch
; CHECK: define internal i8* @_Z4testv.catch(i8*, i8*)
; CHECK: entry:
-; CHECK: [[RECOVER_I:\%.+]] = call i8* @llvm.framerecover(i8* bitcast (void ()* @_Z4testv to i8*), i8* %1, i32 0)
+; CHECK: [[RECOVER_I:\%.+]] = call i8* @llvm.localrecover(i8* bitcast (void ()* @_Z4testv to i8*), i8* %1, i32 0)
; CHECK: [[I_PTR1:\%.+]] = bitcast i8* [[RECOVER_I]] to i32*
; CHECK: [[TMP:\%.+]] = load i32, i32* [[I_PTR1]], align 4
; CHECK: call void @_Z10handle_inti(i32 [[TMP]])
diff --git a/test/CodeGen/WinEH/cppeh-catch-unwind.ll b/test/CodeGen/WinEH/cppeh-catch-unwind.ll
index 6fd70d84b2af..8fdda9bbc02a 100644
--- a/test/CodeGen/WinEH/cppeh-catch-unwind.ll
+++ b/test/CodeGen/WinEH/cppeh-catch-unwind.ll
@@ -36,7 +36,7 @@ $"\01??_R0H@8" = comdat any
; CHECK: [[OBJ_PTR:\%.+]] = alloca %class.SomeClass
; CHECK: [[TMP0:\%.+]] = alloca i32, align 4
; CHECK: [[TMP1:\%.+]] = alloca i32, align 4
-; CHECK: call void (...) @llvm.frameescape(i32* [[TMP1]], %class.SomeClass* [[OBJ_PTR]], i32* [[TMP0]])
+; CHECK: call void (...) @llvm.localescape(i32* [[TMP1]], %class.SomeClass* [[OBJ_PTR]], i32* [[TMP0]])
; CHECK: %call = invoke %class.SomeClass* @"\01??0SomeClass@@QEAA@XZ"(%class.SomeClass* %obj)
; CHECK: to label %invoke.cont unwind label %[[LPAD_LABEL:lpad[0-9]*]]
@@ -177,7 +177,7 @@ eh.resume: ; preds = %catch.dispatch7
; CHECK-LABEL: define internal i8* @"\01?test@@YAXXZ.catch"(i8*, i8*)
; CHECK: entry:
-; CHECK: [[RECOVER_TMP1:\%.+]] = call i8* @llvm.framerecover(i8* bitcast (void ()* @"\01?test@@YAXXZ" to i8*), i8* %1, i32 0)
+; CHECK: [[RECOVER_TMP1:\%.+]] = call i8* @llvm.localrecover(i8* bitcast (void ()* @"\01?test@@YAXXZ" to i8*), i8* %1, i32 0)
; CHECK: [[TMP1_PTR:\%.+]] = bitcast i8* [[RECOVER_TMP1]] to i32*
; CHECK: call void @"\01?handle_exception@@YAXXZ"()
; CHECK: ret i8* blockaddress(@"\01?test@@YAXXZ", %try.cont15)
@@ -185,7 +185,7 @@ eh.resume: ; preds = %catch.dispatch7
; CHECK-LABEL: define internal void @"\01?test@@YAXXZ.cleanup"(i8*, i8*)
; CHECK: entry:
-; CHECK: [[RECOVER_OBJ:\%.+]] = call i8* @llvm.framerecover(i8* bitcast (void ()* @"\01?test@@YAXXZ" to i8*), i8* %1, i32 1)
+; CHECK: [[RECOVER_OBJ:\%.+]] = call i8* @llvm.localrecover(i8* bitcast (void ()* @"\01?test@@YAXXZ" to i8*), i8* %1, i32 1)
; CHECK: [[OBJ_PTR:\%.+]] = bitcast i8* %obj.i8 to %class.SomeClass*
; CHECK: call void @"\01??1SomeClass@@QEAA@XZ"(%class.SomeClass* [[OBJ_PTR]])
; CHECK: ret void
@@ -193,7 +193,7 @@ eh.resume: ; preds = %catch.dispatch7
; CHECK-LABEL: define internal i8* @"\01?test@@YAXXZ.catch.1"(i8*, i8*)
; CHECK: entry:
-; CHECK: [[RECOVER_TMP0:\%.+]] = call i8* @llvm.framerecover(i8* bitcast (void ()* @"\01?test@@YAXXZ" to i8*), i8* %1, i32 2)
+; CHECK: [[RECOVER_TMP0:\%.+]] = call i8* @llvm.localrecover(i8* bitcast (void ()* @"\01?test@@YAXXZ" to i8*), i8* %1, i32 2)
; CHECK: [[TMP0_PTR:\%.+]] = bitcast i8* [[RECOVER_TMP0]] to i32*
; CHECK: invoke void @"\01?handle_exception@@YAXXZ"()
; CHECK: to label %invoke.cont6 unwind label %[[LPAD5_LABEL:lpad[0-9]+]]
diff --git a/test/CodeGen/WinEH/cppeh-frame-vars.ll b/test/CodeGen/WinEH/cppeh-frame-vars.ll
index 1077ad0b8765..c2dbd8ecab60 100644
--- a/test/CodeGen/WinEH/cppeh-frame-vars.ll
+++ b/test/CodeGen/WinEH/cppeh-frame-vars.ll
@@ -58,7 +58,7 @@ $"\01??_R0H@8" = comdat any
; CHECK: [[TMP:\%.+]] = bitcast %struct.SomeData* [[DATA_PTR]] to i8*
; CHECK: call void @llvm.memset(i8* [[TMP]], i8 0, i64 8, i32 4, i1 false)
; CHECK: store i32 0, i32* [[I_PTR]], align 4
-; CHECK: call void (...) @llvm.frameescape(i32* [[E_PTR]], i32* [[NUMEXCEPTIONS_PTR]], [10 x i32]* [[EXCEPTIONVAL_PTR]], i32* [[I_PTR]], %struct.SomeData* [[DATA_PTR]])
+; CHECK: call void (...) @llvm.localescape(i32* [[E_PTR]], i32* [[NUMEXCEPTIONS_PTR]], [10 x i32]* [[EXCEPTIONVAL_PTR]], i32* [[I_PTR]], %struct.SomeData* [[DATA_PTR]])
; CHECK: br label %for.cond
; Function Attrs: uwtable
@@ -198,15 +198,15 @@ eh.resume: ; preds = %catch.dispatch
; The following catch handler should be outlined.
; CHECK-LABEL: define internal i8* @"\01?test@@YAXXZ.catch"(i8*, i8*)
; CHECK: entry:
-; CHECK: [[RECOVER_E:\%.+]] = call i8* @llvm.framerecover(i8* bitcast (void ()* @"\01?test@@YAXXZ" to i8*), i8* %1, i32 0)
+; CHECK: [[RECOVER_E:\%.+]] = call i8* @llvm.localrecover(i8* bitcast (void ()* @"\01?test@@YAXXZ" to i8*), i8* %1, i32 0)
; CHECK: [[E_PTR1:\%.+]] = bitcast i8* [[RECOVER_E]] to i32*
-; CHECK: [[RECOVER_NUMEXCEPTIONS:\%.+]] = call i8* @llvm.framerecover(i8* bitcast (void ()* @"\01?test@@YAXXZ" to i8*), i8* %1, i32 1)
+; CHECK: [[RECOVER_NUMEXCEPTIONS:\%.+]] = call i8* @llvm.localrecover(i8* bitcast (void ()* @"\01?test@@YAXXZ" to i8*), i8* %1, i32 1)
; CHECK: [[NUMEXCEPTIONS_PTR1:\%.+]] = bitcast i8* [[RECOVER_NUMEXCEPTIONS]] to i32*
-; CHECK: [[RECOVER_EXCEPTIONVAL:\%.+]] = call i8* @llvm.framerecover(i8* bitcast (void ()* @"\01?test@@YAXXZ" to i8*), i8* %1, i32 2)
+; CHECK: [[RECOVER_EXCEPTIONVAL:\%.+]] = call i8* @llvm.localrecover(i8* bitcast (void ()* @"\01?test@@YAXXZ" to i8*), i8* %1, i32 2)
; CHECK: [[EXCEPTIONVAL_PTR1:\%.+]] = bitcast i8* [[RECOVER_EXCEPTIONVAL]] to [10 x i32]*
-; CHECK: [[RECOVER_I:\%.+]] = call i8* @llvm.framerecover(i8* bitcast (void ()* @"\01?test@@YAXXZ" to i8*), i8* %1, i32 3)
+; CHECK: [[RECOVER_I:\%.+]] = call i8* @llvm.localrecover(i8* bitcast (void ()* @"\01?test@@YAXXZ" to i8*), i8* %1, i32 3)
; CHECK: [[I_PTR1:\%.+]] = bitcast i8* [[RECOVER_I]] to i32*
-; CHECK: [[RECOVER_DATA:\%.+]] = call i8* @llvm.framerecover(i8* bitcast (void ()* @"\01?test@@YAXXZ" to i8*), i8* %1, i32 4)
+; CHECK: [[RECOVER_DATA:\%.+]] = call i8* @llvm.localrecover(i8* bitcast (void ()* @"\01?test@@YAXXZ" to i8*), i8* %1, i32 4)
; CHECK: [[DATA_PTR1:\%.+]] = bitcast i8* [[RECOVER_DATA]] to %struct.SomeData*
; CHECK: [[TMP:\%.+]] = load i32, i32* [[E_PTR1]], align 4
; CHECK: [[TMP1:\%.+]] = load i32, i32* [[NUMEXCEPTIONS_PTR]], align 4
diff --git a/test/CodeGen/WinEH/cppeh-inalloca.ll b/test/CodeGen/WinEH/cppeh-inalloca.ll
index 3dc1348efffa..649c5e72e2dd 100644
--- a/test/CodeGen/WinEH/cppeh-inalloca.ll
+++ b/test/CodeGen/WinEH/cppeh-inalloca.ll
@@ -41,7 +41,7 @@ $"\01??_R0H@8" = comdat any
; CHECK: [[RETVAL:\%.+]] = alloca i32, align 4
; CHECK: [[E_PTR:\%.+]] = alloca i32, align 4
; CHECK: [[CLEANUP_SLOT:\%.+]] = alloca i32
-; CHECK: call void (...) @llvm.frameescape(i32* %e, <{ %struct.A }>** [[TMP_REGMEM]], i32* [[RETVAL]], i32* [[CLEANUP_SLOT]])
+; CHECK: call void (...) @llvm.localescape(i32* %e, <{ %struct.A }>** [[TMP_REGMEM]], i32* [[RETVAL]], i32* [[CLEANUP_SLOT]])
; CHECK: invoke void @"\01?may_throw@@YAXXZ"()
; CHECK: to label %invoke.cont unwind label %[[LPAD_LABEL:lpad[0-9]*]]
@@ -139,13 +139,13 @@ eh.resume: ; preds = %ehcleanup
; The following catch handler should be outlined.
; CHECK: define internal i8* @"\01?test@@YAHUA@@@Z.catch"(i8*, i8*)
; CHECK: entry:
-; CHECK: [[RECOVER_E:\%.+]] = call i8* @llvm.framerecover(i8* bitcast (i32 (<{ %struct.A }>*)* @"\01?test@@YAHUA@@@Z" to i8*), i8* %1, i32 0)
+; CHECK: [[RECOVER_E:\%.+]] = call i8* @llvm.localrecover(i8* bitcast (i32 (<{ %struct.A }>*)* @"\01?test@@YAHUA@@@Z" to i8*), i8* %1, i32 0)
; CHECK: [[E_PTR:\%.+]] = bitcast i8* [[RECOVER_E]] to i32*
-; CHECK: [[RECOVER_EH_TEMP:\%.+]] = call i8* @llvm.framerecover(i8* bitcast (i32 (<{ %struct.A }>*)* @"\01?test@@YAHUA@@@Z" to i8*), i8* %1, i32 1)
+; CHECK: [[RECOVER_EH_TEMP:\%.+]] = call i8* @llvm.localrecover(i8* bitcast (i32 (<{ %struct.A }>*)* @"\01?test@@YAHUA@@@Z" to i8*), i8* %1, i32 1)
; CHECK: [[EH_TEMP:\%.+]] = bitcast i8* [[RECOVER_EH_TEMP]] to <{ %struct.A }>**
-; CHECK: [[RECOVER_RETVAL:\%.+]] = call i8* @llvm.framerecover(i8* bitcast (i32 (<{ %struct.A }>*)* @"\01?test@@YAHUA@@@Z" to i8*), i8* %1, i32 2)
+; CHECK: [[RECOVER_RETVAL:\%.+]] = call i8* @llvm.localrecover(i8* bitcast (i32 (<{ %struct.A }>*)* @"\01?test@@YAHUA@@@Z" to i8*), i8* %1, i32 2)
; CHECK: [[RETVAL1:\%.+]] = bitcast i8* [[RECOVER_RETVAL]] to i32*
-; CHECK: [[RECOVER_CLEANUPSLOT:\%.+]] = call i8* @llvm.framerecover(i8* bitcast (i32 (<{ %struct.A }>*)* @"\01?test@@YAHUA@@@Z" to i8*), i8* %1, i32 3)
+; CHECK: [[RECOVER_CLEANUPSLOT:\%.+]] = call i8* @llvm.localrecover(i8* bitcast (i32 (<{ %struct.A }>*)* @"\01?test@@YAHUA@@@Z" to i8*), i8* %1, i32 3)
; CHECK: [[CLEANUPSLOT1:\%.+]] = bitcast i8* [[RECOVER_CLEANUPSLOT]] to i32*
; CHECK: [[E_I8PTR:\%.+]] = bitcast i32* [[E_PTR]] to i8*
; CHECK: [[TMP_RELOAD:\%.+]] = load <{ %struct.A }>*, <{ %struct.A }>** [[EH_TEMP]]
@@ -162,7 +162,7 @@ eh.resume: ; preds = %ehcleanup
; The following cleanup handler should be outlined.
; CHECK: define internal void @"\01?test@@YAHUA@@@Z.cleanup"(i8*, i8*)
; CHECK: entry:
-; CHECK: [[RECOVER_EH_TEMP1:\%.+]] = call i8* @llvm.framerecover(i8* bitcast (i32 (<{ %struct.A }>*)* @"\01?test@@YAHUA@@@Z" to i8*), i8* %1, i32 1)
+; CHECK: [[RECOVER_EH_TEMP1:\%.+]] = call i8* @llvm.localrecover(i8* bitcast (i32 (<{ %struct.A }>*)* @"\01?test@@YAHUA@@@Z" to i8*), i8* %1, i32 1)
; CHECK: [[EH_TEMP1:\%.+]] = bitcast i8* [[RECOVER_EH_TEMP]] to <{ %struct.A }>**
; CHECK: [[TMP_RELOAD1:\%.+]] = load <{ %struct.A }>*, <{ %struct.A }>** [[EH_TEMP1]]
; CHECK: [[A3:\%.+]] = getelementptr inbounds <{ %struct.A }>, <{ %struct.A }>* [[TMP_RELOAD1]], i32 0, i32 0
diff --git a/test/CodeGen/WinEH/cppeh-min-unwind.ll b/test/CodeGen/WinEH/cppeh-min-unwind.ll
index b1f157ade29b..98d6d6fcacb6 100644
--- a/test/CodeGen/WinEH/cppeh-min-unwind.ll
+++ b/test/CodeGen/WinEH/cppeh-min-unwind.ll
@@ -25,7 +25,7 @@ target triple = "x86_64-pc-windows-msvc"
; CHECK: entry:
; CHECK: [[OBJ_PTR:\%.+]] = alloca %class.SomeClass, align 4
; CHECK: call void @_ZN9SomeClassC1Ev(%class.SomeClass* [[OBJ_PTR]])
-; CHECK: call void (...) @llvm.frameescape(%class.SomeClass* [[OBJ_PTR]])
+; CHECK: call void (...) @llvm.localescape(%class.SomeClass* [[OBJ_PTR]])
; CHECK: invoke void @_Z9may_throwv()
; CHECK: to label %invoke.cont unwind label %[[LPAD_LABEL:lpad[0-9]*]]
@@ -74,7 +74,7 @@ eh.resume: ; preds = %lpad
; This cleanup handler should be outlined.
; CHECK: define internal void @_Z4testv.cleanup(i8*, i8*)
; CHECK: entry:
-; CHECK: [[RECOVER_OBJ:\%.+]] = call i8* @llvm.framerecover(i8* bitcast (void ()* @_Z4testv to i8*), i8* %1, i32 0)
+; CHECK: [[RECOVER_OBJ:\%.+]] = call i8* @llvm.localrecover(i8* bitcast (void ()* @_Z4testv to i8*), i8* %1, i32 0)
; CHECK: [[OBJ_PTR1:\%.+]] = bitcast i8* [[RECOVER_OBJ]] to %class.SomeClass*
; CHECK: call void @_ZN9SomeClassD1Ev(%class.SomeClass* [[OBJ_PTR1]])
; CHECK: ret void
diff --git a/test/CodeGen/WinEH/cppeh-mixed-catch-and-cleanup.ll b/test/CodeGen/WinEH/cppeh-mixed-catch-and-cleanup.ll
index 1294d0b8ff30..c69633f17e28 100644
--- a/test/CodeGen/WinEH/cppeh-mixed-catch-and-cleanup.ll
+++ b/test/CodeGen/WinEH/cppeh-mixed-catch-and-cleanup.ll
@@ -31,7 +31,7 @@ target triple = "x86_64-pc-windows-msvc"
;
; CHECK-LABEL: define void @"\01?test@@YAXXZ"()
; CHECK: entry:
-; CHECK: call void (...) @llvm.frameescape
+; CHECK: call void (...) @llvm.localescape
; CHECK: }
; Function Attrs: nounwind uwtable
@@ -67,7 +67,7 @@ try.cont: ; preds = %catch, %invoke.cont
; Verify that a cleanup handler was created and that it calls ~Obj().
; CHECK-LABEL: define internal void @"\01?test@@YAXXZ.cleanup"(i8*, i8*)
; CHECK: entry:
-; CHECK: @llvm.framerecover(i8* bitcast (void ()* @"\01?test@@YAXXZ" to i8*), i8* %1, i32 0)
+; CHECK: @llvm.localrecover(i8* bitcast (void ()* @"\01?test@@YAXXZ" to i8*), i8* %1, i32 0)
; CHECK: call void @"\01??1Obj@@QEAA@XZ"
; CHECK: ret void
; CHECK: }
diff --git a/test/CodeGen/WinEH/cppeh-multi-catch.ll b/test/CodeGen/WinEH/cppeh-multi-catch.ll
index 25224551cadc..266cdea20cdb 100644
--- a/test/CodeGen/WinEH/cppeh-multi-catch.ll
+++ b/test/CodeGen/WinEH/cppeh-multi-catch.ll
@@ -50,7 +50,7 @@ $"\01??_R0?AVSomeClass@@@8" = comdat any
; CHECK: [[OBJ_PTR:\%.+]] = alloca %class.SomeClass*, align 8
; CHECK: [[LL_PTR:\%.+]] = alloca i64, align 8
; CHECK: [[I_PTR:\%.+]] = alloca i32, align 4
-; CHECK: call void (...) @llvm.frameescape(i32* [[I_PTR]], i64* [[LL_PTR]], %class.SomeClass** [[OBJ_PTR]])
+; CHECK: call void (...) @llvm.localescape(i32* [[I_PTR]], i64* [[LL_PTR]], %class.SomeClass** [[OBJ_PTR]])
; CHECK: invoke void @"\01?may_throw@@YAXXZ"()
; CHECK: to label %invoke.cont unwind label %[[LPAD_LABEL:lpad[0-9]*]]
@@ -161,7 +161,7 @@ catch: ; preds = %catch.fallthrough2
; CHECK-LABEL: define internal i8* @"\01?test@@YAXXZ.catch"(i8*, i8*)
; CHECK: entry:
-; CHECK: [[RECOVER_I:\%.+]] = call i8* @llvm.framerecover(i8* bitcast (void ()* @"\01?test@@YAXXZ" to i8*), i8* %1, i32 0)
+; CHECK: [[RECOVER_I:\%.+]] = call i8* @llvm.localrecover(i8* bitcast (void ()* @"\01?test@@YAXXZ" to i8*), i8* %1, i32 0)
; CHECK: [[I_PTR:\%.+]] = bitcast i8* [[RECOVER_I]] to i32*
; CHECK: [[TMP1:\%.+]] = load i32, i32* [[I_PTR]], align 4
; CHECK: call void @"\01?handle_int@@YAXH@Z"(i32 [[TMP1]])
@@ -170,7 +170,7 @@ catch: ; preds = %catch.fallthrough2
; CHECK-LABEL: define internal i8* @"\01?test@@YAXXZ.catch.1"(i8*, i8*)
; CHECK: entry:
-; CHECK: [[RECOVER_LL:\%.+]] = call i8* @llvm.framerecover(i8* bitcast (void ()* @"\01?test@@YAXXZ" to i8*), i8* %1, i32 1)
+; CHECK: [[RECOVER_LL:\%.+]] = call i8* @llvm.localrecover(i8* bitcast (void ()* @"\01?test@@YAXXZ" to i8*), i8* %1, i32 1)
; CHECK: [[LL_PTR:\%.+]] = bitcast i8* [[RECOVER_LL]] to i64*
; CHECK: [[TMP2:\%.+]] = load i64, i64* [[LL_PTR]], align 8
; CHECK: call void @"\01?handle_long_long@@YAX_J@Z"(i64 [[TMP2]])
@@ -179,7 +179,7 @@ catch: ; preds = %catch.fallthrough2
; CHECK-LABEL: define internal i8* @"\01?test@@YAXXZ.catch.2"(i8*, i8*)
; CHECK: entry:
-; CHECK: [[RECOVER_OBJ:\%.+]] = call i8* @llvm.framerecover(i8* bitcast (void ()* @"\01?test@@YAXXZ" to i8*), i8* %1, i32 2)
+; CHECK: [[RECOVER_OBJ:\%.+]] = call i8* @llvm.localrecover(i8* bitcast (void ()* @"\01?test@@YAXXZ" to i8*), i8* %1, i32 2)
; CHECK: [[OBJ_PTR:\%.+]] = bitcast i8* [[RECOVER_OBJ]] to %class.SomeClass**
; CHECK: [[TMP3:\%.+]] = load %class.SomeClass*, %class.SomeClass** [[OBJ_PTR]], align 8
; CHECK: call void @"\01?handle_obj@@YAXPEAVSomeClass@@@Z"(%class.SomeClass* [[TMP3]])
diff --git a/test/CodeGen/WinEH/cppeh-nested-1.ll b/test/CodeGen/WinEH/cppeh-nested-1.ll
index a5e80ac2b2ab..d525d8a1a67e 100644
--- a/test/CodeGen/WinEH/cppeh-nested-1.ll
+++ b/test/CodeGen/WinEH/cppeh-nested-1.ll
@@ -34,7 +34,7 @@ $"\01??_R0H@8" = comdat any
; CHECK: entry:
; CHECK: %i = alloca i32, align 4
; CHECK: %f = alloca float, align 4
-; CHECK: call void (...) @llvm.frameescape(float* %f, i32* %i)
+; CHECK: call void (...) @llvm.localescape(float* %f, i32* %i)
; CHECK: invoke void @"\01?may_throw@@YAXXZ"()
; CHECK: to label %invoke.cont unwind label %[[LPAD_LABEL:lpad[0-9]*]]
@@ -136,7 +136,7 @@ eh.resume: ; %catch.dispatch3
; CHECK: define internal i8* @"\01?test@@YAXXZ.catch"(i8*, i8*)
; CHECK: entry:
-; CHECK: [[RECOVER_F1:\%.+]] = call i8* @llvm.framerecover(i8* bitcast (void ()* @"\01?test@@YAXXZ" to i8*), i8* %1, i32 0)
+; CHECK: [[RECOVER_F1:\%.+]] = call i8* @llvm.localrecover(i8* bitcast (void ()* @"\01?test@@YAXXZ" to i8*), i8* %1, i32 0)
; CHECK: [[F_PTR1:\%.+]] = bitcast i8* [[RECOVER_F1]] to float*
; CHECK: [[TMP2:\%.+]] = load float, float* [[F_PTR1]], align 4
; CHECK: call void @"\01?handle_float@@YAXM@Z"(float [[TMP2]])
@@ -145,7 +145,7 @@ eh.resume: ; %catch.dispatch3
; CHECK: define internal i8* @"\01?test@@YAXXZ.catch.1"(i8*, i8*)
; CHECK: entry:
-; CHECK: [[RECOVER_I:\%.+]] = call i8* @llvm.framerecover(i8* bitcast (void ()* @"\01?test@@YAXXZ" to i8*), i8* %1, i32 1)
+; CHECK: [[RECOVER_I:\%.+]] = call i8* @llvm.localrecover(i8* bitcast (void ()* @"\01?test@@YAXXZ" to i8*), i8* %1, i32 1)
; CHECK: [[I_PTR:\%.+]] = bitcast i8* [[RECOVER_I]] to i32*
; CHECK: [[TMP1:\%.+]] = load i32, i32* [[I_PTR]], align 4
; CHECK: invoke void @"\01?handle_int@@YAXH@Z"(i32 [[TMP1]])
diff --git a/test/CodeGen/WinEH/cppeh-nested-2.ll b/test/CodeGen/WinEH/cppeh-nested-2.ll
index 385958b006d2..2764e7478c71 100644
--- a/test/CodeGen/WinEH/cppeh-nested-2.ll
+++ b/test/CodeGen/WinEH/cppeh-nested-2.ll
@@ -44,7 +44,7 @@ target triple = "x86_64-pc-windows-msvc"
; CHECK: %inner = alloca %class.Inner, align 1
; CHECK: %i = alloca i32, align 4
; CHECK: %f = alloca float, align 4
-; CHECK: call void (...) @llvm.frameescape(float* %f, i32* %i, %class.Outer* %outer, %class.Inner* %inner)
+; CHECK: call void (...) @llvm.localescape(float* %f, i32* %i, %class.Outer* %outer, %class.Inner* %inner)
; CHECK: invoke void @_ZN5OuterC1Ev(%class.Outer* %outer)
; CHECK: to label %invoke.cont unwind label %[[LPAD_LABEL:lpad[0-9]*]]
@@ -243,7 +243,7 @@ eh.resume: ; preds = %catch.dispatch11
; This catch handler should be outlined.
; CHECK: define internal i8* @_Z4testv.catch(i8*, i8*)
; CHECK: entry:
-; CHECK: [[RECOVER_F:\%.+]] = call i8* @llvm.framerecover(i8* bitcast (void ()* @_Z4testv to i8*), i8* %1, i32 0)
+; CHECK: [[RECOVER_F:\%.+]] = call i8* @llvm.localrecover(i8* bitcast (void ()* @_Z4testv to i8*), i8* %1, i32 0)
; CHECK: [[F_PTR:\%.+]] = bitcast i8* [[RECOVER_F]] to float*
; CHECK: [[TMP:\%.+]] = load float, float* [[F_PTR]], align 4
; CHECK: call void @_Z12handle_floatf(float [[TMP]])
@@ -253,7 +253,7 @@ eh.resume: ; preds = %catch.dispatch11
; This catch handler should be outlined.
; CHECK: define internal i8* @_Z4testv.catch.1(i8*, i8*)
; CHECK: entry:
-; CHECK: [[RECOVER_I:\%.+]] = call i8* @llvm.framerecover(i8* bitcast (void ()* @_Z4testv to i8*), i8* %1, i32 1)
+; CHECK: [[RECOVER_I:\%.+]] = call i8* @llvm.localrecover(i8* bitcast (void ()* @_Z4testv to i8*), i8* %1, i32 1)
; CHECK: [[I_PTR:\%.+]] = bitcast i8* [[RECOVER_I]] to i32*
; CHECK: [[TMP1:\%.+]] = load i32, i32* [[I_PTR]], align 4
; CHECK: invoke void @_Z10handle_inti(i32 [[TMP1]])
@@ -270,7 +270,7 @@ eh.resume: ; preds = %catch.dispatch11
; This cleanup handler should be outlined.
; CHECK: define internal void @_Z4testv.cleanup(i8*, i8*)
; CHECK: entry:
-; CHECK: [[RECOVER_OUTER:\%.+]] = call i8* @llvm.framerecover(i8* bitcast (void ()* @_Z4testv to i8*), i8* %1, i32 2)
+; CHECK: [[RECOVER_OUTER:\%.+]] = call i8* @llvm.localrecover(i8* bitcast (void ()* @_Z4testv to i8*), i8* %1, i32 2)
; CHECK: [[OUTER_PTR:\%.+]] = bitcast i8* [[RECOVER_OUTER]] to %class.Outer*
; CHECK: call void @_ZN5OuterD1Ev(%class.Outer* [[OUTER_PTR]])
; CHECK: ret void
@@ -279,7 +279,7 @@ eh.resume: ; preds = %catch.dispatch11
; This cleanup handler should be outlined.
; CHECK: define internal void @_Z4testv.cleanup.2(i8*, i8*)
; CHECK: entry:
-; CHECK: [[RECOVER_INNER:\%.+]] = call i8* @llvm.framerecover(i8* bitcast (void ()* @_Z4testv to i8*), i8* %1, i32 3)
+; CHECK: [[RECOVER_INNER:\%.+]] = call i8* @llvm.localrecover(i8* bitcast (void ()* @_Z4testv to i8*), i8* %1, i32 3)
; CHECK: [[INNER_PTR:\%.+]] = bitcast i8* [[RECOVER_INNER]] to %class.Inner*
; CHECK: call void @_ZN5InnerD1Ev(%class.Inner* [[INNER_PTR]])
; CHECK: ret void
diff --git a/test/CodeGen/WinEH/cppeh-nested-3.ll b/test/CodeGen/WinEH/cppeh-nested-3.ll
index 33faaf0f591a..88759f406fb1 100644
--- a/test/CodeGen/WinEH/cppeh-nested-3.ll
+++ b/test/CodeGen/WinEH/cppeh-nested-3.ll
@@ -41,7 +41,7 @@ $"\01??_R0H@8" = comdat any
; CHECK: %i = alloca i32, align 4
; CHECK: %j = alloca i32, align 4
; CHECK: %f = alloca float, align 4
-; CHECK: call void (...) @llvm.frameescape(i32* %j, i32* %i, float* %f)
+; CHECK: call void (...) @llvm.localescape(i32* %j, i32* %i, float* %f)
; CHECK: invoke void @"\01?may_throw@@YAXXZ"()
; CHECK: to label %invoke.cont unwind label %[[LPAD_LABEL:lpad[0-9]*]]
@@ -181,9 +181,9 @@ eh.resume: ; preds = %lpad16, %catch.disp
; CHECK: define internal i8* @"\01?test@@YAXXZ.catch"(i8*, i8*)
; CHECK: entry:
-; CHECK: [[RECOVER_J:\%.+]] = call i8* @llvm.framerecover(i8* bitcast (void ()* @"\01?test@@YAXXZ" to i8*), i8* %1, i32 0)
+; CHECK: [[RECOVER_J:\%.+]] = call i8* @llvm.localrecover(i8* bitcast (void ()* @"\01?test@@YAXXZ" to i8*), i8* %1, i32 0)
; CHECK: [[J_PTR:\%.+]] = bitcast i8* [[RECOVER_J]] to i32*
-; CHECK: [[RECOVER_I1:\%.+]] = call i8* @llvm.framerecover(i8* bitcast (void ()* @"\01?test@@YAXXZ" to i8*), i8* %1, i32 1)
+; CHECK: [[RECOVER_I1:\%.+]] = call i8* @llvm.localrecover(i8* bitcast (void ()* @"\01?test@@YAXXZ" to i8*), i8* %1, i32 1)
; CHECK: [[I_PTR1:\%.+]] = bitcast i8* [[RECOVER_I1]] to i32*
; CHECK: [[TMP3:\%.+]] = load i32, i32* [[J_PTR]], align 4
; CHECK: store i32 [[TMP3]], i32* [[I_PTR1]]
@@ -192,7 +192,7 @@ eh.resume: ; preds = %lpad16, %catch.disp
; CHECK: define internal i8* @"\01?test@@YAXXZ.catch.1"(i8*, i8*)
; CHECK: entry:
-; CHECK: [[RECOVER_F:\%.+]] = call i8* @llvm.framerecover(i8* bitcast (void ()* @"\01?test@@YAXXZ" to i8*), i8* %1, i32 2)
+; CHECK: [[RECOVER_F:\%.+]] = call i8* @llvm.localrecover(i8* bitcast (void ()* @"\01?test@@YAXXZ" to i8*), i8* %1, i32 2)
; CHECK: [[F_PTR:\%.+]] = bitcast i8* [[RECOVER_F]] to float*
; CHECK: [[TMP2:\%.+]] = load float, float* [[F_PTR]], align 4
; CHECK: call void @"\01?handle_float@@YAXM@Z"(float [[TMP2]])
@@ -201,7 +201,7 @@ eh.resume: ; preds = %lpad16, %catch.disp
; CHECK: define internal i8* @"\01?test@@YAXXZ.catch.2"(i8*, i8*)
; CHECK: entry:
-; CHECK: [[RECOVER_I:\%.+]] = call i8* @llvm.framerecover(i8* bitcast (void ()* @"\01?test@@YAXXZ" to i8*), i8* %1, i32 1)
+; CHECK: [[RECOVER_I:\%.+]] = call i8* @llvm.localrecover(i8* bitcast (void ()* @"\01?test@@YAXXZ" to i8*), i8* %1, i32 1)
; CHECK: [[I_PTR:\%.+]] = bitcast i8* [[RECOVER_I]] to i32*
; CHECK: invoke void @"\01?may_throw@@YAXXZ"()
; CHECK: to label %invoke.cont2 unwind label %[[LPAD1_LABEL:lpad[0-9]*]]
diff --git a/test/CodeGen/WinEH/cppeh-nested-rethrow.ll b/test/CodeGen/WinEH/cppeh-nested-rethrow.ll
index 14a5f233f9ba..53f532c8eb16 100644
--- a/test/CodeGen/WinEH/cppeh-nested-rethrow.ll
+++ b/test/CodeGen/WinEH/cppeh-nested-rethrow.ll
@@ -53,7 +53,7 @@ $_TI1H = comdat any
; CHECK-LABEL: define void @"\01?test1@@YAXXZ"()
; CHECK: entry:
-; CHECK: call void (...) @llvm.frameescape
+; CHECK: call void (...) @llvm.localescape
; Function Attrs: nounwind uwtable
define void @"\01?test1@@YAXXZ"() #0 personality i8* bitcast (i32 (...)* @__CxxFrameHandler3 to i8*) {
@@ -121,7 +121,7 @@ declare void @llvm.eh.endcatch() #1
; CHECK-LABEL: define void @"\01?test2@@YAXXZ"()
; CHECK: entry:
-; CHECK: call void (...) @llvm.frameescape
+; CHECK: call void (...) @llvm.localescape
; Function Attrs: nounwind uwtable
define void @"\01?test2@@YAXXZ"() #0 personality i8* bitcast (i32 (...)* @__CxxFrameHandler3 to i8*) {
diff --git a/test/CodeGen/WinEH/cppeh-nonalloca-frame-values.ll b/test/CodeGen/WinEH/cppeh-nonalloca-frame-values.ll
index 83236c4188ff..7b474c9d38a3 100644
--- a/test/CodeGen/WinEH/cppeh-nonalloca-frame-values.ll
+++ b/test/CodeGen/WinEH/cppeh-nonalloca-frame-values.ll
@@ -68,7 +68,7 @@ $"\01??_R0H@8" = comdat any
; CHECK: store i32* [[A_PTR]], i32** [[A_REGMEM]]
; CHECK: [[B_PTR:\%.+]] = getelementptr inbounds %struct.SomeData, %struct.SomeData* [[TMPCAST]], i64 0, i32 1
; CHECK: store i32* [[B_PTR]], i32** [[B_REGMEM]]
-; CHECK: call void (...) @llvm.frameescape(i32* %e, i32* %NumExceptions.020.reg2mem, [10 x i32]* [[EXCEPTIONVAL]], i32* %inc.reg2mem, i32* [[I_REGMEM]], i32** [[A_REGMEM]], i32** [[B_REGMEM]])
+; CHECK: call void (...) @llvm.localescape(i32* %e, i32* %NumExceptions.020.reg2mem, [10 x i32]* [[EXCEPTIONVAL]], i32* %inc.reg2mem, i32* [[I_REGMEM]], i32** [[A_REGMEM]], i32** [[B_REGMEM]])
; CHECK: br label %for.body
; Function Attrs: uwtable
@@ -192,19 +192,19 @@ eh.resume: ; preds = %lpad
; The following catch handler should be outlined.
; CHECK: define internal i8* @"\01?test@@YAXXZ.catch"(i8*, i8*)
; CHECK: entry:
-; CHECK: [[RECOVER_E:\%.+]] = call i8* @llvm.framerecover(i8* bitcast (void ()* @"\01?test@@YAXXZ" to i8*), i8* %1, i32 0)
+; CHECK: [[RECOVER_E:\%.+]] = call i8* @llvm.localrecover(i8* bitcast (void ()* @"\01?test@@YAXXZ" to i8*), i8* %1, i32 0)
; CHECK: [[E_PTR:\%.+]] = bitcast i8* [[RECOVER_E]] to i32*
-; CHECK: [[RECOVER_NUMEXCEPTIONS:\%.+]] = call i8* @llvm.framerecover(i8* bitcast (void ()* @"\01?test@@YAXXZ" to i8*), i8* %1, i32 1)
+; CHECK: [[RECOVER_NUMEXCEPTIONS:\%.+]] = call i8* @llvm.localrecover(i8* bitcast (void ()* @"\01?test@@YAXXZ" to i8*), i8* %1, i32 1)
; CHECK: [[NUMEXCEPTIONS_REGMEM:\%.+]] = bitcast i8* [[RECOVER_NUMEXCEPTIONS]] to i32*
-; CHECK: [[RECOVER_EXCEPTIONVAL:\%.+]] = call i8* @llvm.framerecover(i8* bitcast (void ()* @"\01?test@@YAXXZ" to i8*), i8* %1, i32 2)
+; CHECK: [[RECOVER_EXCEPTIONVAL:\%.+]] = call i8* @llvm.localrecover(i8* bitcast (void ()* @"\01?test@@YAXXZ" to i8*), i8* %1, i32 2)
; CHECK: [[EXCEPTIONVAL:\%.+]] = bitcast i8* [[RECOVER_EXCEPTIONVAL]] to [10 x i32]*
-; CHECK: [[RECOVER_INC:\%.+]] = call i8* @llvm.framerecover(i8* bitcast (void ()* @"\01?test@@YAXXZ" to i8*), i8* %1, i32 3)
+; CHECK: [[RECOVER_INC:\%.+]] = call i8* @llvm.localrecover(i8* bitcast (void ()* @"\01?test@@YAXXZ" to i8*), i8* %1, i32 3)
; CHECK: [[INC_REGMEM:\%.+]] = bitcast i8* [[RECOVER_INC]] to i32*
-; CHECK: [[RECOVER_I:\%.+]] = call i8* @llvm.framerecover(i8* bitcast (void ()* @"\01?test@@YAXXZ" to i8*), i8* %1, i32 4)
+; CHECK: [[RECOVER_I:\%.+]] = call i8* @llvm.localrecover(i8* bitcast (void ()* @"\01?test@@YAXXZ" to i8*), i8* %1, i32 4)
; CHECK: [[I_REGMEM:\%.+]] = bitcast i8* [[RECOVER_I]] to i32*
-; CHECK: [[RECOVER_A:\%.+]] = call i8* @llvm.framerecover(i8* bitcast (void ()* @"\01?test@@YAXXZ" to i8*), i8* %1, i32 5)
+; CHECK: [[RECOVER_A:\%.+]] = call i8* @llvm.localrecover(i8* bitcast (void ()* @"\01?test@@YAXXZ" to i8*), i8* %1, i32 5)
; CHECK: [[A_REGMEM:\%.+]] = bitcast i8* [[RECOVER_A]] to i32**
-; CHECK: [[RECOVER_B:\%.+]] = call i8* @llvm.framerecover(i8* bitcast (void ()* @"\01?test@@YAXXZ" to i8*), i8* %1, i32 6)
+; CHECK: [[RECOVER_B:\%.+]] = call i8* @llvm.localrecover(i8* bitcast (void ()* @"\01?test@@YAXXZ" to i8*), i8* %1, i32 6)
; CHECK: [[B_REGMEM:\%.+]] = bitcast i8* [[RECOVER_B]] to i32**
; CHECK: [[E_I8PTR:\%.+]] = bitcast i32* [[E_PTR]] to i8*
; CHECK: [[TMP:\%.+]] = load i32, i32* [[E_PTR]], align 4
diff --git a/test/CodeGen/WinEH/cppeh-prepared-catch-reordered.ll b/test/CodeGen/WinEH/cppeh-prepared-catch-reordered.ll
index fc632af17405..2d31a1d5cf4f 100644
--- a/test/CodeGen/WinEH/cppeh-prepared-catch-reordered.ll
+++ b/test/CodeGen/WinEH/cppeh-prepared-catch-reordered.ll
@@ -49,7 +49,7 @@ entry:
%e = alloca i32, align 4
%0 = bitcast i32* %tmp.i to i8*
store i32 42, i32* %tmp.i, align 4, !tbaa !2
- call void (...) @llvm.frameescape(i32* %e)
+ call void (...) @llvm.localescape(i32* %e)
invoke void @_CxxThrowException(i8* %0, %eh.ThrowInfo* @_TI1H) #6
to label %.noexc unwind label %lpad1
@@ -92,7 +92,7 @@ declare i8* @llvm.eh.actions(...) #3
define internal i8* @main.catch(i8*, i8*) #5 personality i8* bitcast (i32 (...)* @__CxxFrameHandler3 to i8*) {
entry:
- %e.i8 = call i8* @llvm.framerecover(i8* bitcast (i32 ()* @main to i8*), i8* %1, i32 0)
+ %e.i8 = call i8* @llvm.localrecover(i8* bitcast (i32 ()* @main to i8*), i8* %1, i32 0)
%e = bitcast i8* %e.i8 to i32*
%2 = bitcast i32* %e to i8*
%3 = load i32, i32* %e, align 4, !tbaa !2
@@ -114,6 +114,7 @@ stub: ; preds = %entry
; CHECK: .seh_handlerdata
; CHECK: .long ($cppxdata$main)@IMGREL
+; CHECK: .align 4
; CHECK-NEXT: $cppxdata$main:
; CHECK-NEXT: .long 429065506
; CHECK-NEXT: .long 2
@@ -139,10 +140,10 @@ stub: ; preds = %entry
declare void @llvm.donothing() #2
; Function Attrs: nounwind
-declare void @llvm.frameescape(...) #3
+declare void @llvm.localescape(...) #3
; Function Attrs: nounwind readnone
-declare i8* @llvm.framerecover(i8*, i8*, i32) #2
+declare i8* @llvm.localrecover(i8*, i8*, i32) #2
attributes #0 = { noreturn uwtable "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "unsafe-fp-math"="false" "use-soft-float"="false" }
attributes #1 = { uwtable "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "unsafe-fp-math"="false" "use-soft-float"="false" "wineh-parent"="main" }
diff --git a/test/CodeGen/WinEH/cppeh-prepared-catch.ll b/test/CodeGen/WinEH/cppeh-prepared-catch.ll
index 02cc682cbe4b..a5d86dceea93 100644
--- a/test/CodeGen/WinEH/cppeh-prepared-catch.ll
+++ b/test/CodeGen/WinEH/cppeh-prepared-catch.ll
@@ -1,7 +1,5 @@
-; RUN: llc < %s | FileCheck %s
-
-target datalayout = "e-m:w-i64:64-f80:128-n8:16:32:64-S128"
-target triple = "x86_64-pc-windows-msvc"
+; RUN: llc -mtriple=x86_64-windows-msvc < %s | FileCheck %s --check-prefix=CHECK --check-prefix=X64
+; RUN: llc -mtriple=i686-windows-msvc < %s | FileCheck %s --check-prefix=CHECK --check-prefix=X86
; This test case is equivalent to:
; void f() {
@@ -32,7 +30,7 @@ $"\01??_R0H@8" = comdat any
define internal i8* @"\01?f@@YAXXZ.catch"(i8*, i8*) #4 personality i8* bitcast (i32 (...)* @__CxxFrameHandler3 to i8*) {
entry:
- %.i8 = call i8* @llvm.framerecover(i8* bitcast (void ()* @"\01?f@@YAXXZ" to i8*), i8* %1, i32 0)
+ %.i8 = call i8* @llvm.localrecover(i8* bitcast (void ()* @"\01?f@@YAXXZ" to i8*), i8* %1, i32 0)
%bc2 = bitcast i8* %.i8 to i32**
%bc3 = bitcast i32** %bc2 to i8*
invoke void @"\01?may_throw@@YAXXZ"()
@@ -51,14 +49,14 @@ lpad1: ; preds = %entry
; CHECK-LABEL: "?f@@YAXXZ.catch":
; No code should be generated for the indirectbr.
-; CHECK-NOT: jmpq *
-; CHECK: .seh_handlerdata
-; CHECK: .long ("$cppxdata$?f@@YAXXZ")@IMGREL
+; CHECK-NOT: jmp{{[ql]}} *
+; X64: .seh_handlerdata
+; X64-NEXT: .long ("$cppxdata$?f@@YAXXZ")@IMGREL
define internal i8* @"\01?f@@YAXXZ.catch1"(i8*, i8*) #4 personality i8* bitcast (i32 (...)* @__CxxFrameHandler3 to i8*) {
entry:
- %.i8 = call i8* @llvm.framerecover(i8* bitcast (void ()* @"\01?f@@YAXXZ" to i8*), i8* %1, i32 1)
+ %.i8 = call i8* @llvm.localrecover(i8* bitcast (void ()* @"\01?f@@YAXXZ" to i8*), i8* %1, i32 1)
%2 = bitcast i8* %.i8 to double*
%3 = bitcast double* %2 to i8*
invoke void () @llvm.donothing()
@@ -76,11 +74,11 @@ lpad: ; preds = %entry
; CHECK-LABEL: "?f@@YAXXZ.catch1":
; No code should be generated for the indirectbr.
-; CHECK-NOT: jmpq *
-; CHECK: ".L?f@@YAXXZ.catch1$parent_frame_offset" = 16
-; CHECK: movq %rdx, 16(%rsp)
-; CHECK: .seh_handlerdata
-; CHECK: .long ("$cppxdata$?f@@YAXXZ")@IMGREL
+; CHECK-NOT: jmp{{[ql]}} *
+; X64: ".L?f@@YAXXZ.catch1$parent_frame_offset" = 16
+; X64: movq %rdx, 16(%rsp)
+; X64: .seh_handlerdata
+; X64: .long ("$cppxdata$?f@@YAXXZ")@IMGREL
define void @"\01?f@@YAXXZ"() #0 personality i8* bitcast (i32 (...)* @__CxxFrameHandler3 to i8*) {
entry:
@@ -88,7 +86,7 @@ entry:
%ehselector.slot = alloca i32
%0 = alloca i32*, align 8
%1 = alloca double, align 8
- call void (...) @llvm.frameescape(i32** %0, double* %1)
+ call void (...) @llvm.localescape(i32** %0, double* %1)
invoke void @"\01?may_throw@@YAXXZ"()
to label %invoke.cont unwind label %lpad2
@@ -118,20 +116,38 @@ try.cont8: ; preds = %lpad2, %try.cont
; CHECK-LABEL: "?f@@YAXXZ":
; No code should be generated for the indirectbr.
-; CHECK-NOT: jmpq *
-; CHECK: .seh_handlerdata
-; CHECK-NEXT: .long ("$cppxdata$?f@@YAXXZ")@IMGREL
-; CHECK-NEXT:"$cppxdata$?f@@YAXXZ":
-; CHECK-NEXT: .long 429065506
-; CHECK-NEXT: .long 4
-; CHECK-NEXT: .long ("$stateUnwindMap$?f@@YAXXZ")@IMGREL
-; CHECK-NEXT: .long 2
-; CHECK-NEXT: .long ("$tryMap$?f@@YAXXZ")@IMGREL
-; CHECK-NEXT: .long 6
-; CHECK-NEXT: .long ("$ip2state$?f@@YAXXZ")@IMGREL
-; CHECK-NEXT: .long 32
-; CHECK-NEXT: .long 0
-; CHECK-NEXT: .long 1
+; CHECK-NOT: jmp{{[ql]}} *
+
+; X64: .seh_handlerdata
+; X64-NEXT: .long ("$cppxdata$?f@@YAXXZ")@IMGREL
+; X86: .section .xdata,"dr"
+
+; CHECK: .align 4
+
+; X64: "$cppxdata$?f@@YAXXZ":
+; X64-NEXT: .long 429065506
+; X64-NEXT: .long 4
+; X64-NEXT: .long ("$stateUnwindMap$?f@@YAXXZ")@IMGREL
+; X64-NEXT: .long 2
+; X64-NEXT: .long ("$tryMap$?f@@YAXXZ")@IMGREL
+; X64-NEXT: .long 6
+; X64-NEXT: .long ("$ip2state$?f@@YAXXZ")@IMGREL
+; X64-NEXT: .long 32
+; X64-NEXT: .long 0
+; X64-NEXT: .long 1
+
+; X86: "L__ehtable$?f@@YAXXZ":
+; X86-NEXT: .long 429065506
+; X86-NEXT: .long 4
+; X86-NEXT: .long ("$stateUnwindMap$?f@@YAXXZ")
+; X86-NEXT: .long 2
+; X86-NEXT: .long ("$tryMap$?f@@YAXXZ")
+; X86-NEXT: .long 0
+; X86-NEXT: .long 0
+; X86-NEXT: .long 0
+; X86-NEXT: .long 1
+
+
; CHECK-NEXT:"$stateUnwindMap$?f@@YAXXZ":
; CHECK-NEXT: .long -1
; CHECK-NEXT: .long 0
@@ -146,37 +162,43 @@ try.cont8: ; preds = %lpad2, %try.cont
; CHECK-NEXT: .long 1
; CHECK-NEXT: .long 2
; CHECK-NEXT: .long 1
-; CHECK-NEXT: .long ("$handlerMap$0$?f@@YAXXZ")@IMGREL
+; CHECK-NEXT: .long ("$handlerMap$0$?f@@YAXXZ")
; CHECK-NEXT: .long 0
; CHECK-NEXT: .long 2
; CHECK-NEXT: .long 3
; CHECK-NEXT: .long 1
-; CHECK-NEXT: .long ("$handlerMap$1$?f@@YAXXZ")@IMGREL
+; CHECK-NEXT: .long ("$handlerMap$1$?f@@YAXXZ")
; CHECK-NEXT:"$handlerMap$0$?f@@YAXXZ":
; CHECK-NEXT: .long 8
-; CHECK-NEXT: .long "??_R0H@8"@IMGREL
-; CHECK-NEXT: .long ".L?f@@YAXXZ$frame_escape_0"
-; CHECK-NEXT: .long "?f@@YAXXZ.catch"@IMGREL
-; CHECK-NEXT: .long ".L?f@@YAXXZ.catch$parent_frame_offset"
+; CHECK-NEXT: .long "??_R0H@8"
+; CHECK-NEXT: .long "{{.?}}L?f@@YAXXZ$frame_escape_0"
+; CHECK-NEXT: .long "?f@@YAXXZ.catch"
+; X64-NEXT: .long ".L?f@@YAXXZ.catch$parent_frame_offset"
; CHECK-NEXT:"$handlerMap$1$?f@@YAXXZ":
; CHECK-NEXT: .long 0
-; CHECK-NEXT: .long "??_R0N@8"@IMGREL
-; CHECK-NEXT: .long ".L?f@@YAXXZ$frame_escape_1"
-; CHECK-NEXT: .long "?f@@YAXXZ.catch1"@IMGREL
-; CHECK-NEXT: .long ".L?f@@YAXXZ.catch1$parent_frame_offset"
-; CHECK-NEXT:"$ip2state$?f@@YAXXZ":
-; CHECK-NEXT: .long .Lfunc_begin0@IMGREL
-; CHECK-NEXT: .long 2
-; CHECK-NEXT: .long .Ltmp0@IMGREL
-; CHECK-NEXT: .long 0
-; CHECK-NEXT: .long .Lfunc_begin1@IMGREL
-; CHECK-NEXT: .long 3
-; CHECK-NEXT: .long .Lfunc_begin2@IMGREL
-; CHECK-NEXT: .long -1
-; CHECK-NEXT: .long .Ltmp13@IMGREL
-; CHECK-NEXT: .long 1
-; CHECK-NEXT: .long .Ltmp16@IMGREL
-; CHECK-NEXT: .long 0
+; CHECK-NEXT: .long "??_R0N@8"
+; CHECK-NEXT: .long "{{.?}}L?f@@YAXXZ$frame_escape_1"
+; CHECK-NEXT: .long "?f@@YAXXZ.catch1"
+; X64-NEXT: .long ".L?f@@YAXXZ.catch1$parent_frame_offset"
+
+; X64-NEXT:"$ip2state$?f@@YAXXZ":
+; X64-NEXT: .long .Lfunc_begin0
+; X64-NEXT: .long 2
+; X64-NEXT: .long .Ltmp0
+; X64-NEXT: .long 0
+; X64-NEXT: .long .Lfunc_begin1
+; X64-NEXT: .long 3
+; X64-NEXT: .long .Lfunc_begin2
+; X64-NEXT: .long -1
+; X64-NEXT: .long .Ltmp13
+; X64-NEXT: .long 1
+; X64-NEXT: .long .Ltmp16
+; X64-NEXT: .long 0
+
+
+; X86: "___ehhandler$?f@@YAXXZ": # @"__ehhandler$?f@@YAXXZ"
+; X86: movl $"L__ehtable$?f@@YAXXZ", %eax
+; X86: jmp ___CxxFrameHandler3 # TAILCALL
declare void @"\01?may_throw@@YAXXZ"() #1
@@ -196,10 +218,10 @@ declare void @llvm.eh.endcatch() #3
declare i8* @llvm.eh.actions(...) #3
; Function Attrs: nounwind
-declare void @llvm.frameescape(...) #3
+declare void @llvm.localescape(...) #3
; Function Attrs: nounwind readnone
-declare i8* @llvm.framerecover(i8*, i8*, i32) #2
+declare i8* @llvm.localrecover(i8*, i8*, i32) #2
declare void @llvm.donothing()
diff --git a/test/CodeGen/WinEH/cppeh-prepared-cleanups.ll b/test/CodeGen/WinEH/cppeh-prepared-cleanups.ll
index 14973023356a..b5cfd65030ab 100644
--- a/test/CodeGen/WinEH/cppeh-prepared-cleanups.ll
+++ b/test/CodeGen/WinEH/cppeh-prepared-cleanups.ll
@@ -30,6 +30,7 @@ $_TI1H = comdat any
; CHECK-LABEL: "?test1@@YAXXZ":
; CHECK: .seh_handlerdata
; CHECK-NEXT: .long ("$cppxdata$?test1@@YAXXZ")@IMGREL
+; CHECK-NEXT: .align 4
; CHECK-NEXT:"$cppxdata$?test1@@YAXXZ":
; CHECK-NEXT: .long 429065506
; CHECK-NEXT: .long 1
@@ -58,7 +59,7 @@ entry:
%ehselector.slot = alloca i32
store i32 0, i32* %tmp
%0 = bitcast i32* %tmp to i8*
- call void (...) @llvm.frameescape()
+ call void (...) @llvm.localescape()
store volatile i64 -2, i64* %unwindhelp
%1 = bitcast i64* %unwindhelp to i8*
call void @llvm.eh.unwindhelp(i8* %1)
@@ -92,6 +93,7 @@ entry:
; CHECK-LABEL: "?test2@@YAX_N@Z":
; CHECK: .seh_handlerdata
; CHECK-NEXT: .long ("$cppxdata$?test2@@YAX_N@Z")@IMGREL
+; CHECK-NEXT: .align 4
; CHECK-NEXT:"$cppxdata$?test2@@YAX_N@Z":
; CHECK-NEXT: .long 429065506
; CHECK-NEXT: .long 2
@@ -126,7 +128,7 @@ define void @"\01?test2@@YAX_N@Z"(i1 zeroext %b) #2 personality i8* bitcast (i32
%s1 = alloca %struct.S, align 1
%frombool = zext i1 %b to i8
store i8 %frombool, i8* %b.addr, align 1
- call void (...) @llvm.frameescape(%struct.S* %s, %struct.S* %s1)
+ call void (...) @llvm.localescape(%struct.S* %s, %struct.S* %s1)
call void @"\01?may_throw@@YAXXZ"()
invoke void @"\01?may_throw@@YAXXZ"()
to label %invoke.cont unwind label %lpad1
@@ -188,17 +190,17 @@ entry:
}
; Function Attrs: nounwind
-declare void @llvm.frameescape(...) #4
+declare void @llvm.localescape(...) #4
; Function Attrs: nounwind readnone
-declare i8* @llvm.framerecover(i8*, i8*, i32) #6
+declare i8* @llvm.localrecover(i8*, i8*, i32) #6
; Function Attrs: nounwind
declare void @llvm.eh.unwindhelp(i8*) #4
define internal void @"\01?test2@@YAX_N@Z.cleanup"(i8*, i8*) #7 personality i8* bitcast (i32 (...)* @__CxxFrameHandler3 to i8*) {
entry:
- %s.i8 = call i8* @llvm.framerecover(i8* bitcast (void (i1)* @"\01?test2@@YAX_N@Z" to i8*), i8* %1, i32 0)
+ %s.i8 = call i8* @llvm.localrecover(i8* bitcast (void (i1)* @"\01?test2@@YAX_N@Z" to i8*), i8* %1, i32 0)
%s = bitcast i8* %s.i8 to %struct.S*
call void @"\01??_DS@@QEAA@XZ"(%struct.S* %s) #4
invoke void @llvm.donothing()
@@ -215,7 +217,7 @@ stub: ; preds = %entry
define internal void @"\01?test2@@YAX_N@Z.cleanup1"(i8*, i8*) #7 personality i8* bitcast (i32 (...)* @__CxxFrameHandler3 to i8*) {
entry:
- %s1.i8 = call i8* @llvm.framerecover(i8* bitcast (void (i1)* @"\01?test2@@YAX_N@Z" to i8*), i8* %1, i32 1)
+ %s1.i8 = call i8* @llvm.localrecover(i8* bitcast (void (i1)* @"\01?test2@@YAX_N@Z" to i8*), i8* %1, i32 1)
%s1 = bitcast i8* %s1.i8 to %struct.S*
call void @"\01??_DS@@QEAA@XZ"(%struct.S* %s1) #4
invoke void @llvm.donothing()
diff --git a/test/CodeGen/WinEH/cppeh-shared-empty-catch.ll b/test/CodeGen/WinEH/cppeh-shared-empty-catch.ll
index 678ea6f8ba13..87ccc9d9dedd 100644
--- a/test/CodeGen/WinEH/cppeh-shared-empty-catch.ll
+++ b/test/CodeGen/WinEH/cppeh-shared-empty-catch.ll
@@ -30,7 +30,7 @@ $"\01??_R0H@8" = comdat any
; CHECK-LABEL: define void @"\01?f@@YAXXZ"()
; CHECK: entry:
-; CHECK: call void (...) @llvm.frameescape()
+; CHECK: call void (...) @llvm.localescape()
; CHECK: invoke void @"\01?g@@YAXXZ"()
; Function Attrs: nounwind
diff --git a/test/CodeGen/WinEH/cppeh-similar-catch-blocks.ll b/test/CodeGen/WinEH/cppeh-similar-catch-blocks.ll
index 5b974508bc11..092135368158 100644
--- a/test/CodeGen/WinEH/cppeh-similar-catch-blocks.ll
+++ b/test/CodeGen/WinEH/cppeh-similar-catch-blocks.ll
@@ -86,7 +86,7 @@ $"\01??_C@_03PMGGPEJJ@?$CFd?6?$AA@" = comdat any
; This is just a minimal check to verify that main was handled by WinEHPrepare.
; CHECK: define i32 @main()
; CHECK: entry:
-; CHECK: call void (...) @llvm.frameescape(i32* [[X_PTR:\%.+]], i32* [[X2_PTR:\%.+]], i8* [[C2_PTR:\%.+]], i8* [[C3_PTR:\%.+]], i8* [[C_PTR:\%.+]])
+; CHECK: call void (...) @llvm.localescape(i32* [[X_PTR:\%.+]], i32* [[X2_PTR:\%.+]], i8* [[C2_PTR:\%.+]], i8* [[C3_PTR:\%.+]], i8* [[C_PTR:\%.+]])
; CHECK: invoke void @_CxxThrowException
; CHECK: }
diff --git a/test/CodeGen/WinEH/cppeh-state-calc-1.ll b/test/CodeGen/WinEH/cppeh-state-calc-1.ll
index 1e71f8f38271..abc5d5292cf7 100644
--- a/test/CodeGen/WinEH/cppeh-state-calc-1.ll
+++ b/test/CodeGen/WinEH/cppeh-state-calc-1.ll
@@ -79,7 +79,7 @@ entry:
call void @"\01?two@@YAXXZ"() #3
store i32 2, i32* %tmp
%0 = bitcast i32* %tmp to i8*
- call void (...) @llvm.frameescape(i32* %x, i8* %c, i32* %x21)
+ call void (...) @llvm.localescape(i32* %x, i8* %c, i32* %x21)
invoke void @_CxxThrowException(i8* %0, %eh.ThrowInfo* @_TI1H) #5
to label %unreachable unwind label %lpad
@@ -166,7 +166,7 @@ declare i8* @llvm.eh.actions(...) #3
define internal i8* @"\01?test@@YAXXZ.catch"(i8*, i8*) #4 personality i8* bitcast (i32 (...)* @__CxxFrameHandler3 to i8*) {
entry:
- %x.i8 = call i8* @llvm.framerecover(i8* bitcast (void ()* @"\01?test@@YAXXZ" to i8*), i8* %1, i32 0)
+ %x.i8 = call i8* @llvm.localrecover(i8* bitcast (void ()* @"\01?test@@YAXXZ" to i8*), i8* %1, i32 0)
%x = bitcast i8* %x.i8 to i32*
%2 = bitcast i32* %x to i8*
call void @"\01?catch_two@@YAXXZ"() #3
@@ -204,7 +204,7 @@ stub: ; preds = %entry
define internal i8* @"\01?test@@YAXXZ.catch2"(i8*, i8*) #4 personality i8* bitcast (i32 (...)* @__CxxFrameHandler3 to i8*) {
entry:
- %x21.i8 = call i8* @llvm.framerecover(i8* bitcast (void ()* @"\01?test@@YAXXZ" to i8*), i8* %1, i32 2)
+ %x21.i8 = call i8* @llvm.localrecover(i8* bitcast (void ()* @"\01?test@@YAXXZ" to i8*), i8* %1, i32 2)
%x21 = bitcast i8* %x21.i8 to i32*
%2 = bitcast i32* %x21 to i8*
call void @"\01?catch_one@@YAXXZ"() #3
@@ -238,10 +238,10 @@ stub: ; preds = %entry
}
; Function Attrs: nounwind
-declare void @llvm.frameescape(...) #3
+declare void @llvm.localescape(...) #3
; Function Attrs: nounwind readnone
-declare i8* @llvm.framerecover(i8*, i8*, i32) #2
+declare i8* @llvm.localrecover(i8*, i8*, i32) #2
attributes #0 = { nounwind uwtable "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+sse,+sse2" "unsafe-fp-math"="false" "use-soft-float"="false" "wineh-parent"="?test@@YAXXZ" }
attributes #1 = { nounwind "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+sse,+sse2" "unsafe-fp-math"="false" "use-soft-float"="false" }
diff --git a/test/CodeGen/WinEH/seh-exception-code.ll b/test/CodeGen/WinEH/seh-exception-code.ll
new file mode 100644
index 000000000000..2998e7982133
--- /dev/null
+++ b/test/CodeGen/WinEH/seh-exception-code.ll
@@ -0,0 +1,66 @@
+; RUN: opt -winehprepare -S < %s | FileCheck %s
+
+; WinEHPrepare was crashing during phi demotion.
+
+target datalayout = "e-m:w-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-pc-windows-msvc18.0.0"
+
+declare i32 @__C_specific_handler(...)
+
+@str = linkonce_odr unnamed_addr constant [16 x i8] c"caught it! %lx\0A\00", align 1
+
+; Function Attrs: nounwind uwtable
+declare void @maycrash()
+
+; Function Attrs: nounwind
+declare i32 @printf(i8* nocapture readonly, ...)
+
+; Function Attrs: nounwind uwtable
+define void @doit() personality i8* bitcast (i32 (...)* @__C_specific_handler to i8*) {
+entry:
+ invoke void @maycrash()
+ to label %invoke.cont unwind label %lpad
+
+invoke.cont: ; preds = %entry
+ invoke void @maycrash()
+ to label %__try.cont unwind label %lpad.1
+
+lpad: ; preds = %entry
+ %0 = landingpad { i8*, i32 }
+ catch i8* null
+ %1 = extractvalue { i8*, i32 } %0, 0
+ br label %__except
+
+lpad.1: ; preds = %invoke.cont, %lpad
+ %2 = landingpad { i8*, i32 }
+ catch i8* null
+ %3 = extractvalue { i8*, i32 } %2, 0
+ br label %__except
+
+__except: ; preds = %lpad, %lpad.1
+ %exn.slot.0 = phi i8* [ %3, %lpad.1 ], [ %1, %lpad ]
+ %4 = ptrtoint i8* %exn.slot.0 to i64
+ %5 = trunc i64 %4 to i32
+ %call = tail call i32 (i8*, ...) @printf(i8* getelementptr inbounds ([16 x i8], [16 x i8]* @str, i64 0, i64 0), i32 %5)
+ br label %__try.cont
+
+__try.cont: ; preds = %invoke.cont, %__except
+ ret void
+}
+
+; CHECK-LABEL: define void @doit()
+; CHECK: landingpad
+; CHECK: indirectbr i8* %{{[^,]*}}, [label %[[except_split1:.*]]]
+; CHECK: [[except_split1]]:
+; CHECK: call i32 @llvm.eh.exceptioncode()
+; CHECK: br label %__except
+;
+; CHECK: landingpad
+; CHECK: indirectbr i8* %{{[^,]*}}, [label %[[except_split2:.*]]]
+; CHECK: [[except_split2]]:
+; CHECK: call i32 @llvm.eh.exceptioncode()
+; CHECK: br label %__except
+;
+; CHECK: __except:
+; CHECK: phi
+; CHECK: call i32 (i8*, ...) @printf
diff --git a/test/CodeGen/WinEH/seh-exception-code2.ll b/test/CodeGen/WinEH/seh-exception-code2.ll
new file mode 100644
index 000000000000..0356956502c0
--- /dev/null
+++ b/test/CodeGen/WinEH/seh-exception-code2.ll
@@ -0,0 +1,91 @@
+; RUN: opt -winehprepare -S < %s | FileCheck %s
+
+; WinEHPrepare was crashing during phi demotion.
+
+target datalayout = "e-m:w-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-pc-windows-msvc18.0.0"
+
+declare i32 @__C_specific_handler(...)
+
+@str = linkonce_odr unnamed_addr constant [16 x i8] c"caught it! %lx\0A\00", align 1
+
+declare void @maycrash()
+declare void @finally(i1 %abnormal)
+declare i32 @printf(i8* nocapture readonly, ...)
+declare i32 @llvm.eh.typeid.for(i8*)
+
+; Function Attrs: nounwind uwtable
+define void @doit() personality i8* bitcast (i32 (...)* @__C_specific_handler to i8*) {
+entry:
+ invoke void @maycrash()
+ to label %invoke.cont unwind label %lpad.1
+
+invoke.cont: ; preds = %entry
+ invoke void @maycrash()
+ to label %__try.cont unwind label %lpad
+
+lpad: ; preds = %entry
+ %lp0 = landingpad { i8*, i32 }
+ cleanup
+ catch i8* bitcast (i32 (i8*, i8*)* @"\01?filt$0@0@doit@@" to i8*)
+ %ehptr.0 = extractvalue { i8*, i32 } %lp0, 0
+ %ehsel.0 = extractvalue { i8*, i32 } %lp0, 1
+ call void @finally(i1 true)
+ br label %ehdispatch
+
+lpad.1: ; preds = %invoke.cont, %lpad
+ %lp1 = landingpad { i8*, i32 }
+ catch i8* bitcast (i32 (i8*, i8*)* @"\01?filt$0@0@doit@@" to i8*)
+ %ehptr.1 = extractvalue { i8*, i32 } %lp1, 0
+ %ehsel.1 = extractvalue { i8*, i32 } %lp1, 1
+ br label %ehdispatch
+
+ehdispatch:
+ %ehptr.2 = phi i8* [ %ehptr.0, %lpad ], [ %ehptr.1, %lpad.1 ]
+ %ehsel.2 = phi i32 [ %ehsel.0, %lpad ], [ %ehsel.1, %lpad.1 ]
+ %mysel = call i32 @llvm.eh.typeid.for(i8* bitcast (i32 (i8*, i8*)* @"\01?filt$0@0@doit@@" to i8*))
+ %matches = icmp eq i32 %ehsel.2, %mysel
+ br i1 %matches, label %__except, label %eh.resume
+
+__except: ; preds = %lpad, %lpad.1
+ %t4 = ptrtoint i8* %ehptr.2 to i64
+ %t5 = trunc i64 %t4 to i32
+ %call = tail call i32 (i8*, ...) @printf(i8* getelementptr inbounds ([16 x i8], [16 x i8]* @str, i64 0, i64 0), i32 %t5)
+ br label %__try.cont
+
+__try.cont: ; preds = %invoke.cont, %__except
+ call void @finally(i1 false)
+ ret void
+
+eh.resume:
+ %ehvals0 = insertvalue { i8*, i32 } undef, i8* %ehptr.2, 0
+ %ehvals = insertvalue { i8*, i32 } %ehvals0, i32 %ehsel.2, 1
+ resume { i8*, i32 } %ehvals
+}
+
+define internal i32 @"\01?filt$0@0@doit@@"(i8* %exception_pointers, i8* %frame_pointer) #1 {
+entry:
+ %0 = bitcast i8* %exception_pointers to { i32*, i8* }*
+ %1 = getelementptr inbounds { i32*, i8* }, { i32*, i8* }* %0, i32 0, i32 0
+ %2 = load i32*, i32** %1
+ %3 = load i32, i32* %2
+ %cmp = icmp eq i32 %3, -1073741819
+ %4 = zext i1 %cmp to i32
+ ret i32 %4
+}
+
+; CHECK-LABEL: define void @doit()
+; CHECK: %lp0 = landingpad { i8*, i32 }
+; CHECK-NEXT: cleanup
+; CHECK-NEXT: catch i8*
+; CHECK-NEXT: call i8* (...) @llvm.eh.actions({{.*}})
+; CHECK-NEXT: indirectbr i8* %{{[^,]*}}, [label %__except]
+;
+; CHECK: %lp1 = landingpad { i8*, i32 }
+; CHECK-NEXT: catch i8*
+; CHECK-NEXT: call i8* (...) @llvm.eh.actions({{.*}})
+; CHECK-NEXT: indirectbr i8* %{{[^,]*}}, [label %__except]
+;
+; CHECK: __except:
+; CHECK: call i32 @llvm.eh.exceptioncode()
+; CHECK: call i32 (i8*, ...) @printf
diff --git a/test/CodeGen/WinEH/seh-inlined-finally.ll b/test/CodeGen/WinEH/seh-inlined-finally.ll
index 5943cb77cee2..157adf0c8183 100644
--- a/test/CodeGen/WinEH/seh-inlined-finally.ll
+++ b/test/CodeGen/WinEH/seh-inlined-finally.ll
@@ -13,9 +13,9 @@ target triple = "x86_64-pc-windows-msvc"
declare i32 @puts(i8*)
declare void @may_crash()
declare i32 @__C_specific_handler(...)
-declare i8* @llvm.framerecover(i8*, i8*, i32) #1
-declare i8* @llvm.frameaddress(i32)
-declare void @llvm.frameescape(...)
+declare i8* @llvm.localrecover(i8*, i8*, i32) #1
+declare i8* @llvm.localaddress()
+declare void @llvm.localescape(...)
declare dllimport void @EnterCriticalSection(%struct._RTL_CRITICAL_SECTION*)
declare dllimport void @LeaveCriticalSection(%struct._RTL_CRITICAL_SECTION*)
@@ -47,14 +47,14 @@ lpad: ; preds = %entry
define i32 @call_may_crash_locked() personality i8* bitcast (i32 (...)* @__C_specific_handler to i8*) {
entry:
%p = alloca %struct._RTL_CRITICAL_SECTION, align 8
- call void (...) @llvm.frameescape(%struct._RTL_CRITICAL_SECTION* %p)
+ call void (...) @llvm.localescape(%struct._RTL_CRITICAL_SECTION* %p)
call void @EnterCriticalSection(%struct._RTL_CRITICAL_SECTION* %p)
invoke void @may_crash()
to label %invoke.cont unwind label %lpad
invoke.cont: ; preds = %entry
- %tmp2 = call i8* @llvm.frameaddress(i32 0)
- %tmp3 = call i8* @llvm.framerecover(i8* bitcast (i32 ()* @call_may_crash_locked to i8*), i8* %tmp2, i32 0) #2
+ %tmp2 = call i8* @llvm.localaddress()
+ %tmp3 = call i8* @llvm.localrecover(i8* bitcast (i32 ()* @call_may_crash_locked to i8*), i8* %tmp2, i32 0) #2
%tmp6 = bitcast i8* %tmp3 to %struct._RTL_CRITICAL_SECTION*
call void @LeaveCriticalSection(%struct._RTL_CRITICAL_SECTION* %tmp6)
ret i32 42
@@ -62,8 +62,8 @@ invoke.cont: ; preds = %entry
lpad: ; preds = %entry
%tmp7 = landingpad { i8*, i32 }
cleanup
- %tmp8 = call i8* @llvm.frameaddress(i32 0)
- %tmp9 = call i8* @llvm.framerecover(i8* bitcast (i32 ()* @call_may_crash_locked to i8*), i8* %tmp8, i32 0)
+ %tmp8 = call i8* @llvm.localaddress()
+ %tmp9 = call i8* @llvm.localrecover(i8* bitcast (i32 ()* @call_may_crash_locked to i8*), i8* %tmp8, i32 0)
%tmp12 = bitcast i8* %tmp9 to %struct._RTL_CRITICAL_SECTION*
call void @LeaveCriticalSection(%struct._RTL_CRITICAL_SECTION* %tmp12)
resume { i8*, i32 } %tmp7
@@ -78,6 +78,6 @@ lpad: ; preds = %entry
; CHECK-NEXT: indirectbr i8* %recover, []
; CHECK-LABEL: define internal void @call_may_crash_locked.cleanup(i8*, i8*)
-; CHECK: %tmp9 = call i8* @llvm.framerecover(i8* bitcast (i32 ()* @call_may_crash_locked to i8*), i8* %1, i32 0)
+; CHECK: %tmp9 = call i8* @llvm.localrecover(i8* bitcast (i32 ()* @call_may_crash_locked to i8*), i8* %1, i32 0)
; CHECK: %tmp12 = bitcast i8* %tmp9 to %struct._RTL_CRITICAL_SECTION*
; CHECK: call void @LeaveCriticalSection(%struct._RTL_CRITICAL_SECTION* %tmp12)
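; ---------------------------------------------------------------------------
; A minimal sketch of how the renamed intrinsics exercised above pair up. This
; module is illustrative only and is not part of the patch; @sketch_parent and
; @sketch_helper are hypothetical names, and the declarations are the same ones
; shown in seh-inlined-finally.ll above.

declare void @llvm.localescape(...)
declare i8* @llvm.localrecover(i8*, i8*, i32)
declare i8* @llvm.localaddress()

define void @sketch_parent() {
entry:
  %slot = alloca i32
  ; Register %slot as escaped local number 0 of @sketch_parent.
  call void (...) @llvm.localescape(i32* %slot)
  ; Within the parent, the frame pointer is obtained via @llvm.localaddress.
  %fp = call i8* @llvm.localaddress()
  call void @sketch_helper(i8* %fp)
  ret void
}

define void @sketch_helper(i8* %fp) {
entry:
  ; Recover escaped local number 0 of @sketch_parent through its frame pointer,
  ; then use it as an ordinary i32 slot.
  %raw = call i8* @llvm.localrecover(i8* bitcast (void ()* @sketch_parent to i8*), i8* %fp, i32 0)
  %slot = bitcast i8* %raw to i32*
  store i32 42, i32* %slot
  ret void
}
; ---------------------------------------------------------------------------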
diff --git a/test/CodeGen/WinEH/seh-outlined-finally-win32.ll b/test/CodeGen/WinEH/seh-outlined-finally-win32.ll
new file mode 100644
index 000000000000..3649433c4b61
--- /dev/null
+++ b/test/CodeGen/WinEH/seh-outlined-finally-win32.ll
@@ -0,0 +1,172 @@
+; RUN: opt -S -winehprepare < %s | FileCheck %s
+
+; Test case based on this code:
+;
+; extern "C" int _abnormal_termination();
+; #pragma intrinsic(_abnormal_termination)
+; extern "C" int printf(const char *, ...);
+; extern "C" void may_crash() {
+; *(volatile int *)0 = 42;
+; }
+; int main() {
+; int myres = 0;
+; __try {
+; __try {
+; may_crash();
+; } __finally {
+; printf("inner finally %d\n", _abnormal_termination());
+; may_crash();
+; }
+; } __finally {
+; printf("outer finally %d\n", _abnormal_termination());
+; }
+; }
+;
+; Note that if the inner finally crashes, the outer finally still runs. There
+; is nothing like a std::terminate call in this situation.
+
+target datalayout = "e-m:x-p:32:32-i64:64-f80:32-n8:16:32-a:0:32-S32"
+target triple = "i686-pc-windows-msvc"
+
+$"\01??_C@_0BC@LHHILCPN@outer?5finally?5?$CFd?6?$AA@" = comdat any
+
+$"\01??_C@_0BC@JELAHKN@inner?5finally?5?$CFd?6?$AA@" = comdat any
+
+@"\01??_C@_0BC@LHHILCPN@outer?5finally?5?$CFd?6?$AA@" = linkonce_odr unnamed_addr constant [18 x i8] c"outer finally %d\0A\00", comdat, align 1
+@"\01??_C@_0BC@JELAHKN@inner?5finally?5?$CFd?6?$AA@" = linkonce_odr unnamed_addr constant [18 x i8] c"inner finally %d\0A\00", comdat, align 1
+
+; Function Attrs: nounwind
+define void @may_crash() #0 {
+entry:
+ store volatile i32 42, i32* null, align 4
+ ret void
+}
+
+; Function Attrs: nounwind
+define i32 @main() #0 personality i8* bitcast (i32 (...)* @_except_handler3 to i8*) {
+entry:
+ %myres = alloca i32, align 4
+ %exn.slot = alloca i8*
+ %ehselector.slot = alloca i32
+ store i32 0, i32* %myres, align 4
+ invoke void @may_crash() #4
+ to label %invoke.cont unwind label %lpad
+
+invoke.cont: ; preds = %entry
+ %0 = call i8* @llvm.frameaddress(i32 0)
+ invoke void @"\01?fin$1@0@main@@"(i8 zeroext 0, i8* %0) #4
+ to label %invoke.cont.2 unwind label %lpad.1
+
+invoke.cont.2: ; preds = %invoke.cont
+ %1 = call i8* @llvm.frameaddress(i32 0)
+ call void @"\01?fin$0@0@main@@"(i8 zeroext 0, i8* %1)
+ ret i32 0
+
+lpad: ; preds = %entry
+ %2 = landingpad { i8*, i32 }
+ cleanup
+ %3 = extractvalue { i8*, i32 } %2, 0
+ store i8* %3, i8** %exn.slot
+ %4 = extractvalue { i8*, i32 } %2, 1
+ store i32 %4, i32* %ehselector.slot
+ %5 = call i8* @llvm.frameaddress(i32 0)
+ invoke void @"\01?fin$1@0@main@@"(i8 zeroext 1, i8* %5) #4
+ to label %invoke.cont.3 unwind label %lpad.1
+
+lpad.1: ; preds = %lpad, %invoke.cont
+ %6 = landingpad { i8*, i32 }
+ cleanup
+ %7 = extractvalue { i8*, i32 } %6, 0
+ store i8* %7, i8** %exn.slot
+ %8 = extractvalue { i8*, i32 } %6, 1
+ store i32 %8, i32* %ehselector.slot
+ br label %ehcleanup
+
+invoke.cont.3: ; preds = %lpad
+ br label %ehcleanup
+
+ehcleanup: ; preds = %invoke.cont.3, %lpad.1
+ %9 = call i8* @llvm.frameaddress(i32 0)
+ call void @"\01?fin$0@0@main@@"(i8 zeroext 1, i8* %9)
+ br label %eh.resume
+
+eh.resume: ; preds = %ehcleanup
+ %exn = load i8*, i8** %exn.slot
+ %sel = load i32, i32* %ehselector.slot
+ %lpad.val = insertvalue { i8*, i32 } undef, i8* %exn, 0
+ %lpad.val.4 = insertvalue { i8*, i32 } %lpad.val, i32 %sel, 1
+ resume { i8*, i32 } %lpad.val.4
+}
+
+; CHECK-LABEL: define i32 @main()
+; CHECK: invoke void @may_crash()
+;
+; CHECK: landingpad { i8*, i32 }
+; CHECK-NEXT: cleanup
+; CHECK-NEXT: call i8* (...) @llvm.eh.actions(i32 0, void ()* @main.cleanup)
+; CHECK-NEXT: indirectbr
+;
+; CHECK: landingpad { i8*, i32 }
+; CHECK-NEXT: cleanup
+; CHECK-NEXT: call i8* (...) @llvm.eh.actions(i32 0, void ()* @main.cleanup.1)
+; CHECK-NEXT: indirectbr
+
+; CHECK-LABEL: define internal void @main.cleanup()
+; CHECK: call i8* @llvm.frameaddress(i32 1)
+; CHECK: call i8* @llvm.x86.seh.recoverfp(i8* bitcast (i32 ()* @main to i8*), i8* %{{.*}})
+; CHECK: call void @"\01?fin$1@0@main@@"(i8 zeroext 1, i8* %{{.*}})
+; CHECK: call void @"\01?fin$0@0@main@@"(i8 zeroext 1, i8* %{{.*}})
+
+; CHECK-LABEL: define internal void @main.cleanup.1()
+; CHECK: call i8* @llvm.frameaddress(i32 1)
+; CHECK: call i8* @llvm.x86.seh.recoverfp(i8* bitcast (i32 ()* @main to i8*), i8* %{{.*}})
+; CHECK: call void @"\01?fin$0@0@main@@"(i8 zeroext 1, i8* %{{.*}})
+
+; Function Attrs: noinline nounwind
+define internal void @"\01?fin$0@0@main@@"(i8 zeroext %abnormal_termination, i8* %frame_pointer) #1 {
+entry:
+ %frame_pointer.addr = alloca i8*, align 4
+ %abnormal_termination.addr = alloca i8, align 1
+ %0 = call i8* @llvm.frameaddress(i32 1)
+ %1 = call i8* @llvm.x86.seh.recoverfp(i8* bitcast (i32 ()* @main to i8*), i8* %0)
+ store i8* %frame_pointer, i8** %frame_pointer.addr, align 4
+ store i8 %abnormal_termination, i8* %abnormal_termination.addr, align 1
+ %2 = zext i8 %abnormal_termination to i32
+ %call = call i32 (i8*, ...) @printf(i8* getelementptr inbounds ([18 x i8], [18 x i8]* @"\01??_C@_0BC@LHHILCPN@outer?5finally?5?$CFd?6?$AA@", i32 0, i32 0), i32 %2)
+ ret void
+}
+
+; Function Attrs: nounwind readnone
+declare i8* @llvm.frameaddress(i32) #2
+
+; Function Attrs: nounwind readnone
+declare i8* @llvm.x86.seh.recoverfp(i8*, i8*) #2
+
+declare i32 @printf(i8*, ...) #3
+
+; Function Attrs: noinline nounwind
+define internal void @"\01?fin$1@0@main@@"(i8 zeroext %abnormal_termination, i8* %frame_pointer) #1 {
+entry:
+ %frame_pointer.addr = alloca i8*, align 4
+ %abnormal_termination.addr = alloca i8, align 1
+ %0 = call i8* @llvm.frameaddress(i32 1)
+ %1 = call i8* @llvm.x86.seh.recoverfp(i8* bitcast (i32 ()* @main to i8*), i8* %0)
+ store i8* %frame_pointer, i8** %frame_pointer.addr, align 4
+ store i8 %abnormal_termination, i8* %abnormal_termination.addr, align 1
+ %2 = zext i8 %abnormal_termination to i32
+ %call = call i32 (i8*, ...) @printf(i8* getelementptr inbounds ([18 x i8], [18 x i8]* @"\01??_C@_0BC@JELAHKN@inner?5finally?5?$CFd?6?$AA@", i32 0, i32 0), i32 %2)
+ call void @may_crash()
+ ret void
+}
+
+declare i32 @_except_handler3(...)
+
+attributes #0 = { nounwind "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-realign-stack" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #1 = { noinline nounwind "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-realign-stack" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #2 = { nounwind readnone }
+attributes #3 = { "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-realign-stack" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #4 = { noinline }
+
+!llvm.ident = !{!0}
+
+!0 = !{!"clang version 3.7.0 "}
diff --git a/test/CodeGen/WinEH/seh-outlined-finally.ll b/test/CodeGen/WinEH/seh-outlined-finally.ll
index 3c27212192dd..529f85b9602b 100644
--- a/test/CodeGen/WinEH/seh-outlined-finally.ll
+++ b/test/CodeGen/WinEH/seh-outlined-finally.ll
@@ -49,12 +49,12 @@ entry:
to label %invoke.cont unwind label %lpad
invoke.cont: ; preds = %entry
- %0 = call i8* @llvm.frameaddress(i32 0)
+ %0 = call i8* @llvm.localaddress()
invoke void @"\01?fin$1@0@main@@"(i1 zeroext false, i8* %0) #4
to label %invoke.cont2 unwind label %lpad1
invoke.cont2: ; preds = %invoke.cont
- %1 = call i8* @llvm.frameaddress(i32 0)
+ %1 = call i8* @llvm.localaddress()
call void @"\01?fin$0@0@main@@"(i1 zeroext false, i8* %1)
ret i32 0
@@ -65,7 +65,7 @@ lpad: ; preds = %entry
store i8* %3, i8** %exn.slot
%4 = extractvalue { i8*, i32 } %2, 1
store i32 %4, i32* %ehselector.slot
- %5 = call i8* @llvm.frameaddress(i32 0)
+ %5 = call i8* @llvm.localaddress()
invoke void @"\01?fin$1@0@main@@"(i1 zeroext true, i8* %5) #4
to label %invoke.cont3 unwind label %lpad1
@@ -82,7 +82,7 @@ invoke.cont3: ; preds = %lpad
br label %ehcleanup
ehcleanup: ; preds = %invoke.cont3, %lpad1
- %9 = call i8* @llvm.frameaddress(i32 0)
+ %9 = call i8* @llvm.localaddress()
call void @"\01?fin$0@0@main@@"(i1 zeroext true, i8* %9)
br label %eh.resume
@@ -146,7 +146,7 @@ entry:
declare i32 @__C_specific_handler(...)
; Function Attrs: nounwind readnone
-declare i8* @llvm.frameaddress(i32) #3
+declare i8* @llvm.localaddress() #3
attributes #0 = { nounwind uwtable "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "unsafe-fp-math"="false" "use-soft-float"="false" }
attributes #1 = { uwtable "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "unsafe-fp-math"="false" "use-soft-float"="false" }
diff --git a/test/CodeGen/WinEH/seh-prepared-basic.ll b/test/CodeGen/WinEH/seh-prepared-basic.ll
index b981dc2d9bd8..b6a30309f1c1 100644
--- a/test/CodeGen/WinEH/seh-prepared-basic.ll
+++ b/test/CodeGen/WinEH/seh-prepared-basic.ll
@@ -17,7 +17,7 @@ target triple = "x86_64-pc-windows-msvc"
; Function Attrs: uwtable
define void @do_except() #0 personality i8* bitcast (i32 (...)* @__C_specific_handler to i8*) {
entry:
- call void (...) @llvm.frameescape()
+ call void (...) @llvm.localescape()
invoke void @g() #5
to label %__try.cont unwind label %lpad1
@@ -64,10 +64,10 @@ declare i32 @llvm.eh.typeid.for(i8*) #3
declare i8* @llvm.eh.actions(...) #4
; Function Attrs: nounwind
-declare void @llvm.frameescape(...) #4
+declare void @llvm.localescape(...) #4
; Function Attrs: nounwind readnone
-declare i8* @llvm.framerecover(i8*, i8*, i32) #3
+declare i8* @llvm.localrecover(i8*, i8*, i32) #3
attributes #0 = { uwtable "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "unsafe-fp-math"="false" "use-soft-float"="false" "wineh-parent"="do_except" }
attributes #1 = { noinline nounwind "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "unsafe-fp-math"="false" "use-soft-float"="false" }
diff --git a/test/CodeGen/WinEH/seh-simple.ll b/test/CodeGen/WinEH/seh-simple.ll
index 98f06ef12c9f..060186484aec 100644
--- a/test/CodeGen/WinEH/seh-simple.ll
+++ b/test/CodeGen/WinEH/seh-simple.ll
@@ -107,6 +107,38 @@ eh.resume:
; CHECK-NEXT: %r = phi i32 [ 0, %entry ], [ 1, %lpad.return_crit_edge ]
; CHECK-NEXT: ret i32 %r
+define i32 @except_join() personality i32 (...)* @__C_specific_handler {
+entry:
+ invoke void @might_crash()
+ to label %return unwind label %lpad
+
+lpad:
+ %ehvals = landingpad { i8*, i32 }
+ catch i32 ()* @filt
+ %sel = extractvalue { i8*, i32 } %ehvals, 1
+ %filt_sel = tail call i32 @llvm.eh.typeid.for(i8* bitcast (i32 ()* @filt to i8*))
+ %matches = icmp eq i32 %sel, %filt_sel
+ br i1 %matches, label %return, label %eh.resume
+
+return:
+ ret i32 0
+
+eh.resume:
+ resume { i8*, i32 } %ehvals
+}
+
+; CHECK-LABEL: define i32 @except_join()
+; CHECK: landingpad { i8*, i32 }
+; CHECK-NEXT: catch i32 ()* @filt
+; CHECK-NEXT: call i8* (...) @llvm.eh.actions(i32 1, i8* bitcast (i32 ()* @filt to i8*), i32 -1, i8* blockaddress(@except_join, %lpad.return_crit_edge))
+; CHECK-NEXT: indirectbr {{.*}} [label %lpad.return_crit_edge]
+;
+; CHECK: lpad.return_crit_edge:
+; CHECK: br label %return
+;
+; CHECK: return:
+; CHECK-NEXT: ret i32 0
+
define i32 @lpad_phi() personality i32 (...)* @__C_specific_handler {
entry:
invoke void @might_crash()
@@ -196,6 +228,6 @@ eh.resume:
; X64-LABEL: define internal void @lpad_phi.cleanup(i8*, i8*)
; X86-LABEL: define internal void @lpad_phi.cleanup()
; X86: call i8* @llvm.frameaddress(i32 1)
-; CHECK: call i8* @llvm.framerecover({{.*}})
+; CHECK: call i8* @llvm.localrecover({{.*}})
; CHECK: load i32
; CHECK: store i32 %{{.*}}, i32*
diff --git a/test/CodeGen/X86/avx-vperm2x128.ll b/test/CodeGen/X86/avx-vperm2x128.ll
index 74d20f348b52..4e43f6f51921 100644
--- a/test/CodeGen/X86/avx-vperm2x128.ll
+++ b/test/CodeGen/X86/avx-vperm2x128.ll
@@ -269,7 +269,7 @@ entry:
define <4 x double> @vperm2z_0x08(<4 x double> %a) {
; ALL-LABEL: vperm2z_0x08:
; ALL: # BB#0:
-; ALL-NEXT: vperm2f128 $40, %ymm0, %ymm0, %ymm0
+; ALL-NEXT: vperm2f128 {{.*#+}} ymm0 = zero,zero,ymm0[0,1]
; ALL-NEXT: retq
%s = shufflevector <4 x double> %a, <4 x double> <double 0.0, double 0.0, double undef, double undef>, <4 x i32> <i32 4, i32 5, i32 0, i32 1>
ret <4 x double> %s
@@ -279,7 +279,7 @@ define <4 x double> @vperm2z_0x18(<4 x double> %a) {
; ALL-LABEL: vperm2z_0x18:
; ALL: # BB#0:
; ALL-NEXT: vxorpd %ymm1, %ymm1, %ymm1
-; ALL-NEXT: vblendpd $12, %ymm0, %ymm1, %ymm0
+; ALL-NEXT: vblendpd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3]
; ALL-NEXT: retq
%s = shufflevector <4 x double> %a, <4 x double> <double 0.0, double 0.0, double undef, double undef>, <4 x i32> <i32 4, i32 5, i32 2, i32 3>
ret <4 x double> %s
@@ -288,7 +288,7 @@ define <4 x double> @vperm2z_0x18(<4 x double> %a) {
define <4 x double> @vperm2z_0x28(<4 x double> %a) {
; ALL-LABEL: vperm2z_0x28:
; ALL: # BB#0:
-; ALL-NEXT: vperm2f128 $40, %ymm0, %ymm0, %ymm0
+; ALL-NEXT: vperm2f128 {{.*#+}} ymm0 = zero,zero,ymm0[0,1]
; ALL-NEXT: retq
%s = shufflevector <4 x double> <double 0.0, double 0.0, double undef, double undef>, <4 x double> %a, <4 x i32> <i32 0, i32 1, i32 4, i32 5>
ret <4 x double> %s
@@ -298,7 +298,7 @@ define <4 x double> @vperm2z_0x38(<4 x double> %a) {
; ALL-LABEL: vperm2z_0x38:
; ALL: # BB#0:
; ALL-NEXT: vxorpd %ymm1, %ymm1, %ymm1
-; ALL-NEXT: vblendpd $12, %ymm0, %ymm1, %ymm0
+; ALL-NEXT: vblendpd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3]
; ALL-NEXT: retq
%s = shufflevector <4 x double> <double 0.0, double 0.0, double undef, double undef>, <4 x double> %a, <4 x i32> <i32 0, i32 1, i32 6, i32 7>
ret <4 x double> %s
@@ -307,7 +307,7 @@ define <4 x double> @vperm2z_0x38(<4 x double> %a) {
define <4 x double> @vperm2z_0x80(<4 x double> %a) {
; ALL-LABEL: vperm2z_0x80:
; ALL: # BB#0:
-; ALL-NEXT: vperm2f128 $128, %ymm0, %ymm0, %ymm0
+; ALL-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[0,1],zero,zero
; ALL-NEXT: retq
%s = shufflevector <4 x double> %a, <4 x double> <double 0.0, double 0.0, double undef, double undef>, <4 x i32> <i32 0, i32 1, i32 4, i32 5>
ret <4 x double> %s
@@ -316,7 +316,7 @@ define <4 x double> @vperm2z_0x80(<4 x double> %a) {
define <4 x double> @vperm2z_0x81(<4 x double> %a) {
; ALL-LABEL: vperm2z_0x81:
; ALL: # BB#0:
-; ALL-NEXT: vperm2f128 $129, %ymm0, %ymm0, %ymm0
+; ALL-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],zero,zero
; ALL-NEXT: retq
%s = shufflevector <4 x double> %a, <4 x double> <double 0.0, double 0.0, double undef, double undef>, <4 x i32> <i32 2, i32 3, i32 4, i32 5>
ret <4 x double> %s
@@ -325,7 +325,7 @@ define <4 x double> @vperm2z_0x81(<4 x double> %a) {
define <4 x double> @vperm2z_0x82(<4 x double> %a) {
; ALL-LABEL: vperm2z_0x82:
; ALL: # BB#0:
-; ALL-NEXT: vperm2f128 $128, %ymm0, %ymm0, %ymm0
+; ALL-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[0,1],zero,zero
; ALL-NEXT: retq
%s = shufflevector <4 x double> <double 0.0, double 0.0, double undef, double undef>, <4 x double> %a, <4 x i32> <i32 4, i32 5, i32 0, i32 1>
ret <4 x double> %s
@@ -334,7 +334,7 @@ define <4 x double> @vperm2z_0x82(<4 x double> %a) {
define <4 x double> @vperm2z_0x83(<4 x double> %a) {
; ALL-LABEL: vperm2z_0x83:
; ALL: # BB#0:
-; ALL-NEXT: vperm2f128 $129, %ymm0, %ymm0, %ymm0
+; ALL-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],zero,zero
; ALL-NEXT: retq
%s = shufflevector <4 x double> <double 0.0, double 0.0, double undef, double undef>, <4 x double> %a, <4 x i32> <i32 6, i32 7, i32 0, i32 1>
ret <4 x double> %s
@@ -345,8 +345,8 @@ define <4 x double> @vperm2z_0x83(<4 x double> %a) {
define <4 x i64> @vperm2z_int_0x83(<4 x i64> %a, <4 x i64> %b) {
; ALL-LABEL: vperm2z_int_0x83:
; ALL: # BB#0:
-; AVX1: vperm2f128 $129, %ymm0, %ymm0, %ymm0
-; AVX2: vperm2i128 $129, %ymm0, %ymm0, %ymm0
+; AVX1: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],zero,zero
+; AVX2: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],zero,zero
%s = shufflevector <4 x i64> <i64 0, i64 0, i64 undef, i64 undef>, <4 x i64> %a, <4 x i32> <i32 6, i32 7, i32 0, i32 1>
%c = add <4 x i64> %b, %s
ret <4 x i64> %c
diff --git a/test/CodeGen/X86/avx512-intrinsics.ll b/test/CodeGen/X86/avx512-intrinsics.ll
index b9f490b8a39a..7642cd4e6c5c 100644
--- a/test/CodeGen/X86/avx512-intrinsics.ll
+++ b/test/CodeGen/X86/avx512-intrinsics.ll
@@ -406,20 +406,6 @@ define <8 x i64> @test_x86_mask_blend_q_512(i8 %a0, <8 x i64> %a1, <8 x i64> %a2
}
declare <8 x i64> @llvm.x86.avx512.mask.blend.q.512(<8 x i64>, <8 x i64>, i8) nounwind readonly
- define <8 x i32> @test_cvtpd2udq(<8 x double> %a) {
- ;CHECK: vcvtpd2udq {ru-sae}{{.*}}encoding: [0x62,0xf1,0xfc,0x58,0x79,0xc0]
- %res = call <8 x i32> @llvm.x86.avx512.mask.cvtpd2udq.512(<8 x double> %a, <8 x i32>zeroinitializer, i8 -1, i32 2)
- ret <8 x i32>%res
- }
- declare <8 x i32> @llvm.x86.avx512.mask.cvtpd2udq.512(<8 x double>, <8 x i32>, i8, i32)
-
- define <16 x i32> @test_cvtps2udq(<16 x float> %a) {
- ;CHECK: vcvtps2udq {rd-sae}{{.*}}encoding: [0x62,0xf1,0x7c,0x38,0x79,0xc0]
- %res = call <16 x i32> @llvm.x86.avx512.mask.cvtps2udq.512(<16 x float> %a, <16 x i32>zeroinitializer, i16 -1, i32 1)
- ret <16 x i32>%res
- }
- declare <16 x i32> @llvm.x86.avx512.mask.cvtps2udq.512(<16 x float>, <16 x i32>, i16, i32)
-
define i16 @test_cmpps(<16 x float> %a, <16 x float> %b) {
;CHECK: vcmpleps {sae}{{.*}}encoding: [0x62,0xf1,0x7c,0x18,0xc2,0xc1,0x02]
%res = call i16 @llvm.x86.avx512.mask.cmp.ps.512(<16 x float> %a, <16 x float> %b, i32 2, i16 -1, i32 8)
@@ -434,35 +420,6 @@ declare <8 x i64> @llvm.x86.avx512.mask.blend.q.512(<8 x i64>, <8 x i64>, i8) no
}
declare i8 @llvm.x86.avx512.mask.cmp.pd.512(<8 x double> , <8 x double> , i32, i8, i32)
- ; cvt intrinsics
- define <16 x float> @test_cvtdq2ps(<16 x i32> %a) {
- ;CHECK: vcvtdq2ps {rd-sae}{{.*}}encoding: [0x62,0xf1,0x7c,0x38,0x5b,0xc0]
- %res = call <16 x float> @llvm.x86.avx512.mask.cvtdq2ps.512(<16 x i32> %a, <16 x float>zeroinitializer, i16 -1, i32 1)
- ret <16 x float>%res
- }
- declare <16 x float> @llvm.x86.avx512.mask.cvtdq2ps.512(<16 x i32>, <16 x float>, i16, i32)
-
- define <16 x float> @test_cvtudq2ps(<16 x i32> %a) {
- ;CHECK: vcvtudq2ps {rd-sae}{{.*}}encoding: [0x62,0xf1,0x7f,0x38,0x7a,0xc0]
- %res = call <16 x float> @llvm.x86.avx512.mask.cvtudq2ps.512(<16 x i32> %a, <16 x float>zeroinitializer, i16 -1, i32 1)
- ret <16 x float>%res
- }
- declare <16 x float> @llvm.x86.avx512.mask.cvtudq2ps.512(<16 x i32>, <16 x float>, i16, i32)
-
- define <8 x double> @test_cvtdq2pd(<8 x i32> %a) {
- ;CHECK: vcvtdq2pd {{.*}}encoding: [0x62,0xf1,0x7e,0x48,0xe6,0xc0]
- %res = call <8 x double> @llvm.x86.avx512.mask.cvtdq2pd.512(<8 x i32> %a, <8 x double>zeroinitializer, i8 -1)
- ret <8 x double>%res
- }
- declare <8 x double> @llvm.x86.avx512.mask.cvtdq2pd.512(<8 x i32>, <8 x double>, i8)
-
- define <8 x double> @test_cvtudq2pd(<8 x i32> %a) {
- ;CHECK: vcvtudq2pd {{.*}}encoding: [0x62,0xf1,0x7e,0x48,0x7a,0xc0]
- %res = call <8 x double> @llvm.x86.avx512.mask.cvtudq2pd.512(<8 x i32> %a, <8 x double>zeroinitializer, i8 -1)
- ret <8 x double>%res
- }
- declare <8 x double> @llvm.x86.avx512.mask.cvtudq2pd.512(<8 x i32>, <8 x double>, i8)
-
; fp min - max
define <8 x double> @test_vmaxpd(<8 x double> %a0, <8 x double> %a1) {
; CHECK: vmaxpd
@@ -482,13 +439,6 @@ define <8 x double> @test_vminpd(<8 x double> %a0, <8 x double> %a1) {
declare <8 x double> @llvm.x86.avx512.mask.min.pd.512(<8 x double>, <8 x double>,
<8 x double>, i8, i32)
- define <8 x float> @test_cvtpd2ps(<8 x double> %a) {
- ;CHECK: vcvtpd2ps {rd-sae}{{.*}}encoding: [0x62,0xf1,0xfd,0x38,0x5a,0xc0]
- %res = call <8 x float> @llvm.x86.avx512.mask.cvtpd2ps.512(<8 x double> %a, <8 x float>zeroinitializer, i8 -1, i32 1)
- ret <8 x float>%res
- }
- declare <8 x float> @llvm.x86.avx512.mask.cvtpd2ps.512(<8 x double>, <8 x float>, i8, i32)
-
declare <16 x i32> @llvm.x86.avx512.mask.pabs.d.512(<16 x i32>, <16 x i32>, i16)
; CHECK-LABEL: @test_int_x86_avx512_mask_pabs_d_512
diff --git a/test/CodeGen/X86/avx512bw-intrinsics.ll b/test/CodeGen/X86/avx512bw-intrinsics.ll
index 9574c016ad50..71bf63ed44d0 100644
--- a/test/CodeGen/X86/avx512bw-intrinsics.ll
+++ b/test/CodeGen/X86/avx512bw-intrinsics.ll
@@ -997,3 +997,44 @@ define <64 x i8>@test_int_x86_avx512_mask_pabs_b_512(<64 x i8> %x0, <64 x i8> %x
ret <64 x i8> %res2
}
+declare <32 x i16> @llvm.x86.avx512.mask.pmulhu.w.512(<32 x i16>, <32 x i16>, <32 x i16>, i32)
+
+; CHECK-LABEL: @test_int_x86_avx512_mask_pmulhu_w_512
+; CHECK-NOT: call
+; CHECK: kmov
+; CHECK: {%k1}
+; CHECK: vpmulhuw {{.*}}encoding: [0x62
+define <32 x i16>@test_int_x86_avx512_mask_pmulhu_w_512(<32 x i16> %x0, <32 x i16> %x1, <32 x i16> %x2, i32 %x3) {
+ %res = call <32 x i16> @llvm.x86.avx512.mask.pmulhu.w.512(<32 x i16> %x0, <32 x i16> %x1, <32 x i16> %x2, i32 %x3)
+ %res1 = call <32 x i16> @llvm.x86.avx512.mask.pmulhu.w.512(<32 x i16> %x0, <32 x i16> %x1, <32 x i16> %x2, i32 -1)
+ %res2 = add <32 x i16> %res, %res1
+ ret <32 x i16> %res2
+}
+
+declare <32 x i16> @llvm.x86.avx512.mask.pmulh.w.512(<32 x i16>, <32 x i16>, <32 x i16>, i32)
+
+; CHECK-LABEL: @test_int_x86_avx512_mask_pmulh_w_512
+; CHECK-NOT: call
+; CHECK: kmov
+; CHECK: {%k1}
+; CHECK: vpmulhw {{.*}}encoding: [0x62
+define <32 x i16>@test_int_x86_avx512_mask_pmulh_w_512(<32 x i16> %x0, <32 x i16> %x1, <32 x i16> %x2, i32 %x3) {
+ %res = call <32 x i16> @llvm.x86.avx512.mask.pmulh.w.512(<32 x i16> %x0, <32 x i16> %x1, <32 x i16> %x2, i32 %x3)
+ %res1 = call <32 x i16> @llvm.x86.avx512.mask.pmulh.w.512(<32 x i16> %x0, <32 x i16> %x1, <32 x i16> %x2, i32 -1)
+ %res2 = add <32 x i16> %res, %res1
+ ret <32 x i16> %res2
+}
+
+declare <32 x i16> @llvm.x86.avx512.mask.pmul.hr.sw.512(<32 x i16>, <32 x i16>, <32 x i16>, i32)
+
+; CHECK-LABEL: @test_int_x86_avx512_mask_pmulhr_sw_512
+; CHECK-NOT: call
+; CHECK: kmov
+; CHECK: {%k1}
+; CHECK: vpmulhrsw {{.*}}encoding: [0x62
+define <32 x i16>@test_int_x86_avx512_mask_pmulhr_sw_512(<32 x i16> %x0, <32 x i16> %x1, <32 x i16> %x2, i32 %x3) {
+ %res = call <32 x i16> @llvm.x86.avx512.mask.pmul.hr.sw.512(<32 x i16> %x0, <32 x i16> %x1, <32 x i16> %x2, i32 %x3)
+ %res1 = call <32 x i16> @llvm.x86.avx512.mask.pmul.hr.sw.512(<32 x i16> %x0, <32 x i16> %x1, <32 x i16> %x2, i32 -1)
+ %res2 = add <32 x i16> %res, %res1
+ ret <32 x i16> %res2
+}
diff --git a/test/CodeGen/X86/avx512bwvl-intrinsics.ll b/test/CodeGen/X86/avx512bwvl-intrinsics.ll
index 0119d3945f4e..f5413896789a 100644
--- a/test/CodeGen/X86/avx512bwvl-intrinsics.ll
+++ b/test/CodeGen/X86/avx512bwvl-intrinsics.ll
@@ -3763,3 +3763,83 @@ define <16 x i16>@test_int_x86_avx512_mask_pabs_w_256(<16 x i16> %x0, <16 x i16>
ret <16 x i16> %res2
}
+declare <8 x i16> @llvm.x86.avx512.mask.pmulhu.w.128(<8 x i16>, <8 x i16>, <8 x i16>, i8)
+
+; CHECK-LABEL: @test_int_x86_avx512_mask_pmulhu_w_128
+; CHECK-NOT: call
+; CHECK: kmov
+; CHECK: {%k1}
+; CHECK: vpmulhuw {{.*}}encoding: [0x62
+define <8 x i16>@test_int_x86_avx512_mask_pmulhu_w_128(<8 x i16> %x0, <8 x i16> %x1, <8 x i16> %x2, i8 %x3) {
+ %res = call <8 x i16> @llvm.x86.avx512.mask.pmulhu.w.128(<8 x i16> %x0, <8 x i16> %x1, <8 x i16> %x2, i8 %x3)
+ %res1 = call <8 x i16> @llvm.x86.avx512.mask.pmulhu.w.128(<8 x i16> %x0, <8 x i16> %x1, <8 x i16> %x2, i8 -1)
+ %res2 = add <8 x i16> %res, %res1
+ ret <8 x i16> %res2
+}
+
+declare <16 x i16> @llvm.x86.avx512.mask.pmulhu.w.256(<16 x i16>, <16 x i16>, <16 x i16>, i16)
+
+; CHECK-LABEL: @test_int_x86_avx512_mask_pmulhu_w_256
+; CHECK-NOT: call
+; CHECK: kmov
+; CHECK: {%k1}
+; CHECK: vpmulhuw {{.*}}encoding: [0x62
+define <16 x i16>@test_int_x86_avx512_mask_pmulhu_w_256(<16 x i16> %x0, <16 x i16> %x1, <16 x i16> %x2, i16 %x3) {
+ %res = call <16 x i16> @llvm.x86.avx512.mask.pmulhu.w.256(<16 x i16> %x0, <16 x i16> %x1, <16 x i16> %x2, i16 %x3)
+ %res1 = call <16 x i16> @llvm.x86.avx512.mask.pmulhu.w.256(<16 x i16> %x0, <16 x i16> %x1, <16 x i16> %x2, i16 -1)
+ %res2 = add <16 x i16> %res, %res1
+ ret <16 x i16> %res2
+}
+
+declare <8 x i16> @llvm.x86.avx512.mask.pmulh.w.128(<8 x i16>, <8 x i16>, <8 x i16>, i8)
+
+; CHECK-LABEL: @test_int_x86_avx512_mask_pmulh_w_128
+; CHECK-NOT: call
+; CHECK: kmov
+; CHECK: {%k1}
+; CHECK: vpmulhw {{.*}}encoding: [0x62
+define <8 x i16>@test_int_x86_avx512_mask_pmulh_w_128(<8 x i16> %x0, <8 x i16> %x1, <8 x i16> %x2, i8 %x3) {
+ %res = call <8 x i16> @llvm.x86.avx512.mask.pmulh.w.128(<8 x i16> %x0, <8 x i16> %x1, <8 x i16> %x2, i8 %x3)
+ %res1 = call <8 x i16> @llvm.x86.avx512.mask.pmulh.w.128(<8 x i16> %x0, <8 x i16> %x1, <8 x i16> %x2, i8 -1)
+ %res2 = add <8 x i16> %res, %res1
+ ret <8 x i16> %res2
+}
+
+declare <16 x i16> @llvm.x86.avx512.mask.pmulh.w.256(<16 x i16>, <16 x i16>, <16 x i16>, i16)
+; CHECK-LABEL: @test_int_x86_avx512_mask_pmulh_w_256
+; CHECK-NOT: call
+; CHECK: kmov
+; CHECK: {%k1}
+; CHECK: vpmulhw {{.*}}encoding: [0x62
+define <16 x i16>@test_int_x86_avx512_mask_pmulh_w_256(<16 x i16> %x0, <16 x i16> %x1, <16 x i16> %x2, i16 %x3) {
+ %res = call <16 x i16> @llvm.x86.avx512.mask.pmulh.w.256(<16 x i16> %x0, <16 x i16> %x1, <16 x i16> %x2, i16 %x3)
+ %res1 = call <16 x i16> @llvm.x86.avx512.mask.pmulh.w.256(<16 x i16> %x0, <16 x i16> %x1, <16 x i16> %x2, i16 -1)
+ %res2 = add <16 x i16> %res, %res1
+ ret <16 x i16> %res2
+}
+
+declare <8 x i16> @llvm.x86.avx512.mask.pmul.hr.sw.128(<8 x i16>, <8 x i16>, <8 x i16>, i8)
+; CHECK-LABEL: @test_int_x86_avx512_mask_pmulhr_sw_128
+; CHECK-NOT: call
+; CHECK: kmov
+; CHECK: {%k1}
+; CHECK: vpmulhrsw {{.*}}encoding: [0x62
+define <8 x i16>@test_int_x86_avx512_mask_pmulhr_sw_128(<8 x i16> %x0, <8 x i16> %x1, <8 x i16> %x2, i8 %x3) {
+ %res = call <8 x i16> @llvm.x86.avx512.mask.pmul.hr.sw.128(<8 x i16> %x0, <8 x i16> %x1, <8 x i16> %x2, i8 %x3)
+ %res1 = call <8 x i16> @llvm.x86.avx512.mask.pmul.hr.sw.128(<8 x i16> %x0, <8 x i16> %x1, <8 x i16> %x2, i8 -1)
+ %res2 = add <8 x i16> %res, %res1
+ ret <8 x i16> %res2
+}
+
+declare <16 x i16> @llvm.x86.avx512.mask.pmul.hr.sw.256(<16 x i16>, <16 x i16>, <16 x i16>, i16)
+; CHECK-LABEL: @test_int_x86_avx512_mask_pmulhr_sw_256
+; CHECK-NOT: call
+; CHECK: kmov
+; CHECK: {%k1}
+; CHECK: vpmulhrsw {{.*}}encoding: [0x62
+define <16 x i16>@test_int_x86_avx512_mask_pmulhr_sw_256(<16 x i16> %x0, <16 x i16> %x1, <16 x i16> %x2, i16 %x3) {
+ %res = call <16 x i16> @llvm.x86.avx512.mask.pmul.hr.sw.256(<16 x i16> %x0, <16 x i16> %x1, <16 x i16> %x2, i16 %x3)
+ %res1 = call <16 x i16> @llvm.x86.avx512.mask.pmul.hr.sw.256(<16 x i16> %x0, <16 x i16> %x1, <16 x i16> %x2, i16 -1)
+ %res2 = add <16 x i16> %res, %res1
+ ret <16 x i16> %res2
+}
diff --git a/test/CodeGen/X86/cppeh-nounwind.ll b/test/CodeGen/X86/cppeh-nounwind.ll
new file mode 100644
index 000000000000..d9bc001a92df
--- /dev/null
+++ b/test/CodeGen/X86/cppeh-nounwind.ll
@@ -0,0 +1,35 @@
+; RUN: llc -mtriple=i686-pc-windows-msvc < %s | FileCheck %s
+
+; Sometimes invokes of nounwind functions make it through to CodeGen, especially
+; at -O0, where Clang sometimes optimistically annotates functions as nounwind.
+; WinEHPrepare ends up outlining functions, and emitting references to LSDA
+; labels. Make sure we emit the LSDA in that case.
+
+declare i32 @__CxxFrameHandler3(...)
+declare void @nounwind_func() nounwind
+declare void @cleanup()
+
+define void @should_emit_tables() personality i32 (...)* @__CxxFrameHandler3 {
+entry:
+ invoke void @nounwind_func()
+ to label %done unwind label %lpad
+
+done:
+ ret void
+
+lpad:
+ %vals = landingpad { i8*, i32 }
+ cleanup
+ call void @cleanup()
+ resume { i8*, i32 } %vals
+}
+
+; CHECK: _should_emit_tables:
+; CHECK: calll _nounwind_func
+; CHECK: retl
+
+; CHECK: L__ehtable$should_emit_tables:
+
+; CHECK: ___ehhandler$should_emit_tables:
+; CHECK: movl $L__ehtable$should_emit_tables, %eax
+; CHECK: jmp ___CxxFrameHandler3 # TAILCALL
diff --git a/test/CodeGen/X86/eh-nolandingpads.ll b/test/CodeGen/X86/eh-nolandingpads.ll
new file mode 100644
index 000000000000..962952266214
--- /dev/null
+++ b/test/CodeGen/X86/eh-nolandingpads.ll
@@ -0,0 +1,12 @@
+; RUN: llc < %s -mtriple=x86_64-pc-linux | FileCheck %s
+; Test that we emit functions with explicitly specified personality,
+; even if no landing pads are left.
+
+declare i32 @__my_personality_v0(...)
+declare void @might_throw()
+
+define i32 @foo() personality i32 (...)* @__my_personality_v0 {
+; CHECK: .cfi_personality 3, __my_personality_v0
+ call void @might_throw()
+ ret i32 0
+}
diff --git a/test/CodeGen/X86/fdiv-combine.ll b/test/CodeGen/X86/fdiv-combine.ll
index 279bb0624ace..34eac62e3673 100644
--- a/test/CodeGen/X86/fdiv-combine.ll
+++ b/test/CodeGen/X86/fdiv-combine.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown | FileCheck %s
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=x86-64 | FileCheck %s
; Anything more than one division using a single divisor operand
; should be converted into a reciprocal and multiplication.
@@ -17,9 +17,9 @@ define float @div2_arcp(float %x, float %y, float %z) #0 {
; CHECK: # BB#0:
; CHECK-NEXT: movss {{.*#+}} xmm3 = mem[0],zero,zero,zero
; CHECK-NEXT: divss %xmm2, %xmm3
-; CHECK-NEXT: mulss %xmm3, %xmm0
; CHECK-NEXT: mulss %xmm1, %xmm0
; CHECK-NEXT: mulss %xmm3, %xmm0
+; CHECK-NEXT: mulss %xmm3, %xmm0
; CHECK-NEXT: retq
%div1 = fdiv arcp float %x, %z
%mul = fmul arcp float %div1, %y
@@ -27,5 +27,22 @@ define float @div2_arcp(float %x, float %y, float %z) #0 {
ret float %div2
}
+; If the reciprocal is already calculated, we should not
+; generate an extra multiplication by 1.0.
+
+define double @div3_arcp(double %x, double %y, double %z) #0 {
+; CHECK-LABEL: div3_arcp:
+; CHECK: # BB#0:
+; CHECK-NEXT: movsd{{.*#+}} xmm2 = mem[0],zero
+; CHECK-NEXT: divsd %xmm1, %xmm2
+; CHECK-NEXT: mulsd %xmm2, %xmm0
+; CHECK-NEXT: addsd %xmm2, %xmm0
+; CHECK-NEXT: retq
+ %div1 = fdiv fast double 1.0, %y
+ %div2 = fdiv fast double %x, %y
+ %ret = fadd fast double %div2, %div1
+ ret double %ret
+}
+
; FIXME: If the backend understands 'arcp', then this attribute is unnecessary.
attributes #0 = { "unsafe-fp-math"="true" }
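For context on the transform exercised above: both 'arcp' divides in div2_arcp use %z as the divisor, so the expected output computes a single reciprocal and replaces each divide with a multiply. A minimal IR sketch of that intended shape (illustrative function name, not part of this patch):

define float @div2_arcp_sketch(float %x, float %y, float %z) {
  ; one real division: the reciprocal of the shared divisor
  %rcp = fdiv arcp float 1.0, %z
  ; each original fdiv by %z becomes an fmul by %rcp
  %t0 = fmul arcp float %x, %rcp
  %mul = fmul arcp float %t0, %y
  %res = fmul arcp float %mul, %rcp
  ret float %res
}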
diff --git a/test/CodeGen/X86/frameescape.ll b/test/CodeGen/X86/frameescape.ll
index 00bc55d24878..179a936304ba 100644
--- a/test/CodeGen/X86/frameescape.ll
+++ b/test/CodeGen/X86/frameescape.ll
@@ -1,19 +1,19 @@
; RUN: llc -mtriple=i686-windows-msvc < %s | FileCheck %s --check-prefix=X86
; RUN: llc -mtriple=x86_64-windows-msvc < %s | FileCheck %s --check-prefix=X64
-declare void @llvm.frameescape(...)
+declare void @llvm.localescape(...)
declare i8* @llvm.frameaddress(i32)
-declare i8* @llvm.framerecover(i8*, i8*, i32)
+declare i8* @llvm.localrecover(i8*, i8*, i32)
declare i32 @printf(i8*, ...)
@str = internal constant [10 x i8] c"asdf: %d\0A\00"
define void @print_framealloc_from_fp(i8* %fp) {
- %a.i8 = call i8* @llvm.framerecover(i8* bitcast (void()* @alloc_func to i8*), i8* %fp, i32 0)
+ %a.i8 = call i8* @llvm.localrecover(i8* bitcast (void()* @alloc_func to i8*), i8* %fp, i32 0)
%a = bitcast i8* %a.i8 to i32*
%a.val = load i32, i32* %a
call i32 (i8*, ...) @printf(i8* getelementptr ([10 x i8], [10 x i8]* @str, i32 0, i32 0), i32 %a.val)
- %b.i8 = call i8* @llvm.framerecover(i8* bitcast (void()* @alloc_func to i8*), i8* %fp, i32 1)
+ %b.i8 = call i8* @llvm.localrecover(i8* bitcast (void()* @alloc_func to i8*), i8* %fp, i32 1)
%b = bitcast i8* %b.i8 to i32*
%b.val = load i32, i32* %b
call i32 (i8*, ...) @printf(i8* getelementptr ([10 x i8], [10 x i8]* @str, i32 0, i32 0), i32 %b.val)
@@ -61,7 +61,7 @@ define void @print_framealloc_from_fp(i8* %fp) {
define void @alloc_func() {
%a = alloca i32
%b = alloca i32, i32 2
- call void (...) @llvm.frameescape(i32* %a, i32* %b)
+ call void (...) @llvm.localescape(i32* %a, i32* %b)
store i32 42, i32* %a
store i32 13, i32* %b
%fp = call i8* @llvm.frameaddress(i32 0)
@@ -105,7 +105,7 @@ define i32 @main() {
define void @alloc_func_no_frameaddr() {
%a = alloca i32
%b = alloca i32
- call void (...) @llvm.frameescape(i32* %a, i32* %b)
+ call void (...) @llvm.localescape(i32* %a, i32* %b)
store i32 42, i32* %a
store i32 13, i32* %b
call void @print_framealloc_from_fp(i8* null)
diff --git a/test/CodeGen/X86/frameregister.ll b/test/CodeGen/X86/frameregister.ll
new file mode 100644
index 000000000000..826bb9d78c9d
--- /dev/null
+++ b/test/CodeGen/X86/frameregister.ll
@@ -0,0 +1,30 @@
+; RUN: llc < %s -mtriple=x86_64-apple-darwin | FileCheck %s
+; RUN: llc < %s -mtriple=x86_64-linux-gnueabi | FileCheck %s
+; RUN: opt < %s -O3 -S -mtriple=x86_64-linux-gnueabi | FileCheck %s --check-prefix=OPT
+
+define i64 @get_frame() #0 {
+entry:
+; CHECK-LABEL: get_frame:
+; CHECK: movq %rbp, %rax
+ %sp = call i64 @llvm.read_register.i64(metadata !0)
+; OPT: @llvm.read_register.i64
+ ret i64 %sp
+}
+
+define void @set_frame(i64 %val) #0 {
+entry:
+; CHECK-LABEL: set_frame:
+; CHECK: movq %rdi, %rbp
+ call void @llvm.write_register.i64(metadata !0, i64 %val)
+; OPT: @llvm.write_register.i64
+ ret void
+}
+
+declare i64 @llvm.read_register.i64(metadata) nounwind
+declare void @llvm.write_register.i64(metadata, i64) nounwind
+
+; register unsigned long current_stack_pointer asm("rbp");
+; CHECK-NOT: .asciz "rbp"
+!0 = !{!"rbp\00"}
+
+attributes #0 = { nounwind "no-frame-pointer-elim"="true" }
diff --git a/test/CodeGen/X86/implicit-null-check-negative.ll b/test/CodeGen/X86/implicit-null-check-negative.ll
index 8fbed9f7bee8..c8d425c3889f 100644
--- a/test/CodeGen/X86/implicit-null-check-negative.ll
+++ b/test/CodeGen/X86/implicit-null-check-negative.ll
@@ -51,4 +51,46 @@ define i32 @imp_null_check_load_no_md(i32* %x) {
ret i32 %t
}
+define i32 @imp_null_check_no_hoist_over_acquire_load(i32* %x, i32* %y) {
+; We cannot hoist %t1 over %t0 since %t0 is an acquire load
+ entry:
+ %c = icmp eq i32* %x, null
+ br i1 %c, label %is_null, label %not_null, !make.implicit !0
+
+ is_null:
+ ret i32 42
+
+ not_null:
+ %t0 = load atomic i32, i32* %y acquire, align 4
+ %t1 = load i32, i32* %x
+ %p = add i32 %t0, %t1
+ ret i32 %p
+}
+
+define i32 @imp_null_check_add_result(i32* %x, i32* %y) {
+; This will codegen to:
+;
+; movl (%rsi), %eax
+; addl (%rdi), %eax
+;
+; The load instruction we wish to hoist is the addl, but there is a
+; write-after-write hazard preventing that from happening. We could
+; get fancy here and exploit the commutativity of addition, but right
+; now -implicit-null-checks isn't that smart.
+;
+
+ entry:
+ %c = icmp eq i32* %x, null
+ br i1 %c, label %is_null, label %not_null, !make.implicit !0
+
+ is_null:
+ ret i32 42
+
+ not_null:
+ %t0 = load i32, i32* %y
+ %t1 = load i32, i32* %x
+ %p = add i32 %t0, %t1
+ ret i32 %p
+}
+
!0 = !{}
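For contrast with the negative cases above, the pattern -implicit-null-checks does handle is a null check marked !make.implicit whose guarded load can itself become the faulting instruction, with the recovery path recorded in the FaultMaps section. A minimal sketch, with an illustrative function name that is not part of this patch:

define i32 @imp_null_check_simple_sketch(i32* %x) {
 entry:
  %c = icmp eq i32* %x, null
  br i1 %c, label %is_null, label %not_null, !make.implicit !1

 is_null:
  ret i32 42

 not_null:
  ; nothing separates the branch from this load, so it can be speculated
  ; ahead of the check and used as the faulting access
  %t = load i32, i32* %x
  ret i32 %t
}

!1 = !{}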
diff --git a/test/CodeGen/X86/implicit-null-check.ll b/test/CodeGen/X86/implicit-null-check.ll
index 1d1b36bbd5d0..fd7a902eefc1 100644
--- a/test/CodeGen/X86/implicit-null-check.ll
+++ b/test/CodeGen/X86/implicit-null-check.ll
@@ -76,6 +76,31 @@ define i32 @imp_null_check_add_result(i32* %x, i32 %p) {
ret i32 %p1
}
+define i32 @imp_null_check_hoist_over_unrelated_load(i32* %x, i32* %y, i32* %z) {
+; CHECK-LABEL: _imp_null_check_hoist_over_unrelated_load:
+; CHECK: Ltmp7:
+; CHECK: movl (%rdi), %eax
+; CHECK: movl (%rsi), %ecx
+; CHECK: movl %ecx, (%rdx)
+; CHECK: retq
+; CHECK: Ltmp6:
+; CHECK: movl $42, %eax
+; CHECK: retq
+
+ entry:
+ %c = icmp eq i32* %x, null
+ br i1 %c, label %is_null, label %not_null, !make.implicit !0
+
+ is_null:
+ ret i32 42
+
+ not_null:
+ %t0 = load i32, i32* %y
+ %t1 = load i32, i32* %x
+ store i32 %t0, i32* %z
+ ret i32 %t1
+}
+
!0 = !{}
; CHECK-LABEL: __LLVM_FaultMaps:
@@ -88,7 +113,7 @@ define i32 @imp_null_check_add_result(i32* %x, i32 %p) {
; CHECK-NEXT: .short 0
; # functions:
-; CHECK-NEXT: .long 3
+; CHECK-NEXT: .long 4
; FunctionAddr:
; CHECK-NEXT: .quad _imp_null_check_add_result
@@ -117,6 +142,19 @@ define i32 @imp_null_check_add_result(i32* %x, i32 %p) {
; CHECK-NEXT: .long Ltmp2-_imp_null_check_gep_load
; FunctionAddr:
+; CHECK-NEXT: .quad _imp_null_check_hoist_over_unrelated_load
+; NumFaultingPCs
+; CHECK-NEXT: .long 1
+; Reserved:
+; CHECK-NEXT: .long 0
+; Fault[0].Type:
+; CHECK-NEXT: .long 1
+; Fault[0].FaultOffset:
+; CHECK-NEXT: .long Ltmp7-_imp_null_check_hoist_over_unrelated_load
+; Fault[0].HandlerOffset:
+; CHECK-NEXT: .long Ltmp6-_imp_null_check_hoist_over_unrelated_load
+
+; FunctionAddr:
; CHECK-NEXT: .quad _imp_null_check_load
; NumFaultingPCs
; CHECK-NEXT: .long 1
@@ -131,10 +169,12 @@ define i32 @imp_null_check_add_result(i32* %x, i32 %p) {
; OBJDUMP: FaultMap table:
; OBJDUMP-NEXT: Version: 0x1
-; OBJDUMP-NEXT: NumFunctions: 3
+; OBJDUMP-NEXT: NumFunctions: 4
; OBJDUMP-NEXT: FunctionAddress: 0x000000, NumFaultingPCs: 1
; OBJDUMP-NEXT: Fault kind: FaultingLoad, faulting PC offset: 0, handling PC offset: 5
; OBJDUMP-NEXT: FunctionAddress: 0x000000, NumFaultingPCs: 1
; OBJDUMP-NEXT: Fault kind: FaultingLoad, faulting PC offset: 0, handling PC offset: 7
; OBJDUMP-NEXT: FunctionAddress: 0x000000, NumFaultingPCs: 1
+; OBJDUMP-NEXT: Fault kind: FaultingLoad, faulting PC offset: 0, handling PC offset: 7
+; OBJDUMP-NEXT: FunctionAddress: 0x000000, NumFaultingPCs: 1
; OBJDUMP-NEXT: Fault kind: FaultingLoad, faulting PC offset: 0, handling PC offset: 3
diff --git a/test/CodeGen/X86/inline-asm-bad-constraint-n.ll b/test/CodeGen/X86/inline-asm-bad-constraint-n.ll
new file mode 100644
index 000000000000..91b1ffed4e0f
--- /dev/null
+++ b/test/CodeGen/X86/inline-asm-bad-constraint-n.ll
@@ -0,0 +1,10 @@
+; RUN: not llc -march=x86 -no-integrated-as < %s 2>&1 | FileCheck %s
+
+@x = global i32 0, align 4
+
+;CHECK: error: invalid operand for inline asm constraint 'n'
+define void @foo() {
+ %a = getelementptr i32, i32* @x, i32 1
+ call void asm sideeffect "foo $0", "n"(i32* %a) nounwind
+ ret void
+}
diff --git a/test/CodeGen/X86/legalize-shl-vec.ll b/test/CodeGen/X86/legalize-shl-vec.ll
new file mode 100644
index 000000000000..7ec2a663513f
--- /dev/null
+++ b/test/CodeGen/X86/legalize-shl-vec.ll
@@ -0,0 +1,44 @@
+; RUN: llc < %s -march=x86-64 | FileCheck %s
+
+define <2 x i256> @test_shl(<2 x i256> %In) {
+ %Amt = insertelement <2 x i256> undef, i256 -1, i32 0
+ %Out = shl <2 x i256> %In, %Amt
+ ret <2 x i256> %Out
+
+ ; CHECK-LABEL: test_shl
+ ; CHECK: movq $0
+ ; CHECK-NEXT: movq $0
+ ; CHECK-NEXT: movq $0
+ ; CHECK-NEXT: movq $0
+ ; CHECK-NEXT: movq $0
+ ; CHECK-NEXT: movq $0
+ ; CHECK-NEXT: movq $0
+ ; CHECK-NEXT: movq $0
+ ; CHECK: retq
+}
+
+define <2 x i256> @test_srl(<2 x i256> %In) {
+ %Amt = insertelement <2 x i256> undef, i256 -1, i32 0
+ %Out = lshr <2 x i256> %In, %Amt
+ ret <2 x i256> %Out
+
+ ; CHECK-LABEL: test_srl
+ ; CHECK: movq $0
+ ; CHECK-NEXT: movq $0
+ ; CHECK-NEXT: movq $0
+ ; CHECK-NEXT: movq $0
+ ; CHECK-NEXT: movq $0
+ ; CHECK-NEXT: movq $0
+ ; CHECK-NEXT: movq $0
+ ; CHECK-NEXT: movq $0
+ ; CHECK: retq
+}
+
+define <2 x i256> @test_sra(<2 x i256> %In) {
+ %Amt = insertelement <2 x i256> undef, i256 -1, i32 0
+ %Out = ashr <2 x i256> %In, %Amt
+ ret <2 x i256> %Out
+
+ ; CHECK-LABEL: test_sra
+ ; CHECK: sarq $63
+}
diff --git a/test/CodeGen/X86/machine-combiner.ll b/test/CodeGen/X86/machine-combiner.ll
index d4cd59ffac3a..0943bebbb099 100644
--- a/test/CodeGen/X86/machine-combiner.ll
+++ b/test/CodeGen/X86/machine-combiner.ll
@@ -1,15 +1,23 @@
-; RUN: llc -mtriple=x86_64-unknown-unknown -mcpu=x86-64 -mattr=avx -enable-unsafe-fp-math < %s | FileCheck %s
+; RUN: llc -mtriple=x86_64-unknown-unknown -mcpu=x86-64 -mattr=sse -enable-unsafe-fp-math < %s | FileCheck %s --check-prefix=SSE
+; RUN: llc -mtriple=x86_64-unknown-unknown -mcpu=x86-64 -mattr=avx -enable-unsafe-fp-math < %s | FileCheck %s --check-prefix=AVX
; Verify that the first two adds are independent regardless of how the inputs are
; commuted. The destination registers are used as source registers for the third add.
define float @reassociate_adds1(float %x0, float %x1, float %x2, float %x3) {
-; CHECK-LABEL: reassociate_adds1:
-; CHECK: # BB#0:
-; CHECK-NEXT: vaddss %xmm1, %xmm0, %xmm0
-; CHECK-NEXT: vaddss %xmm3, %xmm2, %xmm1
-; CHECK-NEXT: vaddss %xmm1, %xmm0, %xmm0
-; CHECK-NEXT: retq
+; SSE-LABEL: reassociate_adds1:
+; SSE: # BB#0:
+; SSE-NEXT: addss %xmm1, %xmm0
+; SSE-NEXT: addss %xmm3, %xmm2
+; SSE-NEXT: addss %xmm2, %xmm0
+; SSE-NEXT: retq
+;
+; AVX-LABEL: reassociate_adds1:
+; AVX: # BB#0:
+; AVX-NEXT: vaddss %xmm1, %xmm0, %xmm0
+; AVX-NEXT: vaddss %xmm3, %xmm2, %xmm1
+; AVX-NEXT: vaddss %xmm1, %xmm0, %xmm0
+; AVX-NEXT: retq
%t0 = fadd float %x0, %x1
%t1 = fadd float %t0, %x2
%t2 = fadd float %t1, %x3
@@ -17,12 +25,19 @@ define float @reassociate_adds1(float %x0, float %x1, float %x2, float %x3) {
}
define float @reassociate_adds2(float %x0, float %x1, float %x2, float %x3) {
-; CHECK-LABEL: reassociate_adds2:
-; CHECK: # BB#0:
-; CHECK-NEXT: vaddss %xmm1, %xmm0, %xmm0
-; CHECK-NEXT: vaddss %xmm3, %xmm2, %xmm1
-; CHECK-NEXT: vaddss %xmm1, %xmm0, %xmm0
-; CHECK-NEXT: retq
+; SSE-LABEL: reassociate_adds2:
+; SSE: # BB#0:
+; SSE-NEXT: addss %xmm1, %xmm0
+; SSE-NEXT: addss %xmm3, %xmm2
+; SSE-NEXT: addss %xmm2, %xmm0
+; SSE-NEXT: retq
+;
+; AVX-LABEL: reassociate_adds2:
+; AVX: # BB#0:
+; AVX-NEXT: vaddss %xmm1, %xmm0, %xmm0
+; AVX-NEXT: vaddss %xmm3, %xmm2, %xmm1
+; AVX-NEXT: vaddss %xmm1, %xmm0, %xmm0
+; AVX-NEXT: retq
%t0 = fadd float %x0, %x1
%t1 = fadd float %x2, %t0
%t2 = fadd float %t1, %x3
@@ -30,12 +45,19 @@ define float @reassociate_adds2(float %x0, float %x1, float %x2, float %x3) {
}
define float @reassociate_adds3(float %x0, float %x1, float %x2, float %x3) {
-; CHECK-LABEL: reassociate_adds3:
-; CHECK: # BB#0:
-; CHECK-NEXT: vaddss %xmm1, %xmm0, %xmm0
-; CHECK-NEXT: vaddss %xmm3, %xmm2, %xmm1
-; CHECK-NEXT: vaddss %xmm1, %xmm0, %xmm0
-; CHECK-NEXT: retq
+; SSE-LABEL: reassociate_adds3:
+; SSE: # BB#0:
+; SSE-NEXT: addss %xmm1, %xmm0
+; SSE-NEXT: addss %xmm3, %xmm2
+; SSE-NEXT: addss %xmm2, %xmm0
+; SSE-NEXT: retq
+;
+; AVX-LABEL: reassociate_adds3:
+; AVX: # BB#0:
+; AVX-NEXT: vaddss %xmm1, %xmm0, %xmm0
+; AVX-NEXT: vaddss %xmm3, %xmm2, %xmm1
+; AVX-NEXT: vaddss %xmm1, %xmm0, %xmm0
+; AVX-NEXT: retq
%t0 = fadd float %x0, %x1
%t1 = fadd float %t0, %x2
%t2 = fadd float %x3, %t1
@@ -43,12 +65,19 @@ define float @reassociate_adds3(float %x0, float %x1, float %x2, float %x3) {
}
define float @reassociate_adds4(float %x0, float %x1, float %x2, float %x3) {
-; CHECK-LABEL: reassociate_adds4:
-; CHECK: # BB#0:
-; CHECK-NEXT: vaddss %xmm1, %xmm0, %xmm0
-; CHECK-NEXT: vaddss %xmm3, %xmm2, %xmm1
-; CHECK-NEXT: vaddss %xmm1, %xmm0, %xmm0
-; CHECK-NEXT: retq
+; SSE-LABEL: reassociate_adds4:
+; SSE: # BB#0:
+; SSE-NEXT: addss %xmm1, %xmm0
+; SSE-NEXT: addss %xmm3, %xmm2
+; SSE-NEXT: addss %xmm2, %xmm0
+; SSE-NEXT: retq
+;
+; AVX-LABEL: reassociate_adds4:
+; AVX: # BB#0:
+; AVX-NEXT: vaddss %xmm1, %xmm0, %xmm0
+; AVX-NEXT: vaddss %xmm3, %xmm2, %xmm1
+; AVX-NEXT: vaddss %xmm1, %xmm0, %xmm0
+; AVX-NEXT: retq
%t0 = fadd float %x0, %x1
%t1 = fadd float %x2, %t0
%t2 = fadd float %x3, %t1
@@ -59,16 +88,27 @@ define float @reassociate_adds4(float %x0, float %x1, float %x2, float %x3) {
; produced because that would cost more compile time.
define float @reassociate_adds5(float %x0, float %x1, float %x2, float %x3, float %x4, float %x5, float %x6, float %x7) {
-; CHECK-LABEL: reassociate_adds5:
-; CHECK: # BB#0:
-; CHECK-NEXT: vaddss %xmm1, %xmm0, %xmm0
-; CHECK-NEXT: vaddss %xmm3, %xmm2, %xmm1
-; CHECK-NEXT: vaddss %xmm1, %xmm0, %xmm0
-; CHECK-NEXT: vaddss %xmm5, %xmm4, %xmm1
-; CHECK-NEXT: vaddss %xmm6, %xmm1, %xmm1
-; CHECK-NEXT: vaddss %xmm1, %xmm0, %xmm0
-; CHECK-NEXT: vaddss %xmm7, %xmm0, %xmm0
-; CHECK-NEXT: retq
+; SSE-LABEL: reassociate_adds5:
+; SSE: # BB#0:
+; SSE-NEXT: addss %xmm1, %xmm0
+; SSE-NEXT: addss %xmm3, %xmm2
+; SSE-NEXT: addss %xmm2, %xmm0
+; SSE-NEXT: addss %xmm5, %xmm4
+; SSE-NEXT: addss %xmm6, %xmm4
+; SSE-NEXT: addss %xmm4, %xmm0
+; SSE-NEXT: addss %xmm7, %xmm0
+; SSE-NEXT: retq
+;
+; AVX-LABEL: reassociate_adds5:
+; AVX: # BB#0:
+; AVX-NEXT: vaddss %xmm1, %xmm0, %xmm0
+; AVX-NEXT: vaddss %xmm3, %xmm2, %xmm1
+; AVX-NEXT: vaddss %xmm1, %xmm0, %xmm0
+; AVX-NEXT: vaddss %xmm5, %xmm4, %xmm1
+; AVX-NEXT: vaddss %xmm6, %xmm1, %xmm1
+; AVX-NEXT: vaddss %xmm1, %xmm0, %xmm0
+; AVX-NEXT: vaddss %xmm7, %xmm0, %xmm0
+; AVX-NEXT: retq
%t0 = fadd float %x0, %x1
%t1 = fadd float %t0, %x2
%t2 = fadd float %t1, %x3
@@ -83,17 +123,90 @@ define float @reassociate_adds5(float %x0, float %x1, float %x2, float %x3, floa
; Also, we should reassociate such that the result of the high latency division
; is used by the final 'add' rather than reassociating the %x3 operand with the
; division. The latter reassociation would not improve anything.
-
+
define float @reassociate_adds6(float %x0, float %x1, float %x2, float %x3) {
-; CHECK-LABEL: reassociate_adds6:
-; CHECK: # BB#0:
-; CHECK-NEXT: vdivss %xmm1, %xmm0, %xmm0
-; CHECK-NEXT: vaddss %xmm3, %xmm2, %xmm1
-; CHECK-NEXT: vaddss %xmm1, %xmm0, %xmm0
-; CHECK-NEXT: retq
+; SSE-LABEL: reassociate_adds6:
+; SSE: # BB#0:
+; SSE-NEXT: divss %xmm1, %xmm0
+; SSE-NEXT: addss %xmm3, %xmm2
+; SSE-NEXT: addss %xmm2, %xmm0
+; SSE-NEXT: retq
+;
+; AVX-LABEL: reassociate_adds6:
+; AVX: # BB#0:
+; AVX-NEXT: vdivss %xmm1, %xmm0, %xmm0
+; AVX-NEXT: vaddss %xmm3, %xmm2, %xmm1
+; AVX-NEXT: vaddss %xmm1, %xmm0, %xmm0
+; AVX-NEXT: retq
%t0 = fdiv float %x0, %x1
%t1 = fadd float %x2, %t0
%t2 = fadd float %x3, %t1
ret float %t2
}
+; Verify that SSE and AVX scalar single-precision multiplies are reassociated.
+
+define float @reassociate_muls1(float %x0, float %x1, float %x2, float %x3) {
+; SSE-LABEL: reassociate_muls1:
+; SSE: # BB#0:
+; SSE-NEXT: divss %xmm1, %xmm0
+; SSE-NEXT: mulss %xmm3, %xmm2
+; SSE-NEXT: mulss %xmm2, %xmm0
+; SSE-NEXT: retq
+;
+; AVX-LABEL: reassociate_muls1:
+; AVX: # BB#0:
+; AVX-NEXT: vdivss %xmm1, %xmm0, %xmm0
+; AVX-NEXT: vmulss %xmm3, %xmm2, %xmm1
+; AVX-NEXT: vmulss %xmm1, %xmm0, %xmm0
+; AVX-NEXT: retq
+ %t0 = fdiv float %x0, %x1
+ %t1 = fmul float %x2, %t0
+ %t2 = fmul float %x3, %t1
+ ret float %t2
+}
+
+; Verify that SSE and AVX scalar double-precision adds are reassociated.
+
+define double @reassociate_adds_double(double %x0, double %x1, double %x2, double %x3) {
+; SSE-LABEL: reassociate_adds_double:
+; SSE: # BB#0:
+; SSE-NEXT: divsd %xmm1, %xmm0
+; SSE-NEXT: addsd %xmm3, %xmm2
+; SSE-NEXT: addsd %xmm2, %xmm0
+; SSE-NEXT: retq
+;
+; AVX-LABEL: reassociate_adds_double:
+; AVX: # BB#0:
+; AVX-NEXT: vdivsd %xmm1, %xmm0, %xmm0
+; AVX-NEXT: vaddsd %xmm3, %xmm2, %xmm1
+; AVX-NEXT: vaddsd %xmm1, %xmm0, %xmm0
+; AVX-NEXT: retq
+ %t0 = fdiv double %x0, %x1
+ %t1 = fadd double %x2, %t0
+ %t2 = fadd double %x3, %t1
+ ret double %t2
+}
+
+; Verify that SSE and AVX scalar double-precision multiplies are reassociated.
+
+define double @reassociate_muls_double(double %x0, double %x1, double %x2, double %x3) {
+; SSE-LABEL: reassociate_muls_double:
+; SSE: # BB#0:
+; SSE-NEXT: divsd %xmm1, %xmm0
+; SSE-NEXT: mulsd %xmm3, %xmm2
+; SSE-NEXT: mulsd %xmm2, %xmm0
+; SSE-NEXT: retq
+;
+; AVX-LABEL: reassociate_muls_double:
+; AVX: # BB#0:
+; AVX-NEXT: vdivsd %xmm1, %xmm0, %xmm0
+; AVX-NEXT: vmulsd %xmm3, %xmm2, %xmm1
+; AVX-NEXT: vmulsd %xmm1, %xmm0, %xmm0
+; AVX-NEXT: retq
+ %t0 = fdiv double %x0, %x1
+ %t1 = fmul double %x2, %t0
+ %t2 = fmul double %x3, %t1
+ ret double %t2
+}
+
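The pattern in all of the checks above is the same: the IR builds a serial chain of dependent adds (or muls), and with unsafe-fp-math the machine combiner re-pairs the operands so the first two operations are independent, shortening the critical path from three operations to two. A rough IR-level sketch of the re-association (the actual rewrite happens on machine instructions, and the function name here is illustrative):

define float @reassociate_sketch(float %x0, float %x1, float %x2, float %x3) {
  ; serial form would be ((%x0 + %x1) + %x2) + %x3: three dependent adds
  ; re-associated form (%x0 + %x1) + (%x2 + %x3): the first two adds can
  ; execute in parallel
  %a = fadd float %x0, %x1
  %b = fadd float %x2, %x3
  %r = fadd float %a, %b
  ret float %r
}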
diff --git a/test/CodeGen/X86/pr13577.ll b/test/CodeGen/X86/pr13577.ll
index faaec262cb91..a6b721a7a6f1 100644
--- a/test/CodeGen/X86/pr13577.ll
+++ b/test/CodeGen/X86/pr13577.ll
@@ -1,5 +1,20 @@
-; RUN: llc < %s -march=x86-64
+; RUN: llc < %s -mtriple=x86_64-darwin | FileCheck %s
+; CHECK-LABEL: LCPI0_0:
+; CHECK-NEXT: .long 4286578688
+; CHECK-LABEL: LCPI0_1:
+; CHECK-NEXT: .long 2139095040
+
+; CHECK-LABEL: foo:
+; CHECK: movq {{.*}}, %rax
+; CHECK: shlq $48, %rax
+; CHECK: sets %al
+; CHECK: testb %al, %al
+; CHECK: flds LCPI0_0(%rip)
+; CHECK: flds LCPI0_1(%rip)
+; CHECK: fcmovne %st(1), %st(0)
+; CHECK: fstp %st(1)
+; CHECK: retq
define x86_fp80 @foo(x86_fp80 %a) {
%1 = tail call x86_fp80 @copysignl(x86_fp80 0xK7FFF8000000000000000, x86_fp80 %a) nounwind readnone
ret x86_fp80 %1
diff --git a/test/CodeGen/X86/read-fp-no-frame-pointer.ll b/test/CodeGen/X86/read-fp-no-frame-pointer.ll
new file mode 100644
index 000000000000..9f78c294ce88
--- /dev/null
+++ b/test/CodeGen/X86/read-fp-no-frame-pointer.ll
@@ -0,0 +1,12 @@
+; RUN: not llc < %s -mtriple=x86_64-linux-gnueabi 2>&1 | FileCheck %s
+
+define i32 @get_frame() nounwind {
+entry:
+; CHECK: register ebp is allocatable: function has no frame pointer
+ %fp = call i32 @llvm.read_register.i32(metadata !0)
+ ret i32 %fp
+}
+
+declare i32 @llvm.read_register.i32(metadata) nounwind
+
+!0 = !{!"ebp\00"}
diff --git a/test/CodeGen/X86/seh-catch-all-win32.ll b/test/CodeGen/X86/seh-catch-all-win32.ll
index 423b9914e99d..a4ea8ab78c79 100644
--- a/test/CodeGen/X86/seh-catch-all-win32.ll
+++ b/test/CodeGen/X86/seh-catch-all-win32.ll
@@ -10,14 +10,14 @@ declare void @crash()
declare i32 @printf(i8* nocapture readonly, ...) nounwind
declare i32 @llvm.eh.typeid.for(i8*)
declare i8* @llvm.frameaddress(i32)
-declare i8* @llvm.framerecover(i8*, i8*, i32)
-declare void @llvm.frameescape(...)
+declare i8* @llvm.localrecover(i8*, i8*, i32)
+declare void @llvm.localescape(...)
declare i8* @llvm.x86.seh.recoverfp(i8*, i8*)
define i32 @main() personality i8* bitcast (i32 (...)* @_except_handler3 to i8*) {
entry:
%__exceptioncode = alloca i32, align 4
- call void (...) @llvm.frameescape(i32* %__exceptioncode)
+ call void (...) @llvm.localescape(i32* %__exceptioncode)
invoke void @crash() #5
to label %__try.cont unwind label %lpad
@@ -45,7 +45,7 @@ define internal i32 @"filt$main"() {
entry:
%ebp = tail call i8* @llvm.frameaddress(i32 1)
%parentfp = tail call i8* @llvm.x86.seh.recoverfp(i8* bitcast (i32 ()* @main to i8*), i8* %ebp)
- %code.i8 = tail call i8* @llvm.framerecover(i8* bitcast (i32 ()* @main to i8*), i8* %parentfp, i32 0)
+ %code.i8 = tail call i8* @llvm.localrecover(i8* bitcast (i32 ()* @main to i8*), i8* %parentfp, i32 0)
%__exceptioncode = bitcast i8* %code.i8 to i32*
%info.addr = getelementptr inbounds i8, i8* %ebp, i32 -20
%0 = bitcast i8* %info.addr to i32***
@@ -59,26 +59,38 @@ entry:
; Check that we can get the exception code from eax to the printf.
; CHECK-LABEL: _main:
+; CHECK: pushl %ebp
+; CHECK: movl %esp, %ebp
+; Ensure that we push *all* the CSRs, since they are clobbered by the
+; __except block.
+; CHECK: pushl %ebx
+; CHECK: pushl %edi
+; CHECK: pushl %esi
+
; CHECK: Lmain$frame_escape_0 = [[code_offs:[-0-9]+]]
; CHECK: Lmain$frame_escape_1 = [[reg_offs:[-0-9]+]]
; CHECK: movl %esp, [[reg_offs]](%ebp)
; CHECK: movl $L__ehtable$main,
; EH state 0
-; CHECK: movl $0, -4(%ebp)
+; CHECK: movl $0, -16(%ebp)
; CHECK: calll _crash
+; CHECK: popl %esi
+; CHECK: popl %edi
+; CHECK: popl %ebx
; CHECK: retl
; CHECK: # Block address taken
; stackrestore
-; CHECK: movl [[reg_offs]](%ebp), %esp
+; CHECK: movl -24(%ebp), %esp
; EH state -1
; CHECK: movl [[code_offs]](%ebp), %[[code:[a-z]+]]
-; CHECK: movl $-1, -4(%ebp)
+; CHECK: movl $-1, -16(%ebp)
; CHECK-DAG: movl %[[code]], 4(%esp)
; CHECK-DAG: movl $_str, (%esp)
; CHECK: calll _printf
; CHECK: .section .xdata,"dr"
; CHECK: Lmain$parent_frame_offset = Lmain$frame_escape_1
+; CHECK: .align 4
; CHECK: L__ehtable$main
; CHECK-NEXT: .long -1
; CHECK-NEXT: .long _filt$main
diff --git a/test/CodeGen/X86/seh-except-finally.ll b/test/CodeGen/X86/seh-except-finally.ll
index 4327a64468f9..0630d001bb76 100644
--- a/test/CodeGen/X86/seh-except-finally.ll
+++ b/test/CodeGen/X86/seh-except-finally.ll
@@ -41,7 +41,7 @@ entry:
to label %invoke.cont unwind label %lpad
invoke.cont: ; preds = %entry
- %0 = call i8* @llvm.frameaddress(i32 0)
+ %0 = call i8* @llvm.localaddress()
invoke void @"\01?fin$0@0@use_both@@"(i1 zeroext false, i8* %0) #5
to label %invoke.cont2 unwind label %lpad1
@@ -56,7 +56,7 @@ lpad: ; preds = %entry
store i8* %2, i8** %exn.slot
%3 = extractvalue { i8*, i32 } %1, 1
store i32 %3, i32* %ehselector.slot
- %4 = call i8* @llvm.frameaddress(i32 0)
+ %4 = call i8* @llvm.localaddress()
invoke void @"\01?fin$0@0@use_both@@"(i1 zeroext true, i8* %4) #5
to label %invoke.cont3 unwind label %lpad1
@@ -153,7 +153,7 @@ declare i32 @puts(i8*) #3
declare i32 @__C_specific_handler(...)
; Function Attrs: nounwind readnone
-declare i8* @llvm.frameaddress(i32) #4
+declare i8* @llvm.localaddress() #4
; Function Attrs: nounwind readnone
declare i32 @llvm.eh.typeid.for(i8*) #4
diff --git a/test/CodeGen/X86/seh-stack-realign-win32.ll b/test/CodeGen/X86/seh-stack-realign-win32.ll
new file mode 100644
index 000000000000..f3ab71803ca7
--- /dev/null
+++ b/test/CodeGen/X86/seh-stack-realign-win32.ll
@@ -0,0 +1,99 @@
+; RUN: llc -mtriple=i686-windows-msvc < %s | FileCheck %s
+
+; 32-bit catch-all has to use a filter function because that's how it saves the
+; exception code.
+
+@str = linkonce_odr unnamed_addr constant [27 x i8] c"GetExceptionCode(): 0x%lx\0A\00", align 1
+
+declare i32 @_except_handler3(...)
+declare void @crash()
+declare i32 @printf(i8* nocapture readonly, ...) nounwind
+declare i32 @llvm.eh.typeid.for(i8*)
+declare i8* @llvm.frameaddress(i32)
+declare i8* @llvm.localrecover(i8*, i8*, i32)
+declare void @llvm.localescape(...)
+declare i8* @llvm.x86.seh.recoverfp(i8*, i8*)
+
+define i32 @main() personality i8* bitcast (i32 (...)* @_except_handler3 to i8*) {
+entry:
+ ; The EH code allocation is overaligned, triggering realignment.
+ %__exceptioncode = alloca i32, align 8
+ call void (...) @llvm.localescape(i32* %__exceptioncode)
+ invoke void @crash() #5
+ to label %__try.cont unwind label %lpad
+
+lpad: ; preds = %entry
+ %0 = landingpad { i8*, i32 }
+ catch i8* bitcast (i32 ()* @"filt$main" to i8*)
+ %1 = extractvalue { i8*, i32 } %0, 1
+ %2 = call i32 @llvm.eh.typeid.for(i8* bitcast (i32 ()* @"filt$main" to i8*)) #4
+ %matches = icmp eq i32 %1, %2
+ br i1 %matches, label %__except, label %eh.resume
+
+__except: ; preds = %lpad
+ %3 = load i32, i32* %__exceptioncode, align 4
+ %call = call i32 (i8*, ...) @printf(i8* getelementptr inbounds ([27 x i8], [27 x i8]* @str, i32 0, i32 0), i32 %3) #4
+ br label %__try.cont
+
+__try.cont: ; preds = %entry, %__except
+ ret i32 0
+
+eh.resume: ; preds = %lpad
+ resume { i8*, i32 } %0
+}
+
+define internal i32 @"filt$main"() {
+entry:
+ %ebp = tail call i8* @llvm.frameaddress(i32 1)
+ %parentfp = tail call i8* @llvm.x86.seh.recoverfp(i8* bitcast (i32 ()* @main to i8*), i8* %ebp)
+ %code.i8 = tail call i8* @llvm.localrecover(i8* bitcast (i32 ()* @main to i8*), i8* %parentfp, i32 0)
+ %__exceptioncode = bitcast i8* %code.i8 to i32*
+ %info.addr = getelementptr inbounds i8, i8* %ebp, i32 -20
+ %0 = bitcast i8* %info.addr to i32***
+ %1 = load i32**, i32*** %0, align 4
+ %2 = load i32*, i32** %1, align 4
+ %3 = load i32, i32* %2, align 4
+ store i32 %3, i32* %__exceptioncode, align 4
+ ret i32 1
+}
+
+; Check that we can get the exception code from eax to the printf.
+
+; CHECK-LABEL: _main:
+; CHECK: Lmain$frame_escape_0 = [[code_offs:[-0-9]+]]
+; CHECK: Lmain$frame_escape_1 = [[reg_offs:[-0-9]+]]
+; CHECK: movl %esp, [[reg_offs]](%esi)
+; CHECK: movl $L__ehtable$main,
+; EH state 0
+; CHECK: movl $0, 40(%esi)
+; CHECK: calll _crash
+; CHECK: retl
+; CHECK: # Block address taken
+; stackrestore
+; CHECK: movl -24(%ebp), %esp
+; CHECK: movl $Lmain$parent_frame_offset, %eax
+; CHECK: negl %eax
+; CHECK: leal -24(%ebp,%eax), %esi
+; CHECK: movl 12(%esi), %ebp # 4-byte Reload
+; EH state -1
+; CHECK: movl [[code_offs]](%esi), %[[code:[a-z]+]]
+; CHECK: movl $-1, 40(%esi)
+; CHECK-DAG: movl %[[code]], 4(%esp)
+; CHECK-DAG: movl $_str, (%esp)
+; CHECK: calll _printf
+
+; CHECK: .section .xdata,"dr"
+; CHECK: Lmain$parent_frame_offset = Lmain$frame_escape_1
+; CHECK: L__ehtable$main
+; CHECK-NEXT: .long -1
+; CHECK-NEXT: .long _filt$main
+; CHECK-NEXT: .long Ltmp{{[0-9]+}}
+
+; CHECK-LABEL: _filt$main:
+; CHECK: pushl %ebp
+; CHECK: movl %esp, %ebp
+; CHECK: movl (%ebp), %[[oldebp:[a-z]+]]
+; CHECK: movl -20(%[[oldebp]]), %[[ehinfo:[a-z]+]]
+; CHECK: movl (%[[ehinfo]]), %[[ehrec:[a-z]+]]
+; CHECK: movl (%[[ehrec]]), %[[ehcode:[a-z]+]]
+; CHECK: movl %[[ehcode]], {{.*}}(%{{.*}})
diff --git a/test/CodeGen/X86/seh-stack-realign.ll b/test/CodeGen/X86/seh-stack-realign.ll
new file mode 100644
index 000000000000..f2fb28a081f9
--- /dev/null
+++ b/test/CodeGen/X86/seh-stack-realign.ll
@@ -0,0 +1,101 @@
+; RUN: llc -mtriple=i686-windows-msvc < %s | FileCheck %s
+
+; 32-bit catch-all has to use a filter function because that's how it saves the
+; exception code.
+
+@str = linkonce_odr unnamed_addr constant [27 x i8] c"GetExceptionCode(): 0x%lx\0A\00", align 1
+
+declare i32 @_except_handler3(...)
+declare void @crash()
+declare i32 @printf(i8* nocapture readonly, ...) nounwind
+declare i32 @llvm.eh.typeid.for(i8*)
+declare i8* @llvm.frameaddress(i32)
+declare i8* @llvm.localrecover(i8*, i8*, i32)
+declare void @llvm.localescape(...)
+declare i8* @llvm.x86.seh.recoverfp(i8*, i8*)
+
+define i32 @main() personality i8* bitcast (i32 (...)* @_except_handler3 to i8*) {
+entry:
+ ; The EH code allocation is overaligned, triggering realignment.
+ %__exceptioncode = alloca i32, align 8
+ call void (...) @llvm.localescape(i32* %__exceptioncode)
+ invoke void @crash() #5
+ to label %__try.cont unwind label %lpad
+
+lpad: ; preds = %entry
+ %0 = landingpad { i8*, i32 }
+ catch i8* bitcast (i32 ()* @"filt$main" to i8*)
+ %1 = extractvalue { i8*, i32 } %0, 1
+ %2 = call i32 @llvm.eh.typeid.for(i8* bitcast (i32 ()* @"filt$main" to i8*)) #4
+ %matches = icmp eq i32 %1, %2
+ br i1 %matches, label %__except, label %eh.resume
+
+__except: ; preds = %lpad
+ %3 = load i32, i32* %__exceptioncode, align 4
+ %call = call i32 (i8*, ...) @printf(i8* getelementptr inbounds ([27 x i8], [27 x i8]* @str, i32 0, i32 0), i32 %3) #4
+ br label %__try.cont
+
+__try.cont: ; preds = %entry, %__except
+ ret i32 0
+
+eh.resume: ; preds = %lpad
+ resume { i8*, i32 } %0
+}
+
+define internal i32 @"filt$main"() {
+entry:
+ %ebp = tail call i8* @llvm.frameaddress(i32 1)
+ %parentfp = tail call i8* @llvm.x86.seh.recoverfp(i8* bitcast (i32 ()* @main to i8*), i8* %ebp)
+ %code.i8 = tail call i8* @llvm.localrecover(i8* bitcast (i32 ()* @main to i8*), i8* %parentfp, i32 0)
+ %__exceptioncode = bitcast i8* %code.i8 to i32*
+ %info.addr = getelementptr inbounds i8, i8* %ebp, i32 -20
+ %0 = bitcast i8* %info.addr to i32***
+ %1 = load i32**, i32*** %0, align 4
+ %2 = load i32*, i32** %1, align 4
+ %3 = load i32, i32* %2, align 4
+ store i32 %3, i32* %__exceptioncode, align 4
+ ret i32 1
+}
+
+; Check that we can get the exception code from eax to the printf.
+
+; CHECK-LABEL: _main:
+; CHECK: Lmain$frame_escape_0 = [[code_offs:[-0-9]+]]
+; CHECK: Lmain$frame_escape_1 = [[reg_offs:[-0-9]+]]
+; CHECK: movl %esp, [[reg_offs]](%esi)
+; CHECK: movl $L__ehtable$main,
+; EH state 0
+; CHECK: movl $0, 40(%esi)
+; CHECK: calll _crash
+; CHECK: retl
+; CHECK: # Block address taken
+; Restore ESP
+; CHECK: movl -24(%ebp), %esp
+; Restore ESI
+; CHECK: movl $Lmain$parent_frame_offset, %eax
+; CHECK: negl %eax
+; CHECK: leal -24(%ebp,%eax), %esi
+; Restore EBP
+; CHECK: movl 12(%esi), %ebp # 4-byte Reload
+; EH state -1
+; CHECK: movl [[code_offs]](%esi), %[[code:[a-z]+]]
+; CHECK: movl $-1, 40(%esi)
+; CHECK-DAG: movl %[[code]], 4(%esp)
+; CHECK-DAG: movl $_str, (%esp)
+; CHECK: calll _printf
+
+; CHECK: .section .xdata,"dr"
+; CHECK: Lmain$parent_frame_offset = Lmain$frame_escape_1
+; CHECK: L__ehtable$main
+; CHECK-NEXT: .long -1
+; CHECK-NEXT: .long _filt$main
+; CHECK-NEXT: .long Ltmp{{[0-9]+}}
+
+; CHECK-LABEL: _filt$main:
+; CHECK: pushl %ebp
+; CHECK: movl %esp, %ebp
+; CHECK: movl (%ebp), %[[oldebp:[a-z]+]]
+; CHECK: movl -20(%[[oldebp]]), %[[ehinfo:[a-z]+]]
+; CHECK: movl (%[[ehinfo]]), %[[ehrec:[a-z]+]]
+; CHECK: movl (%[[ehrec]]), %[[ehcode:[a-z]+]]
+; CHECK: movl %[[ehcode]], {{.*}}(%{{.*}})
diff --git a/test/CodeGen/X86/sqrt-fastmath.ll b/test/CodeGen/X86/sqrt-fastmath.ll
index 0f8d9f4d713f..9b851db8121c 100644
--- a/test/CodeGen/X86/sqrt-fastmath.ll
+++ b/test/CodeGen/X86/sqrt-fastmath.ll
@@ -34,11 +34,11 @@ define float @ff(float %f) #0 {
; ESTIMATE: # BB#0:
; ESTIMATE-NEXT: vrsqrtss %xmm0, %xmm0, %xmm1
; ESTIMATE-NEXT: vmulss {{.*}}(%rip), %xmm1, %xmm2
-; ESTIMATE-NEXT: vmulss %xmm1, %xmm1, %xmm1
-; ESTIMATE-NEXT: vmulss %xmm0, %xmm1, %xmm1
+; ESTIMATE-NEXT: vmulss %xmm0, %xmm1, %xmm3
+; ESTIMATE-NEXT: vmulss %xmm3, %xmm1, %xmm1
; ESTIMATE-NEXT: vaddss {{.*}}(%rip), %xmm1, %xmm1
+; ESTIMATE-NEXT: vmulss %xmm0, %xmm2, %xmm2
; ESTIMATE-NEXT: vmulss %xmm2, %xmm1, %xmm1
-; ESTIMATE-NEXT: vmulss %xmm1, %xmm0, %xmm1
; ESTIMATE-NEXT: vxorps %xmm2, %xmm2, %xmm2
; ESTIMATE-NEXT: vcmpeqss %xmm2, %xmm0, %xmm0
; ESTIMATE-NEXT: vandnps %xmm1, %xmm0, %xmm0
@@ -78,7 +78,7 @@ define float @reciprocal_square_root(float %x) #0 {
; ESTIMATE: # BB#0:
; ESTIMATE-NEXT: vrsqrtss %xmm0, %xmm0, %xmm1
; ESTIMATE-NEXT: vmulss {{.*}}(%rip), %xmm1, %xmm2
-; ESTIMATE-NEXT: vmulss %xmm1, %xmm1, %xmm1
+; ESTIMATE-NEXT: vmulss %xmm0, %xmm1, %xmm0
; ESTIMATE-NEXT: vmulss %xmm0, %xmm1, %xmm0
; ESTIMATE-NEXT: vaddss {{.*}}(%rip), %xmm0, %xmm0
; ESTIMATE-NEXT: vmulss %xmm2, %xmm0, %xmm0
diff --git a/test/CodeGen/X86/sse2-vector-shifts.ll b/test/CodeGen/X86/sse2-vector-shifts.ll
index 7c8d5e578898..45028cf4bd37 100644
--- a/test/CodeGen/X86/sse2-vector-shifts.ll
+++ b/test/CodeGen/X86/sse2-vector-shifts.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -mtriple=x86_64-pc-linux -mattr=+sse2 -mcpu=corei7 | FileCheck %s
+; RUN: llc < %s -mtriple=x86_64-pc-linux -mattr=+sse2 | FileCheck %s
; SSE2 Logical Shift Left
diff --git a/test/CodeGen/X86/sse3.ll b/test/CodeGen/X86/sse3.ll
index c1cd91beaf53..398675276c66 100644
--- a/test/CodeGen/X86/sse3.ll
+++ b/test/CodeGen/X86/sse3.ll
@@ -1,6 +1,6 @@
; These are tests for SSE3 codegen.
-; RUN: llc < %s -march=x86-64 -mcpu=nocona -mtriple=i686-apple-darwin9 -O3 | FileCheck %s --check-prefix=X64
+; RUN: llc < %s -mtriple=x86_64-apple-darwin9 --mattr=+sse3 | FileCheck %s --check-prefix=X64
; Test for v8xi16 lowering where we extract the first element of the vector and
; placed it in the second element of the result.
diff --git a/test/CodeGen/X86/stack-folding-fp-avx1.ll b/test/CodeGen/X86/stack-folding-fp-avx1.ll
index c7c1fc946386..63aa742bdf01 100644
--- a/test/CodeGen/X86/stack-folding-fp-avx1.ll
+++ b/test/CodeGen/X86/stack-folding-fp-avx1.ll
@@ -1409,12 +1409,26 @@ define <8 x float> @stack_fold_roundps_ymm(<8 x float> %a0) {
}
declare <8 x float> @llvm.x86.avx.round.ps.256(<8 x float>, i32) nounwind readnone
-; TODO stack_fold_roundsd
+define double @stack_fold_roundsd(double %a0) optsize {
+ ;CHECK-LABEL: stack_fold_roundsd
+ ;CHECK: vroundsd $1, {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 8-byte Folded Reload
+ %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
+ %2 = call double @llvm.floor.f64(double %a0)
+ ret double %2
+}
+declare double @llvm.floor.f64(double) nounwind readnone
; TODO stack_fold_roundsd_int
declare <2 x double> @llvm.x86.sse41.round.sd(<2 x double>, <2 x double>, i32) nounwind readnone
-; TODO stack_fold_roundss
+define float @stack_fold_roundss(float %a0) optsize {
+ ;CHECK-LABEL: stack_fold_roundss
+ ;CHECK: vroundss $1, {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 4-byte Folded Reload
+ %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
+ %2 = call float @llvm.floor.f32(float %a0)
+ ret float %2
+}
+declare float @llvm.floor.f32(float) nounwind readnone
; TODO stack_fold_roundss_int
declare <4 x float> @llvm.x86.sse41.round.ss(<4 x float>, <4 x float>, i32) nounwind readnone
diff --git a/test/CodeGen/X86/stack-folding-fp-sse42.ll b/test/CodeGen/X86/stack-folding-fp-sse42.ll
index 63acf5f4f96f..f9fcbaabdebb 100644
--- a/test/CodeGen/X86/stack-folding-fp-sse42.ll
+++ b/test/CodeGen/X86/stack-folding-fp-sse42.ll
@@ -884,11 +884,29 @@ define <4 x float> @stack_fold_roundps(<4 x float> %a0) {
}
declare <4 x float> @llvm.x86.sse41.round.ps(<4 x float>, i32) nounwind readnone
-; TODO stack_fold_roundsd
+define double @stack_fold_roundsd(double %a0) optsize {
+ ;CHECK-LABEL: stack_fold_roundsd
+ ;CHECK: roundsd $1, {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 8-byte Folded Reload
+ %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
+ %2 = call double @llvm.floor.f64(double %a0)
+ ret double %2
+}
+declare double @llvm.floor.f64(double) nounwind readnone
+
; TODO stack_fold_roundsd_int
+declare <2 x double> @llvm.x86.sse41.round.sd(<2 x double>, <2 x double>, i32) nounwind readnone
+
+define float @stack_fold_roundss(float %a0) optsize {
+ ;CHECK-LABEL: stack_fold_roundss
+ ;CHECK: roundss $1, {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 4-byte Folded Reload
+ %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
+ %2 = call float @llvm.floor.f32(float %a0)
+ ret float %2
+}
+declare float @llvm.floor.f32(float) nounwind readnone
-; TODO stack_fold_roundss
; TODO stack_fold_roundss_int
+declare <4 x float> @llvm.x86.sse41.round.ss(<4 x float>, <4 x float>, i32) nounwind readnone
; TODO stack_fold_rsqrtps
@@ -938,13 +956,25 @@ define <4 x float> @stack_fold_sqrtps(<4 x float> %a0) {
}
declare <4 x float> @llvm.x86.sse.sqrt.ps(<4 x float>) nounwind readnone
-; TODO stack_fold_sqrtsd
+define double @stack_fold_sqrtsd(double %a0) optsize {
+ ;CHECK-LABEL: stack_fold_sqrtsd
+ ;CHECK: sqrtsd {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 8-byte Folded Reload
+ %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
+ %2 = call double @llvm.sqrt.f64(double %a0)
+ ret double %2
+}
declare double @llvm.sqrt.f64(double) nounwind readnone
; TODO stack_fold_sqrtsd_int
declare <2 x double> @llvm.x86.sse2.sqrt.sd(<2 x double>) nounwind readnone
-; TODO stack_fold_sqrtss
+define float @stack_fold_sqrtss(float %a0) optsize {
+ ;CHECK-LABEL: stack_fold_sqrtss
+ ;CHECK: sqrtss {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 4-byte Folded Reload
+ %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
+ %2 = call float @llvm.sqrt.f32(float %a0)
+ ret float %2
+}
declare float @llvm.sqrt.f32(float) nounwind readnone
; TODO stack_fold_sqrtss_int
diff --git a/test/CodeGen/X86/vec_fp_to_int.ll b/test/CodeGen/X86/vec_fp_to_int.ll
index 3e72212d85d3..3b1b2f5c1c77 100644
--- a/test/CodeGen/X86/vec_fp_to_int.ll
+++ b/test/CodeGen/X86/vec_fp_to_int.ll
@@ -1,5 +1,10 @@
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefix=ALL --check-prefix=SSE2
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefix=ALL --check-prefix=AVX
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefix=ALL --check-prefix=SSE --check-prefix=SSE2
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX1
+;
+; 32-bit tests to make sure we're not doing anything stupid.
+; RUN: llc < %s -mtriple=i686-unknown-unknown
+; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+sse
+; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+sse2
;
; Double to Signed Integer
diff --git a/test/CodeGen/X86/vec_int_to_fp.ll b/test/CodeGen/X86/vec_int_to_fp.ll
index ca8be65075b9..4a3d08813904 100644
--- a/test/CodeGen/X86/vec_int_to_fp.ll
+++ b/test/CodeGen/X86/vec_int_to_fp.ll
@@ -1,6 +1,11 @@
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefix=ALL --check-prefix=SSE2
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefix=ALL --check-prefix=SSE --check-prefix=SSE2
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX1
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX2
+;
+; 32-bit tests to make sure we're not doing anything stupid.
+; RUN: llc < %s -mtriple=i686-unknown-unknown
+; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+sse
+; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+sse2
;
; Signed Integer to Double
@@ -279,36 +284,19 @@ define <2 x double> @uitofp_2vf64_i32(<4 x i32> %a) {
define <2 x double> @uitofp_2vf64_i16(<8 x i16> %a) {
; SSE2-LABEL: uitofp_2vf64_i16:
; SSE2: # BB#0:
-; SSE2-NEXT: pxor %xmm1, %xmm1
-; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
-; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
-; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [1127219200,1160773632,0,0]
-; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,3,0,1]
-; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
-; SSE2-NEXT: movapd {{.*#+}} xmm3 = [4.503600e+15,1.934281e+25]
-; SSE2-NEXT: subpd %xmm3, %xmm0
-; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm0[2,3,0,1]
-; SSE2-NEXT: addpd %xmm4, %xmm0
-; SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
-; SSE2-NEXT: subpd %xmm3, %xmm2
-; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm2[2,3,0,1]
-; SSE2-NEXT: addpd %xmm2, %xmm1
-; SSE2-NEXT: unpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,3]
+; SSE2-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,5,5,6,7]
+; SSE2-NEXT: pand .LCPI10_0(%rip), %xmm0
+; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; SSE2-NEXT: cvtdq2pd %xmm0, %xmm0
; SSE2-NEXT: retq
;
; AVX-LABEL: uitofp_2vf64_i16:
; AVX: # BB#0:
; AVX-NEXT: vpmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero
-; AVX-NEXT: vmovdqa {{.*#+}} xmm1 = [1127219200,1160773632,0,0]
-; AVX-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
-; AVX-NEXT: vmovapd {{.*#+}} xmm3 = [4.503600e+15,1.934281e+25]
-; AVX-NEXT: vsubpd %xmm3, %xmm2, %xmm2
-; AVX-NEXT: vhaddpd %xmm2, %xmm2, %xmm2
-; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
-; AVX-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
-; AVX-NEXT: vsubpd %xmm3, %xmm0, %xmm0
-; AVX-NEXT: vhaddpd %xmm0, %xmm0, %xmm0
-; AVX-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm2[0],xmm0[0]
+; AVX-NEXT: vpand .LCPI10_0(%rip), %xmm0, %xmm0
+; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; AVX-NEXT: vcvtdq2pd %xmm0, %xmm0
; AVX-NEXT: retq
%shuf = shufflevector <8 x i16> %a, <8 x i16> undef, <2 x i32> <i32 0, i32 1>
%cvt = uitofp <2 x i16> %shuf to <2 x double>
@@ -318,37 +306,20 @@ define <2 x double> @uitofp_2vf64_i16(<8 x i16> %a) {
define <2 x double> @uitofp_2vf64_i8(<16 x i8> %a) {
; SSE2-LABEL: uitofp_2vf64_i8:
; SSE2: # BB#0:
-; SSE2-NEXT: pxor %xmm1, %xmm1
-; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
-; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
-; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
-; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [1127219200,1160773632,0,0]
-; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,3,0,1]
-; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
-; SSE2-NEXT: movapd {{.*#+}} xmm3 = [4.503600e+15,1.934281e+25]
-; SSE2-NEXT: subpd %xmm3, %xmm0
-; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm0[2,3,0,1]
-; SSE2-NEXT: addpd %xmm4, %xmm0
-; SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
-; SSE2-NEXT: subpd %xmm3, %xmm2
-; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm2[2,3,0,1]
-; SSE2-NEXT: addpd %xmm2, %xmm1
-; SSE2-NEXT: unpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
+; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
+; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0,0,1,1]
+; SSE2-NEXT: pand .LCPI11_0(%rip), %xmm0
+; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; SSE2-NEXT: cvtdq2pd %xmm0, %xmm0
; SSE2-NEXT: retq
;
; AVX-LABEL: uitofp_2vf64_i8:
; AVX: # BB#0:
; AVX-NEXT: vpmovzxbq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero
-; AVX-NEXT: vmovdqa {{.*#+}} xmm1 = [1127219200,1160773632,0,0]
-; AVX-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
-; AVX-NEXT: vmovapd {{.*#+}} xmm3 = [4.503600e+15,1.934281e+25]
-; AVX-NEXT: vsubpd %xmm3, %xmm2, %xmm2
-; AVX-NEXT: vhaddpd %xmm2, %xmm2, %xmm2
-; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
-; AVX-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
-; AVX-NEXT: vsubpd %xmm3, %xmm0, %xmm0
-; AVX-NEXT: vhaddpd %xmm0, %xmm0, %xmm0
-; AVX-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm2[0],xmm0[0]
+; AVX-NEXT: vpand .LCPI11_0(%rip), %xmm0, %xmm0
+; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; AVX-NEXT: vcvtdq2pd %xmm0, %xmm0
; AVX-NEXT: retq
%shuf = shufflevector <16 x i8> %a, <16 x i8> undef, <2 x i32> <i32 0, i32 1>
%cvt = uitofp <2 x i8> %shuf to <2 x double>
@@ -493,34 +464,11 @@ define <4 x double> @uitofp_4vf64_i16(<8 x i16> %a) {
; SSE2-LABEL: uitofp_4vf64_i16:
; SSE2: # BB#0:
; SSE2-NEXT: pxor %xmm1, %xmm1
-; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[0,1,2,1]
; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
-; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
-; SSE2-NEXT: movdqa {{.*#+}} xmm3 = [1127219200,1160773632,0,0]
-; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1]
-; SSE2-NEXT: movapd {{.*#+}} xmm4 = [4.503600e+15,1.934281e+25]
-; SSE2-NEXT: subpd %xmm4, %xmm0
-; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm0[2,3,0,1]
-; SSE2-NEXT: addpd %xmm5, %xmm0
-; SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1]
-; SSE2-NEXT: subpd %xmm4, %xmm1
-; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm1[2,3,0,1]
-; SSE2-NEXT: addpd %xmm1, %xmm5
-; SSE2-NEXT: unpcklpd {{.*#+}} xmm0 = xmm0[0],xmm5[0]
-; SSE2-NEXT: pshuflw {{.*#+}} xmm1 = xmm2[2,1,2,3,4,5,6,7]
-; SSE2-NEXT: pshufhw {{.*#+}} xmm2 = xmm1[0,1,2,3,7,5,6,7]
-; SSE2-NEXT: pand .LCPI14_2(%rip), %xmm2
-; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm2[2,3,0,1]
-; SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]
-; SSE2-NEXT: subpd %xmm4, %xmm2
-; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm2[2,3,0,1]
-; SSE2-NEXT: addpd %xmm2, %xmm1
-; SSE2-NEXT: punpckldq {{.*#+}} xmm5 = xmm5[0],xmm3[0],xmm5[1],xmm3[1]
-; SSE2-NEXT: subpd %xmm4, %xmm5
-; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm5[2,3,0,1]
-; SSE2-NEXT: addpd %xmm5, %xmm2
-; SSE2-NEXT: unpcklpd {{.*#+}} xmm1 = xmm1[0],xmm2[0]
+; SSE2-NEXT: cvtdq2pd %xmm0, %xmm2
+; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
+; SSE2-NEXT: cvtdq2pd %xmm0, %xmm1
+; SSE2-NEXT: movaps %xmm2, %xmm0
; SSE2-NEXT: retq
;
; AVX-LABEL: uitofp_4vf64_i16:
@@ -536,38 +484,13 @@ define <4 x double> @uitofp_4vf64_i16(<8 x i16> %a) {
define <4 x double> @uitofp_4vf64_i8(<16 x i8> %a) {
; SSE2-LABEL: uitofp_4vf64_i8:
; SSE2: # BB#0:
-; SSE2-NEXT: movdqa %xmm0, %xmm1
-; SSE2-NEXT: pxor %xmm2, %xmm2
-; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7]
-; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
-; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
-; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [1127219200,1160773632,0,0]
-; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm0[2,3,0,1]
-; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
-; SSE2-NEXT: movapd {{.*#+}} xmm3 = [4.503600e+15,1.934281e+25]
-; SSE2-NEXT: subpd %xmm3, %xmm0
-; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm0[2,3,0,1]
-; SSE2-NEXT: addpd %xmm5, %xmm0
-; SSE2-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm2[0],xmm4[1],xmm2[1]
-; SSE2-NEXT: subpd %xmm3, %xmm4
-; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm4[2,3,0,1]
-; SSE2-NEXT: addpd %xmm4, %xmm5
-; SSE2-NEXT: unpcklpd {{.*#+}} xmm0 = xmm0[0],xmm5[0]
-; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
-; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,2,1]
-; SSE2-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[2,1,2,3,4,5,6,7]
-; SSE2-NEXT: pshufhw {{.*#+}} xmm4 = xmm1[0,1,2,3,7,5,6,7]
-; SSE2-NEXT: pand .LCPI15_2(%rip), %xmm4
-; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm4[2,3,0,1]
-; SSE2-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm2[0],xmm4[1],xmm2[1]
-; SSE2-NEXT: subpd %xmm3, %xmm4
-; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm4[2,3,0,1]
-; SSE2-NEXT: addpd %xmm4, %xmm1
-; SSE2-NEXT: punpckldq {{.*#+}} xmm5 = xmm5[0],xmm2[0],xmm5[1],xmm2[1]
-; SSE2-NEXT: subpd %xmm3, %xmm5
-; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm5[2,3,0,1]
-; SSE2-NEXT: addpd %xmm5, %xmm2
-; SSE2-NEXT: unpcklpd {{.*#+}} xmm1 = xmm1[0],xmm2[0]
+; SSE2-NEXT: pxor %xmm1, %xmm1
+; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
+; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; SSE2-NEXT: cvtdq2pd %xmm0, %xmm2
+; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
+; SSE2-NEXT: cvtdq2pd %xmm0, %xmm1
+; SSE2-NEXT: movaps %xmm2, %xmm0
; SSE2-NEXT: retq
;
; AVX-LABEL: uitofp_4vf64_i8:
diff --git a/test/CodeGen/X86/vector-gep.ll b/test/CodeGen/X86/vector-gep.ll
index ce98e6759b65..47878360ca0a 100644
--- a/test/CodeGen/X86/vector-gep.ll
+++ b/test/CodeGen/X86/vector-gep.ll
@@ -92,3 +92,25 @@ entry:
;CHECK: ret
}
+;CHECK-LABEL: AGEP7:
+define <4 x i8*> @AGEP7(<4 x i8*> %param, i32 %off) nounwind {
+entry:
+;CHECK: vbroadcastss
+;CHECK: vpadd
+ %A = getelementptr i8, <4 x i8*> %param, i32 %off
+ ret <4 x i8*> %A
+;CHECK: ret
+}
+
+;CHECK-LABEL: AGEP8:
+define <4 x i16*> @AGEP8(i16* %param, <4 x i32> %off) nounwind {
+entry:
+; Multiply offset by two (add it to itself).
+;CHECK: vpadd
+; Add the base to the offset.
+;CHECK: vbroadcastss
+;CHECK-NEXT: vpadd
+ %A = getelementptr i16, i16* %param, <4 x i32> %off
+ ret <4 x i16*> %A
+;CHECK: ret
+}
diff --git a/test/CodeGen/X86/vector-sext.ll b/test/CodeGen/X86/vector-sext.ll
index aafc05b2ed4c..8e79493ddd07 100644
--- a/test/CodeGen/X86/vector-sext.ll
+++ b/test/CodeGen/X86/vector-sext.ll
@@ -160,14 +160,14 @@ entry:
define <4 x i32> @load_sext_test1(<4 x i16> *%ptr) {
; SSE2-LABEL: load_sext_test1:
; SSE2: # BB#0: # %entry
-; SSE2-NEXT: movq (%rdi), %xmm0
+; SSE2-NEXT: movq {{.*#+}} xmm0 = mem[0],zero
; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
; SSE2-NEXT: psrad $16, %xmm0
; SSE2-NEXT: retq
;
; SSSE3-LABEL: load_sext_test1:
; SSSE3: # BB#0: # %entry
-; SSSE3-NEXT: movq (%rdi), %xmm0
+; SSSE3-NEXT: movq {{.*#+}} xmm0 = mem[0],zero
; SSSE3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
; SSSE3-NEXT: psrad $16, %xmm0
; SSSE3-NEXT: retq
@@ -196,7 +196,7 @@ entry:
define <4 x i32> @load_sext_test2(<4 x i8> *%ptr) {
; SSE2-LABEL: load_sext_test2:
; SSE2: # BB#0: # %entry
-; SSE2-NEXT: movd (%rdi), %xmm0
+; SSE2-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
; SSE2-NEXT: psrad $24, %xmm0
@@ -204,7 +204,7 @@ define <4 x i32> @load_sext_test2(<4 x i8> *%ptr) {
;
; SSSE3-LABEL: load_sext_test2:
; SSSE3: # BB#0: # %entry
-; SSSE3-NEXT: movd (%rdi), %xmm0
+; SSSE3-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
; SSSE3-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; SSSE3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
; SSSE3-NEXT: psrad $24, %xmm0
@@ -280,7 +280,7 @@ entry:
define <2 x i64> @load_sext_test4(<2 x i16> *%ptr) {
; SSE2-LABEL: load_sext_test4:
; SSE2: # BB#0: # %entry
-; SSE2-NEXT: movd (%rdi), %xmm0
+; SSE2-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
; SSE2-NEXT: movdqa %xmm0, %xmm1
; SSE2-NEXT: psrad $31, %xmm1
@@ -290,7 +290,7 @@ define <2 x i64> @load_sext_test4(<2 x i16> *%ptr) {
;
; SSSE3-LABEL: load_sext_test4:
; SSSE3: # BB#0: # %entry
-; SSSE3-NEXT: movd (%rdi), %xmm0
+; SSSE3-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
; SSSE3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
; SSSE3-NEXT: movdqa %xmm0, %xmm1
; SSSE3-NEXT: psrad $31, %xmm1
@@ -322,7 +322,7 @@ entry:
define <2 x i64> @load_sext_test5(<2 x i32> *%ptr) {
; SSE2-LABEL: load_sext_test5:
; SSE2: # BB#0: # %entry
-; SSE2-NEXT: movq (%rdi), %xmm0
+; SSE2-NEXT: movq {{.*#+}} xmm0 = mem[0],zero
; SSE2-NEXT: movdqa %xmm0, %xmm1
; SSE2-NEXT: psrad $31, %xmm1
; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
@@ -330,7 +330,7 @@ define <2 x i64> @load_sext_test5(<2 x i32> *%ptr) {
;
; SSSE3-LABEL: load_sext_test5:
; SSSE3: # BB#0: # %entry
-; SSSE3-NEXT: movq (%rdi), %xmm0
+; SSSE3-NEXT: movq {{.*#+}} xmm0 = mem[0],zero
; SSSE3-NEXT: movdqa %xmm0, %xmm1
; SSSE3-NEXT: psrad $31, %xmm1
; SSSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
@@ -360,14 +360,14 @@ entry:
define <8 x i16> @load_sext_test6(<8 x i8> *%ptr) {
; SSE2-LABEL: load_sext_test6:
; SSE2: # BB#0: # %entry
-; SSE2-NEXT: movq (%rdi), %xmm0
+; SSE2-NEXT: movq {{.*#+}} xmm0 = mem[0],zero
; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; SSE2-NEXT: psraw $8, %xmm0
; SSE2-NEXT: retq
;
; SSSE3-LABEL: load_sext_test6:
; SSSE3: # BB#0: # %entry
-; SSSE3-NEXT: movq (%rdi), %xmm0
+; SSSE3-NEXT: movq {{.*#+}} xmm0 = mem[0],zero
; SSSE3-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; SSSE3-NEXT: psraw $8, %xmm0
; SSSE3-NEXT: retq
@@ -463,20 +463,20 @@ define <4 x i64> @sext_4i1_to_4i64(<4 x i1> %mask) {
define <16 x i16> @sext_16i8_to_16i16(<16 x i8> *%ptr) {
; SSE2-LABEL: sext_16i8_to_16i16:
; SSE2: # BB#0: # %entry
-; SSE2-NEXT: movq (%rdi), %xmm0
+; SSE2-NEXT: movq {{.*#+}} xmm0 = mem[0],zero
; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; SSE2-NEXT: psraw $8, %xmm0
-; SSE2-NEXT: movq 8(%rdi), %xmm1
+; SSE2-NEXT: movq {{.*#+}} xmm1 = mem[0],zero
; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; SSE2-NEXT: psraw $8, %xmm1
; SSE2-NEXT: retq
;
; SSSE3-LABEL: sext_16i8_to_16i16:
; SSSE3: # BB#0: # %entry
-; SSSE3-NEXT: movq (%rdi), %xmm0
+; SSSE3-NEXT: movq {{.*#+}} xmm0 = mem[0],zero
; SSSE3-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; SSSE3-NEXT: psraw $8, %xmm0
-; SSSE3-NEXT: movq 8(%rdi), %xmm1
+; SSSE3-NEXT: movq {{.*#+}} xmm1 = mem[0],zero
; SSSE3-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; SSSE3-NEXT: psraw $8, %xmm1
; SSSE3-NEXT: retq
diff --git a/test/CodeGen/X86/vector-shift-ashr-128.ll b/test/CodeGen/X86/vector-shift-ashr-128.ll
index 4fd2f8b51b8b..61b30154950d 100644
--- a/test/CodeGen/X86/vector-shift-ashr-128.ll
+++ b/test/CodeGen/X86/vector-shift-ashr-128.ll
@@ -10,43 +10,43 @@
define <2 x i64> @var_shift_v2i64(<2 x i64> %a, <2 x i64> %b) {
; SSE2-LABEL: var_shift_v2i64:
; SSE2: # BB#0:
-; SSE2-NEXT: movd %xmm0, %rax
-; SSE2-NEXT: movd %xmm1, %rcx
-; SSE2-NEXT: sarq %cl, %rax
-; SSE2-NEXT: movd %rax, %xmm2
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
-; SSE2-NEXT: movd %xmm0, %rax
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1]
-; SSE2-NEXT: movd %xmm0, %rcx
-; SSE2-NEXT: sarq %cl, %rax
-; SSE2-NEXT: movd %rax, %xmm0
+; SSE2-NEXT: movd %xmm0, %rax
+; SSE2-NEXT: movd %xmm1, %rcx
+; SSE2-NEXT: sarq %cl, %rax
+; SSE2-NEXT: movd %rax, %xmm2
+; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
+; SSE2-NEXT: movd %xmm0, %rax
+; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1]
+; SSE2-NEXT: movd %xmm0, %rcx
+; SSE2-NEXT: sarq %cl, %rax
+; SSE2-NEXT: movd %rax, %xmm0
; SSE2-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm0[0]
-; SSE2-NEXT: movdqa %xmm2, %xmm0
+; SSE2-NEXT: movdqa %xmm2, %xmm0
; SSE2-NEXT: retq
;
; SSE41-LABEL: var_shift_v2i64:
; SSE41: # BB#0:
-; SSE41-NEXT: pextrq $1, %xmm0, %rax
-; SSE41-NEXT: pextrq $1, %xmm1, %rcx
-; SSE41-NEXT: sarq %cl, %rax
-; SSE41-NEXT: movd %rax, %xmm2
-; SSE41-NEXT: movd %xmm0, %rax
-; SSE41-NEXT: movd %xmm1, %rcx
-; SSE41-NEXT: sarq %cl, %rax
-; SSE41-NEXT: movd %rax, %xmm0
+; SSE41-NEXT: pextrq $1, %xmm0, %rax
+; SSE41-NEXT: pextrq $1, %xmm1, %rcx
+; SSE41-NEXT: sarq %cl, %rax
+; SSE41-NEXT: movd %rax, %xmm2
+; SSE41-NEXT: movd %xmm0, %rax
+; SSE41-NEXT: movd %xmm1, %rcx
+; SSE41-NEXT: sarq %cl, %rax
+; SSE41-NEXT: movd %rax, %xmm0
; SSE41-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0]
; SSE41-NEXT: retq
;
; AVX-LABEL: var_shift_v2i64:
; AVX: # BB#0:
-; AVX-NEXT: vpextrq $1, %xmm0, %rax
-; AVX-NEXT: vpextrq $1, %xmm1, %rcx
-; AVX-NEXT: sarq %cl, %rax
-; AVX-NEXT: vmovq %rax, %xmm2
-; AVX-NEXT: vmovq %xmm0, %rax
-; AVX-NEXT: vmovq %xmm1, %rcx
-; AVX-NEXT: sarq %cl, %rax
-; AVX-NEXT: vmovq %rax, %xmm0
+; AVX-NEXT: vpextrq $1, %xmm0, %rax
+; AVX-NEXT: vpextrq $1, %xmm1, %rcx
+; AVX-NEXT: sarq %cl, %rax
+; AVX-NEXT: vmovq %rax, %xmm2
+; AVX-NEXT: vmovq %xmm0, %rax
+; AVX-NEXT: vmovq %xmm1, %rcx
+; AVX-NEXT: sarq %cl, %rax
+; AVX-NEXT: vmovq %rax, %xmm0
; AVX-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0]
; AVX-NEXT: retq
%shift = ashr <2 x i64> %a, %b
@@ -56,73 +56,63 @@ define <2 x i64> @var_shift_v2i64(<2 x i64> %a, <2 x i64> %b) {
define <4 x i32> @var_shift_v4i32(<4 x i32> %a, <4 x i32> %b) {
; SSE2-LABEL: var_shift_v4i32:
; SSE2: # BB#0:
-; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[3,1,2,3]
-; SSE2-NEXT: movd %xmm2, %eax
-; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm1[3,1,2,3]
-; SSE2-NEXT: movd %xmm2, %ecx
-; SSE2-NEXT: sarl %cl, %eax
-; SSE2-NEXT: movd %eax, %xmm2
-; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,1,2,3]
-; SSE2-NEXT: movd %xmm3, %eax
-; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm1[1,1,2,3]
-; SSE2-NEXT: movd %xmm3, %ecx
-; SSE2-NEXT: sarl %cl, %eax
-; SSE2-NEXT: movd %eax, %xmm3
-; SSE2-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1]
-; SSE2-NEXT: movd %xmm0, %eax
-; SSE2-NEXT: movd %xmm1, %ecx
-; SSE2-NEXT: sarl %cl, %eax
-; SSE2-NEXT: movd %eax, %xmm2
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
-; SSE2-NEXT: movd %xmm0, %eax
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1]
-; SSE2-NEXT: movd %xmm0, %ecx
-; SSE2-NEXT: sarl %cl, %eax
-; SSE2-NEXT: movd %eax, %xmm0
-; SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1]
-; SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]
-; SSE2-NEXT: movdqa %xmm2, %xmm0
+; SSE2-NEXT: movdqa %xmm1, %xmm2
+; SSE2-NEXT: psrldq {{.*#+}} xmm2 = xmm2[12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; SSE2-NEXT: movdqa %xmm0, %xmm3
+; SSE2-NEXT: psrad %xmm2, %xmm3
+; SSE2-NEXT: movdqa %xmm1, %xmm2
+; SSE2-NEXT: psrlq $32, %xmm2
+; SSE2-NEXT: movdqa %xmm0, %xmm4
+; SSE2-NEXT: psrad %xmm2, %xmm4
+; SSE2-NEXT: movsd {{.*#+}} xmm3 = xmm4[0],xmm3[1]
+; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm3[1,3,2,3]
+; SSE2-NEXT: pxor %xmm3, %xmm3
+; SSE2-NEXT: movdqa %xmm1, %xmm4
+; SSE2-NEXT: punpckhdq {{.*#+}} xmm4 = xmm4[2],xmm3[2],xmm4[3],xmm3[3]
+; SSE2-NEXT: movdqa %xmm0, %xmm5
+; SSE2-NEXT: psrad %xmm4, %xmm5
+; SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1]
+; SSE2-NEXT: psrad %xmm1, %xmm0
+; SSE2-NEXT: movsd {{.*#+}} xmm5 = xmm0[0],xmm5[1]
+; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm5[0,2,2,3]
+; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
; SSE2-NEXT: retq
;
; SSE41-LABEL: var_shift_v4i32:
; SSE41: # BB#0:
-; SSE41-NEXT: pextrd $1, %xmm0, %eax
-; SSE41-NEXT: pextrd $1, %xmm1, %ecx
-; SSE41-NEXT: sarl %cl, %eax
-; SSE41-NEXT: movd %xmm0, %edx
-; SSE41-NEXT: movd %xmm1, %ecx
-; SSE41-NEXT: sarl %cl, %edx
-; SSE41-NEXT: movd %edx, %xmm2
-; SSE41-NEXT: pinsrd $1, %eax, %xmm2
-; SSE41-NEXT: pextrd $2, %xmm0, %eax
-; SSE41-NEXT: pextrd $2, %xmm1, %ecx
-; SSE41-NEXT: sarl %cl, %eax
-; SSE41-NEXT: pinsrd $2, %eax, %xmm2
-; SSE41-NEXT: pextrd $3, %xmm0, %eax
-; SSE41-NEXT: pextrd $3, %xmm1, %ecx
-; SSE41-NEXT: sarl %cl, %eax
-; SSE41-NEXT: pinsrd $3, %eax, %xmm2
-; SSE41-NEXT: movdqa %xmm2, %xmm0
+; SSE41-NEXT: movdqa %xmm1, %xmm2
+; SSE41-NEXT: psrldq {{.*#+}} xmm2 = xmm2[12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; SSE41-NEXT: movdqa %xmm0, %xmm3
+; SSE41-NEXT: psrad %xmm2, %xmm3
+; SSE41-NEXT: movdqa %xmm1, %xmm2
+; SSE41-NEXT: psrlq $32, %xmm2
+; SSE41-NEXT: movdqa %xmm0, %xmm4
+; SSE41-NEXT: psrad %xmm2, %xmm4
+; SSE41-NEXT: pblendw {{.*#+}} xmm4 = xmm4[0,1,2,3],xmm3[4,5,6,7]
+; SSE41-NEXT: pxor %xmm2, %xmm2
+; SSE41-NEXT: pmovzxdq {{.*#+}} xmm3 = xmm1[0],zero,xmm1[1],zero
+; SSE41-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm2[2],xmm1[3],xmm2[3]
+; SSE41-NEXT: movdqa %xmm0, %xmm2
+; SSE41-NEXT: psrad %xmm1, %xmm2
+; SSE41-NEXT: psrad %xmm3, %xmm0
+; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm2[4,5,6,7]
+; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm4[2,3],xmm0[4,5],xmm4[6,7]
; SSE41-NEXT: retq
;
; AVX1-LABEL: var_shift_v4i32:
; AVX1: # BB#0:
-; AVX1-NEXT: vpextrd $1, %xmm0, %eax
-; AVX1-NEXT: vpextrd $1, %xmm1, %ecx
-; AVX1-NEXT: sarl %cl, %eax
-; AVX1-NEXT: vmovd %xmm0, %edx
-; AVX1-NEXT: vmovd %xmm1, %ecx
-; AVX1-NEXT: sarl %cl, %edx
-; AVX1-NEXT: vmovd %edx, %xmm2
-; AVX1-NEXT: vpinsrd $1, %eax, %xmm2, %xmm2
-; AVX1-NEXT: vpextrd $2, %xmm0, %eax
-; AVX1-NEXT: vpextrd $2, %xmm1, %ecx
-; AVX1-NEXT: sarl %cl, %eax
-; AVX1-NEXT: vpinsrd $2, %eax, %xmm2, %xmm2
-; AVX1-NEXT: vpextrd $3, %xmm0, %eax
-; AVX1-NEXT: vpextrd $3, %xmm1, %ecx
-; AVX1-NEXT: sarl %cl, %eax
-; AVX1-NEXT: vpinsrd $3, %eax, %xmm2, %xmm0
+; AVX1-NEXT: vpsrldq {{.*#+}} xmm2 = xmm1[12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; AVX1-NEXT: vpsrad %xmm2, %xmm0, %xmm2
+; AVX1-NEXT: vpsrlq $32, %xmm1, %xmm3
+; AVX1-NEXT: vpsrad %xmm3, %xmm0, %xmm3
+; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0,1,2,3],xmm2[4,5,6,7]
+; AVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3
+; AVX1-NEXT: vpunpckhdq {{.*#+}} xmm3 = xmm1[2],xmm3[2],xmm1[3],xmm3[3]
+; AVX1-NEXT: vpsrad %xmm3, %xmm0, %xmm3
+; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero
+; AVX1-NEXT: vpsrad %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm3[4,5,6,7]
+; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7]
; AVX1-NEXT: retq
;
; AVX2-LABEL: var_shift_v4i32:
@@ -136,84 +126,84 @@ define <4 x i32> @var_shift_v4i32(<4 x i32> %a, <4 x i32> %b) {
define <8 x i16> @var_shift_v8i16(<8 x i16> %a, <8 x i16> %b) {
; SSE2-LABEL: var_shift_v8i16:
; SSE2: # BB#0:
-; SSE2-NEXT: psllw $12, %xmm1
+; SSE2-NEXT: psllw $12, %xmm1
; SSE2-NEXT: movdqa %xmm1, %xmm2
-; SSE2-NEXT: psraw $15, %xmm2
+; SSE2-NEXT: psraw $15, %xmm2
; SSE2-NEXT: movdqa %xmm2, %xmm3
-; SSE2-NEXT: pandn %xmm0, %xmm3
-; SSE2-NEXT: psraw $8, %xmm0
-; SSE2-NEXT: pand %xmm2, %xmm0
-; SSE2-NEXT: por %xmm3, %xmm0
-; SSE2-NEXT: paddw %xmm1, %xmm1
+; SSE2-NEXT: pandn %xmm0, %xmm3
+; SSE2-NEXT: psraw $8, %xmm0
+; SSE2-NEXT: pand %xmm2, %xmm0
+; SSE2-NEXT: por %xmm3, %xmm0
+; SSE2-NEXT: paddw %xmm1, %xmm1
; SSE2-NEXT: movdqa %xmm1, %xmm2
-; SSE2-NEXT: psraw $15, %xmm2
+; SSE2-NEXT: psraw $15, %xmm2
; SSE2-NEXT: movdqa %xmm2, %xmm3
-; SSE2-NEXT: pandn %xmm0, %xmm3
-; SSE2-NEXT: psraw $4, %xmm0
-; SSE2-NEXT: pand %xmm2, %xmm0
-; SSE2-NEXT: por %xmm3, %xmm0
-; SSE2-NEXT: paddw %xmm1, %xmm1
+; SSE2-NEXT: pandn %xmm0, %xmm3
+; SSE2-NEXT: psraw $4, %xmm0
+; SSE2-NEXT: pand %xmm2, %xmm0
+; SSE2-NEXT: por %xmm3, %xmm0
+; SSE2-NEXT: paddw %xmm1, %xmm1
; SSE2-NEXT: movdqa %xmm1, %xmm2
-; SSE2-NEXT: psraw $15, %xmm2
+; SSE2-NEXT: psraw $15, %xmm2
; SSE2-NEXT: movdqa %xmm2, %xmm3
-; SSE2-NEXT: pandn %xmm0, %xmm3
-; SSE2-NEXT: psraw $2, %xmm0
-; SSE2-NEXT: pand %xmm2, %xmm0
-; SSE2-NEXT: por %xmm3, %xmm0
-; SSE2-NEXT: paddw %xmm1, %xmm1
-; SSE2-NEXT: psraw $15, %xmm1
+; SSE2-NEXT: pandn %xmm0, %xmm3
+; SSE2-NEXT: psraw $2, %xmm0
+; SSE2-NEXT: pand %xmm2, %xmm0
+; SSE2-NEXT: por %xmm3, %xmm0
+; SSE2-NEXT: paddw %xmm1, %xmm1
+; SSE2-NEXT: psraw $15, %xmm1
; SSE2-NEXT: movdqa %xmm1, %xmm2
-; SSE2-NEXT: pandn %xmm0, %xmm2
-; SSE2-NEXT: psraw $1, %xmm0
-; SSE2-NEXT: pand %xmm1, %xmm0
-; SSE2-NEXT: por %xmm2, %xmm0
+; SSE2-NEXT: pandn %xmm0, %xmm2
+; SSE2-NEXT: psraw $1, %xmm0
+; SSE2-NEXT: pand %xmm1, %xmm0
+; SSE2-NEXT: por %xmm2, %xmm0
; SSE2-NEXT: retq
;
; SSE41-LABEL: var_shift_v8i16:
; SSE41: # BB#0:
-; SSE41-NEXT: movdqa %xmm0, %xmm2
-; SSE41-NEXT: movdqa %xmm1, %xmm0
-; SSE41-NEXT: psllw $12, %xmm0
-; SSE41-NEXT: psllw $4, %xmm1
-; SSE41-NEXT: por %xmm0, %xmm1
-; SSE41-NEXT: movdqa %xmm1, %xmm3
-; SSE41-NEXT: paddw %xmm3, %xmm3
-; SSE41-NEXT: movdqa %xmm2, %xmm4
-; SSE41-NEXT: psraw $8, %xmm4
-; SSE41-NEXT: movdqa %xmm1, %xmm0
+; SSE41-NEXT: movdqa %xmm0, %xmm2
+; SSE41-NEXT: movdqa %xmm1, %xmm0
+; SSE41-NEXT: psllw $12, %xmm0
+; SSE41-NEXT: psllw $4, %xmm1
+; SSE41-NEXT: por %xmm0, %xmm1
+; SSE41-NEXT: movdqa %xmm1, %xmm3
+; SSE41-NEXT: paddw %xmm3, %xmm3
+; SSE41-NEXT: movdqa %xmm2, %xmm4
+; SSE41-NEXT: psraw $8, %xmm4
+; SSE41-NEXT: movdqa %xmm1, %xmm0
; SSE41-NEXT: pblendvb %xmm4, %xmm2
-; SSE41-NEXT: movdqa %xmm2, %xmm1
-; SSE41-NEXT: psraw $4, %xmm1
-; SSE41-NEXT: movdqa %xmm3, %xmm0
+; SSE41-NEXT: movdqa %xmm2, %xmm1
+; SSE41-NEXT: psraw $4, %xmm1
+; SSE41-NEXT: movdqa %xmm3, %xmm0
; SSE41-NEXT: pblendvb %xmm1, %xmm2
-; SSE41-NEXT: movdqa %xmm2, %xmm1
-; SSE41-NEXT: psraw $2, %xmm1
-; SSE41-NEXT: paddw %xmm3, %xmm3
-; SSE41-NEXT: movdqa %xmm3, %xmm0
+; SSE41-NEXT: movdqa %xmm2, %xmm1
+; SSE41-NEXT: psraw $2, %xmm1
+; SSE41-NEXT: paddw %xmm3, %xmm3
+; SSE41-NEXT: movdqa %xmm3, %xmm0
; SSE41-NEXT: pblendvb %xmm1, %xmm2
-; SSE41-NEXT: movdqa %xmm2, %xmm1
-; SSE41-NEXT: psraw $1, %xmm1
-; SSE41-NEXT: paddw %xmm3, %xmm3
-; SSE41-NEXT: movdqa %xmm3, %xmm0
+; SSE41-NEXT: movdqa %xmm2, %xmm1
+; SSE41-NEXT: psraw $1, %xmm1
+; SSE41-NEXT: paddw %xmm3, %xmm3
+; SSE41-NEXT: movdqa %xmm3, %xmm0
; SSE41-NEXT: pblendvb %xmm1, %xmm2
-; SSE41-NEXT: movdqa %xmm2, %xmm0
+; SSE41-NEXT: movdqa %xmm2, %xmm0
; SSE41-NEXT: retq
;
; AVX1-LABEL: var_shift_v8i16:
; AVX1: # BB#0:
-; AVX1-NEXT: vpsllw $12, %xmm1, %xmm2
-; AVX1-NEXT: vpsllw $4, %xmm1, %xmm1
-; AVX1-NEXT: vpor %xmm2, %xmm1, %xmm1
-; AVX1-NEXT: vpaddw %xmm1, %xmm1, %xmm2
-; AVX1-NEXT: vpsraw $8, %xmm0, %xmm3
+; AVX1-NEXT: vpsllw $12, %xmm1, %xmm2
+; AVX1-NEXT: vpsllw $4, %xmm1, %xmm1
+; AVX1-NEXT: vpor %xmm2, %xmm1, %xmm1
+; AVX1-NEXT: vpaddw %xmm1, %xmm1, %xmm2
+; AVX1-NEXT: vpsraw $8, %xmm0, %xmm3
; AVX1-NEXT: vpblendvb %xmm1, %xmm3, %xmm0, %xmm0
-; AVX1-NEXT: vpsraw $4, %xmm0, %xmm1
+; AVX1-NEXT: vpsraw $4, %xmm0, %xmm1
; AVX1-NEXT: vpblendvb %xmm2, %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vpsraw $2, %xmm0, %xmm1
-; AVX1-NEXT: vpaddw %xmm2, %xmm2, %xmm2
+; AVX1-NEXT: vpsraw $2, %xmm0, %xmm1
+; AVX1-NEXT: vpaddw %xmm2, %xmm2, %xmm2
; AVX1-NEXT: vpblendvb %xmm2, %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vpsraw $1, %xmm0, %xmm1
-; AVX1-NEXT: vpaddw %xmm2, %xmm2, %xmm2
+; AVX1-NEXT: vpsraw $1, %xmm0, %xmm1
+; AVX1-NEXT: vpaddw %xmm2, %xmm2, %xmm2
; AVX1-NEXT: vpblendvb %xmm2, %xmm1, %xmm0, %xmm0
; AVX1-NEXT: retq
;
@@ -221,9 +211,9 @@ define <8 x i16> @var_shift_v8i16(<8 x i16> %a, <8 x i16> %b) {
; AVX2: # BB#0:
; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
; AVX2-NEXT: vpmovsxwd %xmm0, %ymm0
-; AVX2-NEXT: vpsravd %ymm1, %ymm0, %ymm0
-; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13],zero,zero,zero,zero,zero,zero,zero,zero,ymm0[16,17,20,21,24,25,28,29],zero,zero,zero,zero,zero,zero,zero,zero
-; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
+; AVX2-NEXT: vpsravd %ymm1, %ymm0, %ymm0
+; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13],zero,zero,zero,zero,zero,zero,zero,zero,ymm0[16,17,20,21,24,25,28,29],zero,zero,zero,zero,zero,zero,zero,zero
+; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
%shift = ashr <8 x i16> %a, %b
@@ -234,123 +224,123 @@ define <16 x i8> @var_shift_v16i8(<16 x i8> %a, <16 x i8> %b) {
; SSE2-LABEL: var_shift_v16i8:
; SSE2: # BB#0:
; SSE2-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm0[8],xmm2[9],xmm0[9],xmm2[10],xmm0[10],xmm2[11],xmm0[11],xmm2[12],xmm0[12],xmm2[13],xmm0[13],xmm2[14],xmm0[14],xmm2[15],xmm0[15]
-; SSE2-NEXT: psllw $5, %xmm1
+; SSE2-NEXT: psllw $5, %xmm1
; SSE2-NEXT: punpckhbw {{.*#+}} xmm4 = xmm4[8],xmm1[8],xmm4[9],xmm1[9],xmm4[10],xmm1[10],xmm4[11],xmm1[11],xmm4[12],xmm1[12],xmm4[13],xmm1[13],xmm4[14],xmm1[14],xmm4[15],xmm1[15]
-; SSE2-NEXT: pxor %xmm3, %xmm3
-; SSE2-NEXT: pxor %xmm5, %xmm5
-; SSE2-NEXT: pcmpgtw %xmm4, %xmm5
-; SSE2-NEXT: movdqa %xmm5, %xmm6
-; SSE2-NEXT: pandn %xmm2, %xmm6
-; SSE2-NEXT: psraw $4, %xmm2
-; SSE2-NEXT: pand %xmm5, %xmm2
-; SSE2-NEXT: por %xmm6, %xmm2
-; SSE2-NEXT: paddw %xmm4, %xmm4
-; SSE2-NEXT: pxor %xmm5, %xmm5
-; SSE2-NEXT: pcmpgtw %xmm4, %xmm5
-; SSE2-NEXT: movdqa %xmm5, %xmm6
-; SSE2-NEXT: pandn %xmm2, %xmm6
-; SSE2-NEXT: psraw $2, %xmm2
-; SSE2-NEXT: pand %xmm5, %xmm2
-; SSE2-NEXT: por %xmm6, %xmm2
-; SSE2-NEXT: paddw %xmm4, %xmm4
-; SSE2-NEXT: pxor %xmm5, %xmm5
-; SSE2-NEXT: pcmpgtw %xmm4, %xmm5
-; SSE2-NEXT: movdqa %xmm5, %xmm4
-; SSE2-NEXT: pandn %xmm2, %xmm4
-; SSE2-NEXT: psraw $1, %xmm2
-; SSE2-NEXT: pand %xmm5, %xmm2
-; SSE2-NEXT: por %xmm4, %xmm2
-; SSE2-NEXT: psrlw $8, %xmm2
+; SSE2-NEXT: pxor %xmm3, %xmm3
+; SSE2-NEXT: pxor %xmm5, %xmm5
+; SSE2-NEXT: pcmpgtw %xmm4, %xmm5
+; SSE2-NEXT: movdqa %xmm5, %xmm6
+; SSE2-NEXT: pandn %xmm2, %xmm6
+; SSE2-NEXT: psraw $4, %xmm2
+; SSE2-NEXT: pand %xmm5, %xmm2
+; SSE2-NEXT: por %xmm6, %xmm2
+; SSE2-NEXT: paddw %xmm4, %xmm4
+; SSE2-NEXT: pxor %xmm5, %xmm5
+; SSE2-NEXT: pcmpgtw %xmm4, %xmm5
+; SSE2-NEXT: movdqa %xmm5, %xmm6
+; SSE2-NEXT: pandn %xmm2, %xmm6
+; SSE2-NEXT: psraw $2, %xmm2
+; SSE2-NEXT: pand %xmm5, %xmm2
+; SSE2-NEXT: por %xmm6, %xmm2
+; SSE2-NEXT: paddw %xmm4, %xmm4
+; SSE2-NEXT: pxor %xmm5, %xmm5
+; SSE2-NEXT: pcmpgtw %xmm4, %xmm5
+; SSE2-NEXT: movdqa %xmm5, %xmm4
+; SSE2-NEXT: pandn %xmm2, %xmm4
+; SSE2-NEXT: psraw $1, %xmm2
+; SSE2-NEXT: pand %xmm5, %xmm2
+; SSE2-NEXT: por %xmm4, %xmm2
+; SSE2-NEXT: psrlw $8, %xmm2
; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
-; SSE2-NEXT: pxor %xmm4, %xmm4
-; SSE2-NEXT: pcmpgtw %xmm1, %xmm4
-; SSE2-NEXT: movdqa %xmm4, %xmm5
-; SSE2-NEXT: pandn %xmm0, %xmm5
-; SSE2-NEXT: psraw $4, %xmm0
-; SSE2-NEXT: pand %xmm4, %xmm0
-; SSE2-NEXT: por %xmm5, %xmm0
-; SSE2-NEXT: paddw %xmm1, %xmm1
-; SSE2-NEXT: pxor %xmm4, %xmm4
-; SSE2-NEXT: pcmpgtw %xmm1, %xmm4
-; SSE2-NEXT: movdqa %xmm4, %xmm5
-; SSE2-NEXT: pandn %xmm0, %xmm5
-; SSE2-NEXT: psraw $2, %xmm0
-; SSE2-NEXT: pand %xmm4, %xmm0
-; SSE2-NEXT: por %xmm5, %xmm0
-; SSE2-NEXT: paddw %xmm1, %xmm1
-; SSE2-NEXT: pcmpgtw %xmm1, %xmm3
-; SSE2-NEXT: movdqa %xmm3, %xmm1
-; SSE2-NEXT: pandn %xmm0, %xmm1
-; SSE2-NEXT: psraw $1, %xmm0
-; SSE2-NEXT: pand %xmm3, %xmm0
-; SSE2-NEXT: por %xmm1, %xmm0
-; SSE2-NEXT: psrlw $8, %xmm0
-; SSE2-NEXT: packuswb %xmm2, %xmm0
+; SSE2-NEXT: pxor %xmm4, %xmm4
+; SSE2-NEXT: pcmpgtw %xmm1, %xmm4
+; SSE2-NEXT: movdqa %xmm4, %xmm5
+; SSE2-NEXT: pandn %xmm0, %xmm5
+; SSE2-NEXT: psraw $4, %xmm0
+; SSE2-NEXT: pand %xmm4, %xmm0
+; SSE2-NEXT: por %xmm5, %xmm0
+; SSE2-NEXT: paddw %xmm1, %xmm1
+; SSE2-NEXT: pxor %xmm4, %xmm4
+; SSE2-NEXT: pcmpgtw %xmm1, %xmm4
+; SSE2-NEXT: movdqa %xmm4, %xmm5
+; SSE2-NEXT: pandn %xmm0, %xmm5
+; SSE2-NEXT: psraw $2, %xmm0
+; SSE2-NEXT: pand %xmm4, %xmm0
+; SSE2-NEXT: por %xmm5, %xmm0
+; SSE2-NEXT: paddw %xmm1, %xmm1
+; SSE2-NEXT: pcmpgtw %xmm1, %xmm3
+; SSE2-NEXT: movdqa %xmm3, %xmm1
+; SSE2-NEXT: pandn %xmm0, %xmm1
+; SSE2-NEXT: psraw $1, %xmm0
+; SSE2-NEXT: pand %xmm3, %xmm0
+; SSE2-NEXT: por %xmm1, %xmm0
+; SSE2-NEXT: psrlw $8, %xmm0
+; SSE2-NEXT: packuswb %xmm2, %xmm0
; SSE2-NEXT: retq
;
; SSE41-LABEL: var_shift_v16i8:
; SSE41: # BB#0:
-; SSE41-NEXT: movdqa %xmm0, %xmm2
-; SSE41-NEXT: psllw $5, %xmm1
+; SSE41-NEXT: movdqa %xmm0, %xmm2
+; SSE41-NEXT: psllw $5, %xmm1
; SSE41-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15]
; SSE41-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm2[8],xmm3[9],xmm2[9],xmm3[10],xmm2[10],xmm3[11],xmm2[11],xmm3[12],xmm2[12],xmm3[13],xmm2[13],xmm3[14],xmm2[14],xmm3[15],xmm2[15]
-; SSE41-NEXT: movdqa %xmm3, %xmm4
-; SSE41-NEXT: psraw $4, %xmm4
-; SSE41-NEXT: pblendvb %xmm4, %xmm3
-; SSE41-NEXT: movdqa %xmm3, %xmm4
-; SSE41-NEXT: psraw $2, %xmm4
-; SSE41-NEXT: paddw %xmm0, %xmm0
-; SSE41-NEXT: pblendvb %xmm4, %xmm3
-; SSE41-NEXT: movdqa %xmm3, %xmm4
-; SSE41-NEXT: psraw $1, %xmm4
-; SSE41-NEXT: paddw %xmm0, %xmm0
-; SSE41-NEXT: pblendvb %xmm4, %xmm3
-; SSE41-NEXT: psrlw $8, %xmm3
+; SSE41-NEXT: movdqa %xmm3, %xmm4
+; SSE41-NEXT: psraw $4, %xmm4
+; SSE41-NEXT: pblendvb %xmm4, %xmm3
+; SSE41-NEXT: movdqa %xmm3, %xmm4
+; SSE41-NEXT: psraw $2, %xmm4
+; SSE41-NEXT: paddw %xmm0, %xmm0
+; SSE41-NEXT: pblendvb %xmm4, %xmm3
+; SSE41-NEXT: movdqa %xmm3, %xmm4
+; SSE41-NEXT: psraw $1, %xmm4
+; SSE41-NEXT: paddw %xmm0, %xmm0
+; SSE41-NEXT: pblendvb %xmm4, %xmm3
+; SSE41-NEXT: psrlw $8, %xmm3
; SSE41-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
; SSE41-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3],xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
-; SSE41-NEXT: movdqa %xmm1, %xmm2
-; SSE41-NEXT: psraw $4, %xmm2
-; SSE41-NEXT: pblendvb %xmm2, %xmm1
-; SSE41-NEXT: movdqa %xmm1, %xmm2
-; SSE41-NEXT: psraw $2, %xmm2
-; SSE41-NEXT: paddw %xmm0, %xmm0
-; SSE41-NEXT: pblendvb %xmm2, %xmm1
-; SSE41-NEXT: movdqa %xmm1, %xmm2
-; SSE41-NEXT: psraw $1, %xmm2
-; SSE41-NEXT: paddw %xmm0, %xmm0
-; SSE41-NEXT: pblendvb %xmm2, %xmm1
-; SSE41-NEXT: psrlw $8, %xmm1
-; SSE41-NEXT: packuswb %xmm3, %xmm1
-; SSE41-NEXT: movdqa %xmm1, %xmm0
+; SSE41-NEXT: movdqa %xmm1, %xmm2
+; SSE41-NEXT: psraw $4, %xmm2
+; SSE41-NEXT: pblendvb %xmm2, %xmm1
+; SSE41-NEXT: movdqa %xmm1, %xmm2
+; SSE41-NEXT: psraw $2, %xmm2
+; SSE41-NEXT: paddw %xmm0, %xmm0
+; SSE41-NEXT: pblendvb %xmm2, %xmm1
+; SSE41-NEXT: movdqa %xmm1, %xmm2
+; SSE41-NEXT: psraw $1, %xmm2
+; SSE41-NEXT: paddw %xmm0, %xmm0
+; SSE41-NEXT: pblendvb %xmm2, %xmm1
+; SSE41-NEXT: psrlw $8, %xmm1
+; SSE41-NEXT: packuswb %xmm3, %xmm1
+; SSE41-NEXT: movdqa %xmm1, %xmm0
; SSE41-NEXT: retq
;
; AVX-LABEL: var_shift_v16i8:
; AVX: # BB#0:
-; AVX-NEXT: vpsllw $5, %xmm1, %xmm1
+; AVX-NEXT: vpsllw $5, %xmm1, %xmm1
; AVX-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15]
; AVX-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
-; AVX-NEXT: vpsraw $4, %xmm3, %xmm4
-; AVX-NEXT: vpblendvb %xmm2, %xmm4, %xmm3, %xmm3
-; AVX-NEXT: vpsraw $2, %xmm3, %xmm4
-; AVX-NEXT: vpaddw %xmm2, %xmm2, %xmm2
-; AVX-NEXT: vpblendvb %xmm2, %xmm4, %xmm3, %xmm3
-; AVX-NEXT: vpsraw $1, %xmm3, %xmm4
-; AVX-NEXT: vpaddw %xmm2, %xmm2, %xmm2
-; AVX-NEXT: vpblendvb %xmm2, %xmm4, %xmm3, %xmm2
-; AVX-NEXT: vpsrlw $8, %xmm2, %xmm2
+; AVX-NEXT: vpsraw $4, %xmm3, %xmm4
+; AVX-NEXT: vpblendvb %xmm2, %xmm4, %xmm3, %xmm3
+; AVX-NEXT: vpsraw $2, %xmm3, %xmm4
+; AVX-NEXT: vpaddw %xmm2, %xmm2, %xmm2
+; AVX-NEXT: vpblendvb %xmm2, %xmm4, %xmm3, %xmm3
+; AVX-NEXT: vpsraw $1, %xmm3, %xmm4
+; AVX-NEXT: vpaddw %xmm2, %xmm2, %xmm2
+; AVX-NEXT: vpblendvb %xmm2, %xmm4, %xmm3, %xmm2
+; AVX-NEXT: vpsrlw $8, %xmm2, %xmm2
; AVX-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
; AVX-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
-; AVX-NEXT: vpsraw $4, %xmm0, %xmm3
-; AVX-NEXT: vpblendvb %xmm1, %xmm3, %xmm0, %xmm0
-; AVX-NEXT: vpsraw $2, %xmm0, %xmm3
-; AVX-NEXT: vpaddw %xmm1, %xmm1, %xmm1
-; AVX-NEXT: vpblendvb %xmm1, %xmm3, %xmm0, %xmm0
-; AVX-NEXT: vpsraw $1, %xmm0, %xmm3
-; AVX-NEXT: vpaddw %xmm1, %xmm1, %xmm1
-; AVX-NEXT: vpblendvb %xmm1, %xmm3, %xmm0, %xmm0
-; AVX-NEXT: vpsrlw $8, %xmm0, %xmm0
-; AVX-NEXT: vpackuswb %xmm2, %xmm0, %xmm0
+; AVX-NEXT: vpsraw $4, %xmm0, %xmm3
+; AVX-NEXT: vpblendvb %xmm1, %xmm3, %xmm0, %xmm0
+; AVX-NEXT: vpsraw $2, %xmm0, %xmm3
+; AVX-NEXT: vpaddw %xmm1, %xmm1, %xmm1
+; AVX-NEXT: vpblendvb %xmm1, %xmm3, %xmm0, %xmm0
+; AVX-NEXT: vpsraw $1, %xmm0, %xmm3
+; AVX-NEXT: vpaddw %xmm1, %xmm1, %xmm1
+; AVX-NEXT: vpblendvb %xmm1, %xmm3, %xmm0, %xmm0
+; AVX-NEXT: vpsrlw $8, %xmm0, %xmm0
+; AVX-NEXT: vpackuswb %xmm2, %xmm0, %xmm0
; AVX-NEXT: retq
%shift = ashr <16 x i8> %a, %b
ret <16 x i8> %shift
@@ -363,61 +353,61 @@ define <16 x i8> @var_shift_v16i8(<16 x i8> %a, <16 x i8> %b) {
define <2 x i64> @splatvar_shift_v2i64(<2 x i64> %a, <2 x i64> %b) {
; SSE2-LABEL: splatvar_shift_v2i64:
; SSE2: # BB#0:
-; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm1[0,1,0,1]
-; SSE2-NEXT: movd %xmm0, %rax
-; SSE2-NEXT: movd %xmm2, %rcx
-; SSE2-NEXT: sarq %cl, %rax
-; SSE2-NEXT: movd %rax, %xmm1
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
-; SSE2-NEXT: movd %xmm0, %rax
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,3,0,1]
-; SSE2-NEXT: movd %xmm0, %rcx
-; SSE2-NEXT: sarq %cl, %rax
-; SSE2-NEXT: movd %rax, %xmm0
+; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm1[0,1,0,1]
+; SSE2-NEXT: movd %xmm0, %rax
+; SSE2-NEXT: movd %xmm2, %rcx
+; SSE2-NEXT: sarq %cl, %rax
+; SSE2-NEXT: movd %rax, %xmm1
+; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
+; SSE2-NEXT: movd %xmm0, %rax
+; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,3,0,1]
+; SSE2-NEXT: movd %xmm0, %rcx
+; SSE2-NEXT: sarq %cl, %rax
+; SSE2-NEXT: movd %rax, %xmm0
; SSE2-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0]
-; SSE2-NEXT: movdqa %xmm1, %xmm0
+; SSE2-NEXT: movdqa %xmm1, %xmm0
; SSE2-NEXT: retq
;
; SSE41-LABEL: splatvar_shift_v2i64:
; SSE41: # BB#0:
-; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,0,1]
-; SSE41-NEXT: pextrq $1, %xmm0, %rax
-; SSE41-NEXT: pextrq $1, %xmm1, %rcx
-; SSE41-NEXT: sarq %cl, %rax
-; SSE41-NEXT: movd %rax, %xmm2
-; SSE41-NEXT: movd %xmm0, %rax
-; SSE41-NEXT: movd %xmm1, %rcx
-; SSE41-NEXT: sarq %cl, %rax
-; SSE41-NEXT: movd %rax, %xmm0
+; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,0,1]
+; SSE41-NEXT: pextrq $1, %xmm0, %rax
+; SSE41-NEXT: pextrq $1, %xmm1, %rcx
+; SSE41-NEXT: sarq %cl, %rax
+; SSE41-NEXT: movd %rax, %xmm2
+; SSE41-NEXT: movd %xmm0, %rax
+; SSE41-NEXT: movd %xmm1, %rcx
+; SSE41-NEXT: sarq %cl, %rax
+; SSE41-NEXT: movd %rax, %xmm0
; SSE41-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0]
; SSE41-NEXT: retq
;
; AVX1-LABEL: splatvar_shift_v2i64:
; AVX1: # BB#0:
-; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,1,0,1]
-; AVX1-NEXT: vpextrq $1, %xmm0, %rax
-; AVX1-NEXT: vpextrq $1, %xmm1, %rcx
-; AVX1-NEXT: sarq %cl, %rax
-; AVX1-NEXT: vmovq %rax, %xmm2
-; AVX1-NEXT: vmovq %xmm0, %rax
-; AVX1-NEXT: vmovq %xmm1, %rcx
-; AVX1-NEXT: sarq %cl, %rax
-; AVX1-NEXT: vmovq %rax, %xmm0
+; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,1,0,1]
+; AVX1-NEXT: vpextrq $1, %xmm0, %rax
+; AVX1-NEXT: vpextrq $1, %xmm1, %rcx
+; AVX1-NEXT: sarq %cl, %rax
+; AVX1-NEXT: vmovq %rax, %xmm2
+; AVX1-NEXT: vmovq %xmm0, %rax
+; AVX1-NEXT: vmovq %xmm1, %rcx
+; AVX1-NEXT: sarq %cl, %rax
+; AVX1-NEXT: vmovq %rax, %xmm0
; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0]
; AVX1-NEXT: retq
;
; AVX2-LABEL: splatvar_shift_v2i64:
; AVX2: # BB#0:
; AVX2-NEXT: vpbroadcastq %xmm1, %xmm1
-; AVX2-NEXT: vpextrq $1, %xmm0, %rax
-; AVX2-NEXT: vpextrq $1, %xmm1, %rcx
-; AVX2-NEXT: sarq %cl, %rax
-; AVX2-NEXT: vmovq %rax, %xmm2
-; AVX2-NEXT: vmovq %xmm0, %rax
-; AVX2-NEXT: vmovq %xmm1, %rcx
-; AVX2-NEXT: sarq %cl, %rax
-; AVX2-NEXT: vmovq %rax, %xmm0
-; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0]
+; AVX2-NEXT: vpextrq $1, %xmm0, %rax
+; AVX2-NEXT: vpextrq $1, %xmm1, %rcx
+; AVX2-NEXT: sarq %cl, %rax
+; AVX2-NEXT: vmovq %rax, %xmm2
+; AVX2-NEXT: vmovq %xmm0, %rax
+; AVX2-NEXT: vmovq %xmm1, %rcx
+; AVX2-NEXT: sarq %cl, %rax
+; AVX2-NEXT: vmovq %rax, %xmm0
+; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0]
; AVX2-NEXT: retq
%splat = shufflevector <2 x i64> %b, <2 x i64> undef, <2 x i32> zeroinitializer
%shift = ashr <2 x i64> %a, %splat
@@ -453,10 +443,10 @@ define <4 x i32> @splatvar_shift_v4i32(<4 x i32> %a, <4 x i32> %b) {
define <8 x i16> @splatvar_shift_v8i16(<8 x i16> %a, <8 x i16> %b) {
; SSE2-LABEL: splatvar_shift_v8i16:
; SSE2: # BB#0:
-; SSE2-NEXT: movd %xmm1, %eax
+; SSE2-NEXT: movd %xmm1, %eax
; SSE2-NEXT: movzwl %ax, %eax
-; SSE2-NEXT: movd %eax, %xmm1
-; SSE2-NEXT: psraw %xmm1, %xmm0
+; SSE2-NEXT: movd %eax, %xmm1
+; SSE2-NEXT: psraw %xmm1, %xmm0
; SSE2-NEXT: retq
;
; SSE41-LABEL: splatvar_shift_v8i16:
@@ -481,160 +471,160 @@ define <16 x i8> @splatvar_shift_v16i8(<16 x i8> %a, <16 x i8> %b) {
; SSE2-LABEL: splatvar_shift_v16i8:
; SSE2: # BB#0:
; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
-; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,0,3]
-; SSE2-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,0,0,0,4,5,6,7]
-; SSE2-NEXT: pshufhw {{.*#+}} xmm3 = xmm1[0,1,2,3,4,4,4,4]
+; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,0,3]
+; SSE2-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,0,0,0,4,5,6,7]
+; SSE2-NEXT: pshufhw {{.*#+}} xmm3 = xmm1[0,1,2,3,4,4,4,4]
; SSE2-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm0[8],xmm1[9],xmm0[9],xmm1[10],xmm0[10],xmm1[11],xmm0[11],xmm1[12],xmm0[12],xmm1[13],xmm0[13],xmm1[14],xmm0[14],xmm1[15],xmm0[15]
-; SSE2-NEXT: psllw $5, %xmm3
+; SSE2-NEXT: psllw $5, %xmm3
; SSE2-NEXT: punpckhbw {{.*#+}} xmm4 = xmm4[8],xmm3[8],xmm4[9],xmm3[9],xmm4[10],xmm3[10],xmm4[11],xmm3[11],xmm4[12],xmm3[12],xmm4[13],xmm3[13],xmm4[14],xmm3[14],xmm4[15],xmm3[15]
-; SSE2-NEXT: pxor %xmm2, %xmm2
-; SSE2-NEXT: pxor %xmm5, %xmm5
-; SSE2-NEXT: pcmpgtw %xmm4, %xmm5
-; SSE2-NEXT: movdqa %xmm5, %xmm6
-; SSE2-NEXT: pandn %xmm1, %xmm6
-; SSE2-NEXT: psraw $4, %xmm1
-; SSE2-NEXT: pand %xmm5, %xmm1
-; SSE2-NEXT: por %xmm6, %xmm1
-; SSE2-NEXT: paddw %xmm4, %xmm4
-; SSE2-NEXT: pxor %xmm5, %xmm5
-; SSE2-NEXT: pcmpgtw %xmm4, %xmm5
-; SSE2-NEXT: movdqa %xmm5, %xmm6
-; SSE2-NEXT: pandn %xmm1, %xmm6
-; SSE2-NEXT: psraw $2, %xmm1
-; SSE2-NEXT: pand %xmm5, %xmm1
-; SSE2-NEXT: por %xmm6, %xmm1
-; SSE2-NEXT: paddw %xmm4, %xmm4
-; SSE2-NEXT: pxor %xmm5, %xmm5
-; SSE2-NEXT: pcmpgtw %xmm4, %xmm5
-; SSE2-NEXT: movdqa %xmm5, %xmm4
-; SSE2-NEXT: pandn %xmm1, %xmm4
-; SSE2-NEXT: psraw $1, %xmm1
-; SSE2-NEXT: pand %xmm5, %xmm1
-; SSE2-NEXT: por %xmm4, %xmm1
-; SSE2-NEXT: psrlw $8, %xmm1
-; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
-; SSE2-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
-; SSE2-NEXT: pxor %xmm4, %xmm4
-; SSE2-NEXT: pcmpgtw %xmm3, %xmm4
-; SSE2-NEXT: movdqa %xmm4, %xmm5
-; SSE2-NEXT: pandn %xmm0, %xmm5
-; SSE2-NEXT: psraw $4, %xmm0
-; SSE2-NEXT: pand %xmm4, %xmm0
-; SSE2-NEXT: por %xmm5, %xmm0
-; SSE2-NEXT: paddw %xmm3, %xmm3
-; SSE2-NEXT: pxor %xmm4, %xmm4
-; SSE2-NEXT: pcmpgtw %xmm3, %xmm4
-; SSE2-NEXT: movdqa %xmm4, %xmm5
-; SSE2-NEXT: pandn %xmm0, %xmm5
-; SSE2-NEXT: psraw $2, %xmm0
-; SSE2-NEXT: pand %xmm4, %xmm0
-; SSE2-NEXT: por %xmm5, %xmm0
-; SSE2-NEXT: paddw %xmm3, %xmm3
-; SSE2-NEXT: pcmpgtw %xmm3, %xmm2
-; SSE2-NEXT: movdqa %xmm2, %xmm3
-; SSE2-NEXT: pandn %xmm0, %xmm3
-; SSE2-NEXT: psraw $1, %xmm0
-; SSE2-NEXT: pand %xmm2, %xmm0
-; SSE2-NEXT: por %xmm3, %xmm0
-; SSE2-NEXT: psrlw $8, %xmm0
-; SSE2-NEXT: packuswb %xmm1, %xmm0
+; SSE2-NEXT: pxor %xmm2, %xmm2
+; SSE2-NEXT: pxor %xmm5, %xmm5
+; SSE2-NEXT: pcmpgtw %xmm4, %xmm5
+; SSE2-NEXT: movdqa %xmm5, %xmm6
+; SSE2-NEXT: pandn %xmm1, %xmm6
+; SSE2-NEXT: psraw $4, %xmm1
+; SSE2-NEXT: pand %xmm5, %xmm1
+; SSE2-NEXT: por %xmm6, %xmm1
+; SSE2-NEXT: paddw %xmm4, %xmm4
+; SSE2-NEXT: pxor %xmm5, %xmm5
+; SSE2-NEXT: pcmpgtw %xmm4, %xmm5
+; SSE2-NEXT: movdqa %xmm5, %xmm6
+; SSE2-NEXT: pandn %xmm1, %xmm6
+; SSE2-NEXT: psraw $2, %xmm1
+; SSE2-NEXT: pand %xmm5, %xmm1
+; SSE2-NEXT: por %xmm6, %xmm1
+; SSE2-NEXT: paddw %xmm4, %xmm4
+; SSE2-NEXT: pxor %xmm5, %xmm5
+; SSE2-NEXT: pcmpgtw %xmm4, %xmm5
+; SSE2-NEXT: movdqa %xmm5, %xmm4
+; SSE2-NEXT: pandn %xmm1, %xmm4
+; SSE2-NEXT: psraw $1, %xmm1
+; SSE2-NEXT: pand %xmm5, %xmm1
+; SSE2-NEXT: por %xmm4, %xmm1
+; SSE2-NEXT: psrlw $8, %xmm1
+; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
+; SSE2-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
+; SSE2-NEXT: pxor %xmm4, %xmm4
+; SSE2-NEXT: pcmpgtw %xmm3, %xmm4
+; SSE2-NEXT: movdqa %xmm4, %xmm5
+; SSE2-NEXT: pandn %xmm0, %xmm5
+; SSE2-NEXT: psraw $4, %xmm0
+; SSE2-NEXT: pand %xmm4, %xmm0
+; SSE2-NEXT: por %xmm5, %xmm0
+; SSE2-NEXT: paddw %xmm3, %xmm3
+; SSE2-NEXT: pxor %xmm4, %xmm4
+; SSE2-NEXT: pcmpgtw %xmm3, %xmm4
+; SSE2-NEXT: movdqa %xmm4, %xmm5
+; SSE2-NEXT: pandn %xmm0, %xmm5
+; SSE2-NEXT: psraw $2, %xmm0
+; SSE2-NEXT: pand %xmm4, %xmm0
+; SSE2-NEXT: por %xmm5, %xmm0
+; SSE2-NEXT: paddw %xmm3, %xmm3
+; SSE2-NEXT: pcmpgtw %xmm3, %xmm2
+; SSE2-NEXT: movdqa %xmm2, %xmm3
+; SSE2-NEXT: pandn %xmm0, %xmm3
+; SSE2-NEXT: psraw $1, %xmm0
+; SSE2-NEXT: pand %xmm2, %xmm0
+; SSE2-NEXT: por %xmm3, %xmm0
+; SSE2-NEXT: psrlw $8, %xmm0
+; SSE2-NEXT: packuswb %xmm1, %xmm0
; SSE2-NEXT: retq
;
; SSE41-LABEL: splatvar_shift_v16i8:
; SSE41: # BB#0:
-; SSE41-NEXT: movdqa %xmm0, %xmm2
-; SSE41-NEXT: pxor %xmm0, %xmm0
-; SSE41-NEXT: pshufb %xmm0, %xmm1
-; SSE41-NEXT: psllw $5, %xmm1
-; SSE41-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15]
-; SSE41-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm2[8],xmm3[9],xmm2[9],xmm3[10],xmm2[10],xmm3[11],xmm2[11],xmm3[12],xmm2[12],xmm3[13],xmm2[13],xmm3[14],xmm2[14],xmm3[15],xmm2[15]
-; SSE41-NEXT: movdqa %xmm3, %xmm4
-; SSE41-NEXT: psraw $4, %xmm4
-; SSE41-NEXT: pblendvb %xmm4, %xmm3
-; SSE41-NEXT: movdqa %xmm3, %xmm4
-; SSE41-NEXT: psraw $2, %xmm4
-; SSE41-NEXT: paddw %xmm0, %xmm0
-; SSE41-NEXT: pblendvb %xmm4, %xmm3
-; SSE41-NEXT: movdqa %xmm3, %xmm4
-; SSE41-NEXT: psraw $1, %xmm4
-; SSE41-NEXT: paddw %xmm0, %xmm0
-; SSE41-NEXT: pblendvb %xmm4, %xmm3
-; SSE41-NEXT: psrlw $8, %xmm3
-; SSE41-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
-; SSE41-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3],xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
-; SSE41-NEXT: movdqa %xmm1, %xmm2
-; SSE41-NEXT: psraw $4, %xmm2
-; SSE41-NEXT: pblendvb %xmm2, %xmm1
-; SSE41-NEXT: movdqa %xmm1, %xmm2
-; SSE41-NEXT: psraw $2, %xmm2
-; SSE41-NEXT: paddw %xmm0, %xmm0
-; SSE41-NEXT: pblendvb %xmm2, %xmm1
-; SSE41-NEXT: movdqa %xmm1, %xmm2
-; SSE41-NEXT: psraw $1, %xmm2
-; SSE41-NEXT: paddw %xmm0, %xmm0
-; SSE41-NEXT: pblendvb %xmm2, %xmm1
-; SSE41-NEXT: psrlw $8, %xmm1
-; SSE41-NEXT: packuswb %xmm3, %xmm1
-; SSE41-NEXT: movdqa %xmm1, %xmm0
+; SSE41-NEXT: movdqa %xmm0, %xmm2
+; SSE41-NEXT: pxor %xmm0, %xmm0
+; SSE41-NEXT: pshufb %xmm0, %xmm1
+; SSE41-NEXT: psllw $5, %xmm1
+; SSE41-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15]
+; SSE41-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm2[8],xmm3[9],xmm2[9],xmm3[10],xmm2[10],xmm3[11],xmm2[11],xmm3[12],xmm2[12],xmm3[13],xmm2[13],xmm3[14],xmm2[14],xmm3[15],xmm2[15]
+; SSE41-NEXT: movdqa %xmm3, %xmm4
+; SSE41-NEXT: psraw $4, %xmm4
+; SSE41-NEXT: pblendvb %xmm4, %xmm3
+; SSE41-NEXT: movdqa %xmm3, %xmm4
+; SSE41-NEXT: psraw $2, %xmm4
+; SSE41-NEXT: paddw %xmm0, %xmm0
+; SSE41-NEXT: pblendvb %xmm4, %xmm3
+; SSE41-NEXT: movdqa %xmm3, %xmm4
+; SSE41-NEXT: psraw $1, %xmm4
+; SSE41-NEXT: paddw %xmm0, %xmm0
+; SSE41-NEXT: pblendvb %xmm4, %xmm3
+; SSE41-NEXT: psrlw $8, %xmm3
+; SSE41-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
+; SSE41-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3],xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
+; SSE41-NEXT: movdqa %xmm1, %xmm2
+; SSE41-NEXT: psraw $4, %xmm2
+; SSE41-NEXT: pblendvb %xmm2, %xmm1
+; SSE41-NEXT: movdqa %xmm1, %xmm2
+; SSE41-NEXT: psraw $2, %xmm2
+; SSE41-NEXT: paddw %xmm0, %xmm0
+; SSE41-NEXT: pblendvb %xmm2, %xmm1
+; SSE41-NEXT: movdqa %xmm1, %xmm2
+; SSE41-NEXT: psraw $1, %xmm2
+; SSE41-NEXT: paddw %xmm0, %xmm0
+; SSE41-NEXT: pblendvb %xmm2, %xmm1
+; SSE41-NEXT: psrlw $8, %xmm1
+; SSE41-NEXT: packuswb %xmm3, %xmm1
+; SSE41-NEXT: movdqa %xmm1, %xmm0
; SSE41-NEXT: retq
;
; AVX1-LABEL: splatvar_shift_v16i8:
; AVX1: # BB#0:
-; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1
-; AVX1-NEXT: vpsllw $5, %xmm1, %xmm1
+; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1
+; AVX1-NEXT: vpsllw $5, %xmm1, %xmm1
; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15]
; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
-; AVX1-NEXT: vpsraw $4, %xmm3, %xmm4
-; AVX1-NEXT: vpblendvb %xmm2, %xmm4, %xmm3, %xmm3
-; AVX1-NEXT: vpsraw $2, %xmm3, %xmm4
-; AVX1-NEXT: vpaddw %xmm2, %xmm2, %xmm2
-; AVX1-NEXT: vpblendvb %xmm2, %xmm4, %xmm3, %xmm3
-; AVX1-NEXT: vpsraw $1, %xmm3, %xmm4
-; AVX1-NEXT: vpaddw %xmm2, %xmm2, %xmm2
-; AVX1-NEXT: vpblendvb %xmm2, %xmm4, %xmm3, %xmm2
-; AVX1-NEXT: vpsrlw $8, %xmm2, %xmm2
+; AVX1-NEXT: vpsraw $4, %xmm3, %xmm4
+; AVX1-NEXT: vpblendvb %xmm2, %xmm4, %xmm3, %xmm3
+; AVX1-NEXT: vpsraw $2, %xmm3, %xmm4
+; AVX1-NEXT: vpaddw %xmm2, %xmm2, %xmm2
+; AVX1-NEXT: vpblendvb %xmm2, %xmm4, %xmm3, %xmm3
+; AVX1-NEXT: vpsraw $1, %xmm3, %xmm4
+; AVX1-NEXT: vpaddw %xmm2, %xmm2, %xmm2
+; AVX1-NEXT: vpblendvb %xmm2, %xmm4, %xmm3, %xmm2
+; AVX1-NEXT: vpsrlw $8, %xmm2, %xmm2
; AVX1-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
; AVX1-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
-; AVX1-NEXT: vpsraw $4, %xmm0, %xmm3
-; AVX1-NEXT: vpblendvb %xmm1, %xmm3, %xmm0, %xmm0
-; AVX1-NEXT: vpsraw $2, %xmm0, %xmm3
-; AVX1-NEXT: vpaddw %xmm1, %xmm1, %xmm1
-; AVX1-NEXT: vpblendvb %xmm1, %xmm3, %xmm0, %xmm0
-; AVX1-NEXT: vpsraw $1, %xmm0, %xmm3
-; AVX1-NEXT: vpaddw %xmm1, %xmm1, %xmm1
-; AVX1-NEXT: vpblendvb %xmm1, %xmm3, %xmm0, %xmm0
-; AVX1-NEXT: vpsrlw $8, %xmm0, %xmm0
-; AVX1-NEXT: vpackuswb %xmm2, %xmm0, %xmm0
+; AVX1-NEXT: vpsraw $4, %xmm0, %xmm3
+; AVX1-NEXT: vpblendvb %xmm1, %xmm3, %xmm0, %xmm0
+; AVX1-NEXT: vpsraw $2, %xmm0, %xmm3
+; AVX1-NEXT: vpaddw %xmm1, %xmm1, %xmm1
+; AVX1-NEXT: vpblendvb %xmm1, %xmm3, %xmm0, %xmm0
+; AVX1-NEXT: vpsraw $1, %xmm0, %xmm3
+; AVX1-NEXT: vpaddw %xmm1, %xmm1, %xmm1
+; AVX1-NEXT: vpblendvb %xmm1, %xmm3, %xmm0, %xmm0
+; AVX1-NEXT: vpsrlw $8, %xmm0, %xmm0
+; AVX1-NEXT: vpackuswb %xmm2, %xmm0, %xmm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: splatvar_shift_v16i8:
; AVX2: # BB#0:
; AVX2-NEXT: vpbroadcastb %xmm1, %xmm1
-; AVX2-NEXT: vpsllw $5, %xmm1, %xmm1
-; AVX2-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15]
-; AVX2-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
-; AVX2-NEXT: vpsraw $4, %xmm3, %xmm4
-; AVX2-NEXT: vpblendvb %xmm2, %xmm4, %xmm3, %xmm3
-; AVX2-NEXT: vpsraw $2, %xmm3, %xmm4
-; AVX2-NEXT: vpaddw %xmm2, %xmm2, %xmm2
-; AVX2-NEXT: vpblendvb %xmm2, %xmm4, %xmm3, %xmm3
-; AVX2-NEXT: vpsraw $1, %xmm3, %xmm4
-; AVX2-NEXT: vpaddw %xmm2, %xmm2, %xmm2
-; AVX2-NEXT: vpblendvb %xmm2, %xmm4, %xmm3, %xmm2
-; AVX2-NEXT: vpsrlw $8, %xmm2, %xmm2
-; AVX2-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
-; AVX2-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
-; AVX2-NEXT: vpsraw $4, %xmm0, %xmm3
-; AVX2-NEXT: vpblendvb %xmm1, %xmm3, %xmm0, %xmm0
-; AVX2-NEXT: vpsraw $2, %xmm0, %xmm3
-; AVX2-NEXT: vpaddw %xmm1, %xmm1, %xmm1
-; AVX2-NEXT: vpblendvb %xmm1, %xmm3, %xmm0, %xmm0
-; AVX2-NEXT: vpsraw $1, %xmm0, %xmm3
-; AVX2-NEXT: vpaddw %xmm1, %xmm1, %xmm1
-; AVX2-NEXT: vpblendvb %xmm1, %xmm3, %xmm0, %xmm0
-; AVX2-NEXT: vpsrlw $8, %xmm0, %xmm0
-; AVX2-NEXT: vpackuswb %xmm2, %xmm0, %xmm0
+; AVX2-NEXT: vpsllw $5, %xmm1, %xmm1
+; AVX2-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15]
+; AVX2-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
+; AVX2-NEXT: vpsraw $4, %xmm3, %xmm4
+; AVX2-NEXT: vpblendvb %xmm2, %xmm4, %xmm3, %xmm3
+; AVX2-NEXT: vpsraw $2, %xmm3, %xmm4
+; AVX2-NEXT: vpaddw %xmm2, %xmm2, %xmm2
+; AVX2-NEXT: vpblendvb %xmm2, %xmm4, %xmm3, %xmm3
+; AVX2-NEXT: vpsraw $1, %xmm3, %xmm4
+; AVX2-NEXT: vpaddw %xmm2, %xmm2, %xmm2
+; AVX2-NEXT: vpblendvb %xmm2, %xmm4, %xmm3, %xmm2
+; AVX2-NEXT: vpsrlw $8, %xmm2, %xmm2
+; AVX2-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
+; AVX2-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
+; AVX2-NEXT: vpsraw $4, %xmm0, %xmm3
+; AVX2-NEXT: vpblendvb %xmm1, %xmm3, %xmm0, %xmm0
+; AVX2-NEXT: vpsraw $2, %xmm0, %xmm3
+; AVX2-NEXT: vpaddw %xmm1, %xmm1, %xmm1
+; AVX2-NEXT: vpblendvb %xmm1, %xmm3, %xmm0, %xmm0
+; AVX2-NEXT: vpsraw $1, %xmm0, %xmm3
+; AVX2-NEXT: vpaddw %xmm1, %xmm1, %xmm1
+; AVX2-NEXT: vpblendvb %xmm1, %xmm3, %xmm0, %xmm0
+; AVX2-NEXT: vpsrlw $8, %xmm0, %xmm0
+; AVX2-NEXT: vpackuswb %xmm2, %xmm0, %xmm0
; AVX2-NEXT: retq
%splat = shufflevector <16 x i8> %b, <16 x i8> undef, <16 x i32> zeroinitializer
%shift = ashr <16 x i8> %a, %splat
@@ -648,36 +638,36 @@ define <16 x i8> @splatvar_shift_v16i8(<16 x i8> %a, <16 x i8> %b) {
define <2 x i64> @constant_shift_v2i64(<2 x i64> %a) {
; SSE2-LABEL: constant_shift_v2i64:
; SSE2: # BB#0:
-; SSE2-NEXT: movd %xmm0, %rax
-; SSE2-NEXT: sarq %rax
-; SSE2-NEXT: movd %rax, %xmm1
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
-; SSE2-NEXT: movd %xmm0, %rax
-; SSE2-NEXT: sarq $7, %rax
-; SSE2-NEXT: movd %rax, %xmm0
+; SSE2-NEXT: movd %xmm0, %rax
+; SSE2-NEXT: sarq %rax
+; SSE2-NEXT: movd %rax, %xmm1
+; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
+; SSE2-NEXT: movd %xmm0, %rax
+; SSE2-NEXT: sarq $7, %rax
+; SSE2-NEXT: movd %rax, %xmm0
; SSE2-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0]
-; SSE2-NEXT: movdqa %xmm1, %xmm0
+; SSE2-NEXT: movdqa %xmm1, %xmm0
; SSE2-NEXT: retq
;
; SSE41-LABEL: constant_shift_v2i64:
; SSE41: # BB#0:
-; SSE41-NEXT: pextrq $1, %xmm0, %rax
-; SSE41-NEXT: sarq $7, %rax
-; SSE41-NEXT: movd %rax, %xmm1
-; SSE41-NEXT: movd %xmm0, %rax
-; SSE41-NEXT: sarq %rax
-; SSE41-NEXT: movd %rax, %xmm0
+; SSE41-NEXT: pextrq $1, %xmm0, %rax
+; SSE41-NEXT: sarq $7, %rax
+; SSE41-NEXT: movd %rax, %xmm1
+; SSE41-NEXT: movd %xmm0, %rax
+; SSE41-NEXT: sarq %rax
+; SSE41-NEXT: movd %rax, %xmm0
; SSE41-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; SSE41-NEXT: retq
;
; AVX-LABEL: constant_shift_v2i64:
; AVX: # BB#0:
-; AVX-NEXT: vpextrq $1, %xmm0, %rax
-; AVX-NEXT: sarq $7, %rax
-; AVX-NEXT: vmovq %rax, %xmm1
-; AVX-NEXT: vmovq %xmm0, %rax
-; AVX-NEXT: sarq %rax
-; AVX-NEXT: vmovq %rax, %xmm0
+; AVX-NEXT: vpextrq $1, %xmm0, %rax
+; AVX-NEXT: sarq $7, %rax
+; AVX-NEXT: vmovq %rax, %xmm1
+; AVX-NEXT: vmovq %xmm0, %rax
+; AVX-NEXT: sarq %rax
+; AVX-NEXT: vmovq %rax, %xmm0
; AVX-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; AVX-NEXT: retq
%shift = ashr <2 x i64> %a, <i64 1, i64 7>
@@ -687,58 +677,43 @@ define <2 x i64> @constant_shift_v2i64(<2 x i64> %a) {
define <4 x i32> @constant_shift_v4i32(<4 x i32> %a) {
; SSE2-LABEL: constant_shift_v4i32:
; SSE2: # BB#0:
-; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[3,1,2,3]
-; SSE2-NEXT: movd %xmm1, %eax
-; SSE2-NEXT: sarl $7, %eax
-; SSE2-NEXT: movd %eax, %xmm1
-; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,2,3]
-; SSE2-NEXT: movd %xmm2, %eax
-; SSE2-NEXT: sarl $5, %eax
-; SSE2-NEXT: movd %eax, %xmm2
-; SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
-; SSE2-NEXT: movd %xmm0, %eax
-; SSE2-NEXT: sarl $4, %eax
-; SSE2-NEXT: movd %eax, %xmm1
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
-; SSE2-NEXT: movd %xmm0, %eax
-; SSE2-NEXT: sarl $6, %eax
-; SSE2-NEXT: movd %eax, %xmm0
-; SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
-; SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
-; SSE2-NEXT: movdqa %xmm1, %xmm0
+; SSE2-NEXT: movdqa %xmm0, %xmm1
+; SSE2-NEXT: psrad $7, %xmm1
+; SSE2-NEXT: movdqa %xmm0, %xmm2
+; SSE2-NEXT: psrad $5, %xmm2
+; SSE2-NEXT: movsd {{.*#+}} xmm1 = xmm2[0],xmm1[1]
+; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,3,2,3]
+; SSE2-NEXT: movdqa %xmm0, %xmm2
+; SSE2-NEXT: psrad $6, %xmm2
+; SSE2-NEXT: psrad $4, %xmm0
+; SSE2-NEXT: movsd {{.*#+}} xmm2 = xmm0[0],xmm2[1]
+; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[0,2,2,3]
+; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; SSE2-NEXT: retq
;
; SSE41-LABEL: constant_shift_v4i32:
; SSE41: # BB#0:
-; SSE41-NEXT: pextrd $1, %xmm0, %eax
-; SSE41-NEXT: sarl $5, %eax
-; SSE41-NEXT: movd %xmm0, %ecx
-; SSE41-NEXT: sarl $4, %ecx
-; SSE41-NEXT: movd %ecx, %xmm1
-; SSE41-NEXT: pinsrd $1, %eax, %xmm1
-; SSE41-NEXT: pextrd $2, %xmm0, %eax
-; SSE41-NEXT: sarl $6, %eax
-; SSE41-NEXT: pinsrd $2, %eax, %xmm1
-; SSE41-NEXT: pextrd $3, %xmm0, %eax
-; SSE41-NEXT: sarl $7, %eax
-; SSE41-NEXT: pinsrd $3, %eax, %xmm1
-; SSE41-NEXT: movdqa %xmm1, %xmm0
+; SSE41-NEXT: movdqa %xmm0, %xmm1
+; SSE41-NEXT: psrad $7, %xmm1
+; SSE41-NEXT: movdqa %xmm0, %xmm2
+; SSE41-NEXT: psrad $5, %xmm2
+; SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm1[4,5,6,7]
+; SSE41-NEXT: movdqa %xmm0, %xmm1
+; SSE41-NEXT: psrad $6, %xmm1
+; SSE41-NEXT: psrad $4, %xmm0
+; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4,5,6,7]
+; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7]
; SSE41-NEXT: retq
;
; AVX1-LABEL: constant_shift_v4i32:
; AVX1: # BB#0:
-; AVX1-NEXT: vpextrd $1, %xmm0, %eax
-; AVX1-NEXT: sarl $5, %eax
-; AVX1-NEXT: vmovd %xmm0, %ecx
-; AVX1-NEXT: sarl $4, %ecx
-; AVX1-NEXT: vmovd %ecx, %xmm1
-; AVX1-NEXT: vpinsrd $1, %eax, %xmm1, %xmm1
-; AVX1-NEXT: vpextrd $2, %xmm0, %eax
-; AVX1-NEXT: sarl $6, %eax
-; AVX1-NEXT: vpinsrd $2, %eax, %xmm1, %xmm1
-; AVX1-NEXT: vpextrd $3, %xmm0, %eax
-; AVX1-NEXT: sarl $7, %eax
-; AVX1-NEXT: vpinsrd $3, %eax, %xmm1, %xmm0
+; AVX1-NEXT: vpsrad $7, %xmm0, %xmm1
+; AVX1-NEXT: vpsrad $5, %xmm0, %xmm2
+; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1,2,3],xmm1[4,5,6,7]
+; AVX1-NEXT: vpsrad $6, %xmm0, %xmm2
+; AVX1-NEXT: vpsrad $4, %xmm0, %xmm0
+; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm2[4,5,6,7]
+; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7]
; AVX1-NEXT: retq
;
; AVX2-LABEL: constant_shift_v4i32:
@@ -752,56 +727,56 @@ define <4 x i32> @constant_shift_v4i32(<4 x i32> %a) {
define <8 x i16> @constant_shift_v8i16(<8 x i16> %a) {
; SSE2-LABEL: constant_shift_v8i16:
; SSE2: # BB#0:
-; SSE2-NEXT: movdqa %xmm0, %xmm1
-; SSE2-NEXT: psraw $4, %xmm1
-; SSE2-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1]
-; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm1[0,2,2,3]
-; SSE2-NEXT: psraw $2, %xmm1
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,3,2,3]
+; SSE2-NEXT: movdqa %xmm0, %xmm1
+; SSE2-NEXT: psraw $4, %xmm1
+; SSE2-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1]
+; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm1[0,2,2,3]
+; SSE2-NEXT: psraw $2, %xmm1
+; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,3,2,3]
; SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1]
-; SSE2-NEXT: movdqa {{.*#+}} xmm0 = [65535,0,65535,0,65535,0,65535,0]
-; SSE2-NEXT: movdqa %xmm2, %xmm1
-; SSE2-NEXT: pand %xmm0, %xmm1
-; SSE2-NEXT: psraw $1, %xmm2
-; SSE2-NEXT: pandn %xmm2, %xmm0
-; SSE2-NEXT: por %xmm1, %xmm0
+; SSE2-NEXT: movdqa {{.*#+}} xmm0 = [65535,0,65535,0,65535,0,65535,0]
+; SSE2-NEXT: movdqa %xmm2, %xmm1
+; SSE2-NEXT: pand %xmm0, %xmm1
+; SSE2-NEXT: psraw $1, %xmm2
+; SSE2-NEXT: pandn %xmm2, %xmm0
+; SSE2-NEXT: por %xmm1, %xmm0
; SSE2-NEXT: retq
;
; SSE41-LABEL: constant_shift_v8i16:
; SSE41: # BB#0:
-; SSE41-NEXT: movdqa %xmm0, %xmm1
-; SSE41-NEXT: movdqa %xmm1, %xmm2
-; SSE41-NEXT: psraw $8, %xmm2
-; SSE41-NEXT: movaps {{.*#+}} xmm0 = [0,4112,8224,12336,16448,20560,24672,28784]
+; SSE41-NEXT: movdqa %xmm0, %xmm1
+; SSE41-NEXT: movdqa %xmm1, %xmm2
+; SSE41-NEXT: psraw $8, %xmm2
+; SSE41-NEXT: movaps {{.*#+}} xmm0 = [0,4112,8224,12336,16448,20560,24672,28784]
; SSE41-NEXT: pblendvb %xmm2, %xmm1
-; SSE41-NEXT: movdqa %xmm1, %xmm2
-; SSE41-NEXT: psraw $4, %xmm2
-; SSE41-NEXT: movaps {{.*#+}} xmm0 = [0,8224,16448,24672,32896,41120,49344,57568]
+; SSE41-NEXT: movdqa %xmm1, %xmm2
+; SSE41-NEXT: psraw $4, %xmm2
+; SSE41-NEXT: movaps {{.*#+}} xmm0 = [0,8224,16448,24672,32896,41120,49344,57568]
; SSE41-NEXT: pblendvb %xmm2, %xmm1
-; SSE41-NEXT: movdqa %xmm1, %xmm2
-; SSE41-NEXT: psraw $2, %xmm2
-; SSE41-NEXT: movaps {{.*#+}} xmm0 = [0,16448,32896,49344,256,16704,33152,49600]
+; SSE41-NEXT: movdqa %xmm1, %xmm2
+; SSE41-NEXT: psraw $2, %xmm2
+; SSE41-NEXT: movaps {{.*#+}} xmm0 = [0,16448,32896,49344,256,16704,33152,49600]
; SSE41-NEXT: pblendvb %xmm2, %xmm1
-; SSE41-NEXT: movdqa %xmm1, %xmm2
-; SSE41-NEXT: psraw $1, %xmm2
-; SSE41-NEXT: movaps {{.*#+}} xmm0 = [0,32896,256,33152,512,33408,768,33664]
+; SSE41-NEXT: movdqa %xmm1, %xmm2
+; SSE41-NEXT: psraw $1, %xmm2
+; SSE41-NEXT: movaps {{.*#+}} xmm0 = [0,32896,256,33152,512,33408,768,33664]
; SSE41-NEXT: pblendvb %xmm2, %xmm1
-; SSE41-NEXT: movdqa %xmm1, %xmm0
+; SSE41-NEXT: movdqa %xmm1, %xmm0
; SSE41-NEXT: retq
;
; AVX1-LABEL: constant_shift_v8i16:
; AVX1: # BB#0:
-; AVX1-NEXT: vpsraw $8, %xmm0, %xmm1
-; AVX1-NEXT: vmovdqa {{.*}}(%rip), %xmm2 # xmm2 = [0,4112,8224,12336,16448,20560,24672,28784]
+; AVX1-NEXT: vpsraw $8, %xmm0, %xmm1
+; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [0,4112,8224,12336,16448,20560,24672,28784]
; AVX1-NEXT: vpblendvb %xmm2, %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vpsraw $4, %xmm0, %xmm1
-; AVX1-NEXT: vmovdqa {{.*}}(%rip), %xmm2 # xmm2 = [0,8224,16448,24672,32896,41120,49344,57568]
+; AVX1-NEXT: vpsraw $4, %xmm0, %xmm1
+; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [0,8224,16448,24672,32896,41120,49344,57568]
; AVX1-NEXT: vpblendvb %xmm2, %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vpsraw $2, %xmm0, %xmm1
-; AVX1-NEXT: vmovdqa {{.*}}(%rip), %xmm2 # xmm2 = [0,16448,32896,49344,256,16704,33152,49600]
+; AVX1-NEXT: vpsraw $2, %xmm0, %xmm1
+; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [0,16448,32896,49344,256,16704,33152,49600]
; AVX1-NEXT: vpblendvb %xmm2, %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vpsraw $1, %xmm0, %xmm1
-; AVX1-NEXT: vmovdqa {{.*}}(%rip), %xmm2 # xmm2 = [0,32896,256,33152,512,33408,768,33664]
+; AVX1-NEXT: vpsraw $1, %xmm0, %xmm1
+; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [0,32896,256,33152,512,33408,768,33664]
; AVX1-NEXT: vpblendvb %xmm2, %xmm1, %xmm0, %xmm0
; AVX1-NEXT: retq
;
@@ -809,9 +784,9 @@ define <8 x i16> @constant_shift_v8i16(<8 x i16> %a) {
; AVX2: # BB#0:
; AVX2-NEXT: vpmovsxwd %xmm0, %ymm0
; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero
-; AVX2-NEXT: vpsravd %ymm1, %ymm0, %ymm0
-; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13],zero,zero,zero,zero,zero,zero,zero,zero,ymm0[16,17,20,21,24,25,28,29],zero,zero,zero,zero,zero,zero,zero,zero
-; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
+; AVX2-NEXT: vpsravd %ymm1, %ymm0, %ymm0
+; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13],zero,zero,zero,zero,zero,zero,zero,zero,ymm0[16,17,20,21,24,25,28,29],zero,zero,zero,zero,zero,zero,zero,zero
+; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
%shift = ashr <8 x i16> %a, <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7>
@@ -822,126 +797,126 @@ define <16 x i8> @constant_shift_v16i8(<16 x i8> %a) {
; SSE2-LABEL: constant_shift_v16i8:
; SSE2: # BB#0:
; SSE2-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm0[8],xmm1[9],xmm0[9],xmm1[10],xmm0[10],xmm1[11],xmm0[11],xmm1[12],xmm0[12],xmm1[13],xmm0[13],xmm1[14],xmm0[14],xmm1[15],xmm0[15]
-; SSE2-NEXT: movdqa {{.*#+}} xmm3 = [0,1,2,3,4,5,6,7,7,6,5,4,3,2,1,0]
-; SSE2-NEXT: psllw $5, %xmm3
+; SSE2-NEXT: movdqa {{.*#+}} xmm3 = [0,1,2,3,4,5,6,7,7,6,5,4,3,2,1,0]
+; SSE2-NEXT: psllw $5, %xmm3
; SSE2-NEXT: punpckhbw {{.*#+}} xmm4 = xmm4[8],xmm3[8],xmm4[9],xmm3[9],xmm4[10],xmm3[10],xmm4[11],xmm3[11],xmm4[12],xmm3[12],xmm4[13],xmm3[13],xmm4[14],xmm3[14],xmm4[15],xmm3[15]
-; SSE2-NEXT: pxor %xmm2, %xmm2
-; SSE2-NEXT: pxor %xmm5, %xmm5
-; SSE2-NEXT: pcmpgtw %xmm4, %xmm5
-; SSE2-NEXT: movdqa %xmm5, %xmm6
-; SSE2-NEXT: pandn %xmm1, %xmm6
-; SSE2-NEXT: psraw $4, %xmm1
-; SSE2-NEXT: pand %xmm5, %xmm1
-; SSE2-NEXT: por %xmm6, %xmm1
-; SSE2-NEXT: paddw %xmm4, %xmm4
-; SSE2-NEXT: pxor %xmm5, %xmm5
-; SSE2-NEXT: pcmpgtw %xmm4, %xmm5
-; SSE2-NEXT: movdqa %xmm5, %xmm6
-; SSE2-NEXT: pandn %xmm1, %xmm6
-; SSE2-NEXT: psraw $2, %xmm1
-; SSE2-NEXT: pand %xmm5, %xmm1
-; SSE2-NEXT: por %xmm6, %xmm1
-; SSE2-NEXT: paddw %xmm4, %xmm4
-; SSE2-NEXT: pxor %xmm5, %xmm5
-; SSE2-NEXT: pcmpgtw %xmm4, %xmm5
-; SSE2-NEXT: movdqa %xmm5, %xmm4
-; SSE2-NEXT: pandn %xmm1, %xmm4
-; SSE2-NEXT: psraw $1, %xmm1
-; SSE2-NEXT: pand %xmm5, %xmm1
-; SSE2-NEXT: por %xmm4, %xmm1
-; SSE2-NEXT: psrlw $8, %xmm1
+; SSE2-NEXT: pxor %xmm2, %xmm2
+; SSE2-NEXT: pxor %xmm5, %xmm5
+; SSE2-NEXT: pcmpgtw %xmm4, %xmm5
+; SSE2-NEXT: movdqa %xmm5, %xmm6
+; SSE2-NEXT: pandn %xmm1, %xmm6
+; SSE2-NEXT: psraw $4, %xmm1
+; SSE2-NEXT: pand %xmm5, %xmm1
+; SSE2-NEXT: por %xmm6, %xmm1
+; SSE2-NEXT: paddw %xmm4, %xmm4
+; SSE2-NEXT: pxor %xmm5, %xmm5
+; SSE2-NEXT: pcmpgtw %xmm4, %xmm5
+; SSE2-NEXT: movdqa %xmm5, %xmm6
+; SSE2-NEXT: pandn %xmm1, %xmm6
+; SSE2-NEXT: psraw $2, %xmm1
+; SSE2-NEXT: pand %xmm5, %xmm1
+; SSE2-NEXT: por %xmm6, %xmm1
+; SSE2-NEXT: paddw %xmm4, %xmm4
+; SSE2-NEXT: pxor %xmm5, %xmm5
+; SSE2-NEXT: pcmpgtw %xmm4, %xmm5
+; SSE2-NEXT: movdqa %xmm5, %xmm4
+; SSE2-NEXT: pandn %xmm1, %xmm4
+; SSE2-NEXT: psraw $1, %xmm1
+; SSE2-NEXT: pand %xmm5, %xmm1
+; SSE2-NEXT: por %xmm4, %xmm1
+; SSE2-NEXT: psrlw $8, %xmm1
; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; SSE2-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
-; SSE2-NEXT: pxor %xmm4, %xmm4
-; SSE2-NEXT: pcmpgtw %xmm3, %xmm4
-; SSE2-NEXT: movdqa %xmm4, %xmm5
-; SSE2-NEXT: pandn %xmm0, %xmm5
-; SSE2-NEXT: psraw $4, %xmm0
-; SSE2-NEXT: pand %xmm4, %xmm0
-; SSE2-NEXT: por %xmm5, %xmm0
-; SSE2-NEXT: paddw %xmm3, %xmm3
-; SSE2-NEXT: pxor %xmm4, %xmm4
-; SSE2-NEXT: pcmpgtw %xmm3, %xmm4
-; SSE2-NEXT: movdqa %xmm4, %xmm5
-; SSE2-NEXT: pandn %xmm0, %xmm5
-; SSE2-NEXT: psraw $2, %xmm0
-; SSE2-NEXT: pand %xmm4, %xmm0
-; SSE2-NEXT: por %xmm5, %xmm0
-; SSE2-NEXT: paddw %xmm3, %xmm3
-; SSE2-NEXT: pcmpgtw %xmm3, %xmm2
-; SSE2-NEXT: movdqa %xmm2, %xmm3
-; SSE2-NEXT: pandn %xmm0, %xmm3
-; SSE2-NEXT: psraw $1, %xmm0
-; SSE2-NEXT: pand %xmm2, %xmm0
-; SSE2-NEXT: por %xmm3, %xmm0
-; SSE2-NEXT: psrlw $8, %xmm0
-; SSE2-NEXT: packuswb %xmm1, %xmm0
+; SSE2-NEXT: pxor %xmm4, %xmm4
+; SSE2-NEXT: pcmpgtw %xmm3, %xmm4
+; SSE2-NEXT: movdqa %xmm4, %xmm5
+; SSE2-NEXT: pandn %xmm0, %xmm5
+; SSE2-NEXT: psraw $4, %xmm0
+; SSE2-NEXT: pand %xmm4, %xmm0
+; SSE2-NEXT: por %xmm5, %xmm0
+; SSE2-NEXT: paddw %xmm3, %xmm3
+; SSE2-NEXT: pxor %xmm4, %xmm4
+; SSE2-NEXT: pcmpgtw %xmm3, %xmm4
+; SSE2-NEXT: movdqa %xmm4, %xmm5
+; SSE2-NEXT: pandn %xmm0, %xmm5
+; SSE2-NEXT: psraw $2, %xmm0
+; SSE2-NEXT: pand %xmm4, %xmm0
+; SSE2-NEXT: por %xmm5, %xmm0
+; SSE2-NEXT: paddw %xmm3, %xmm3
+; SSE2-NEXT: pcmpgtw %xmm3, %xmm2
+; SSE2-NEXT: movdqa %xmm2, %xmm3
+; SSE2-NEXT: pandn %xmm0, %xmm3
+; SSE2-NEXT: psraw $1, %xmm0
+; SSE2-NEXT: pand %xmm2, %xmm0
+; SSE2-NEXT: por %xmm3, %xmm0
+; SSE2-NEXT: psrlw $8, %xmm0
+; SSE2-NEXT: packuswb %xmm1, %xmm0
; SSE2-NEXT: retq
;
; SSE41-LABEL: constant_shift_v16i8:
; SSE41: # BB#0:
-; SSE41-NEXT: movdqa %xmm0, %xmm1
-; SSE41-NEXT: movdqa {{.*#+}} xmm3 = [0,1,2,3,4,5,6,7,7,6,5,4,3,2,1,0]
-; SSE41-NEXT: psllw $5, %xmm3
+; SSE41-NEXT: movdqa %xmm0, %xmm1
+; SSE41-NEXT: movdqa {{.*#+}} xmm3 = [0,1,2,3,4,5,6,7,7,6,5,4,3,2,1,0]
+; SSE41-NEXT: psllw $5, %xmm3
; SSE41-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm3[8],xmm0[9],xmm3[9],xmm0[10],xmm3[10],xmm0[11],xmm3[11],xmm0[12],xmm3[12],xmm0[13],xmm3[13],xmm0[14],xmm3[14],xmm0[15],xmm3[15]
; SSE41-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm1[8],xmm2[9],xmm1[9],xmm2[10],xmm1[10],xmm2[11],xmm1[11],xmm2[12],xmm1[12],xmm2[13],xmm1[13],xmm2[14],xmm1[14],xmm2[15],xmm1[15]
-; SSE41-NEXT: movdqa %xmm2, %xmm4
-; SSE41-NEXT: psraw $4, %xmm4
-; SSE41-NEXT: pblendvb %xmm4, %xmm2
-; SSE41-NEXT: movdqa %xmm2, %xmm4
-; SSE41-NEXT: psraw $2, %xmm4
-; SSE41-NEXT: paddw %xmm0, %xmm0
-; SSE41-NEXT: pblendvb %xmm4, %xmm2
-; SSE41-NEXT: movdqa %xmm2, %xmm4
-; SSE41-NEXT: psraw $1, %xmm4
-; SSE41-NEXT: paddw %xmm0, %xmm0
-; SSE41-NEXT: pblendvb %xmm4, %xmm2
-; SSE41-NEXT: psrlw $8, %xmm2
+; SSE41-NEXT: movdqa %xmm2, %xmm4
+; SSE41-NEXT: psraw $4, %xmm4
+; SSE41-NEXT: pblendvb %xmm4, %xmm2
+; SSE41-NEXT: movdqa %xmm2, %xmm4
+; SSE41-NEXT: psraw $2, %xmm4
+; SSE41-NEXT: paddw %xmm0, %xmm0
+; SSE41-NEXT: pblendvb %xmm4, %xmm2
+; SSE41-NEXT: movdqa %xmm2, %xmm4
+; SSE41-NEXT: psraw $1, %xmm4
+; SSE41-NEXT: paddw %xmm0, %xmm0
+; SSE41-NEXT: pblendvb %xmm4, %xmm2
+; SSE41-NEXT: psrlw $8, %xmm2
; SSE41-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3],xmm0[4],xmm3[4],xmm0[5],xmm3[5],xmm0[6],xmm3[6],xmm0[7],xmm3[7]
; SSE41-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
-; SSE41-NEXT: movdqa %xmm1, %xmm3
-; SSE41-NEXT: psraw $4, %xmm3
-; SSE41-NEXT: pblendvb %xmm3, %xmm1
-; SSE41-NEXT: movdqa %xmm1, %xmm3
-; SSE41-NEXT: psraw $2, %xmm3
-; SSE41-NEXT: paddw %xmm0, %xmm0
-; SSE41-NEXT: pblendvb %xmm3, %xmm1
-; SSE41-NEXT: movdqa %xmm1, %xmm3
-; SSE41-NEXT: psraw $1, %xmm3
-; SSE41-NEXT: paddw %xmm0, %xmm0
-; SSE41-NEXT: pblendvb %xmm3, %xmm1
-; SSE41-NEXT: psrlw $8, %xmm1
-; SSE41-NEXT: packuswb %xmm2, %xmm1
-; SSE41-NEXT: movdqa %xmm1, %xmm0
+; SSE41-NEXT: movdqa %xmm1, %xmm3
+; SSE41-NEXT: psraw $4, %xmm3
+; SSE41-NEXT: pblendvb %xmm3, %xmm1
+; SSE41-NEXT: movdqa %xmm1, %xmm3
+; SSE41-NEXT: psraw $2, %xmm3
+; SSE41-NEXT: paddw %xmm0, %xmm0
+; SSE41-NEXT: pblendvb %xmm3, %xmm1
+; SSE41-NEXT: movdqa %xmm1, %xmm3
+; SSE41-NEXT: psraw $1, %xmm3
+; SSE41-NEXT: paddw %xmm0, %xmm0
+; SSE41-NEXT: pblendvb %xmm3, %xmm1
+; SSE41-NEXT: psrlw $8, %xmm1
+; SSE41-NEXT: packuswb %xmm2, %xmm1
+; SSE41-NEXT: movdqa %xmm1, %xmm0
; SSE41-NEXT: retq
;
; AVX-LABEL: constant_shift_v16i8:
; AVX: # BB#0:
-; AVX-NEXT: vmovdqa {{.*#+}} xmm1 = [0,1,2,3,4,5,6,7,7,6,5,4,3,2,1,0]
-; AVX-NEXT: vpsllw $5, %xmm1, %xmm1
+; AVX-NEXT: vmovdqa {{.*#+}} xmm1 = [0,1,2,3,4,5,6,7,7,6,5,4,3,2,1,0]
+; AVX-NEXT: vpsllw $5, %xmm1, %xmm1
; AVX-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15]
; AVX-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
-; AVX-NEXT: vpsraw $4, %xmm3, %xmm4
-; AVX-NEXT: vpblendvb %xmm2, %xmm4, %xmm3, %xmm3
-; AVX-NEXT: vpsraw $2, %xmm3, %xmm4
-; AVX-NEXT: vpaddw %xmm2, %xmm2, %xmm2
-; AVX-NEXT: vpblendvb %xmm2, %xmm4, %xmm3, %xmm3
-; AVX-NEXT: vpsraw $1, %xmm3, %xmm4
-; AVX-NEXT: vpaddw %xmm2, %xmm2, %xmm2
-; AVX-NEXT: vpblendvb %xmm2, %xmm4, %xmm3, %xmm2
-; AVX-NEXT: vpsrlw $8, %xmm2, %xmm2
+; AVX-NEXT: vpsraw $4, %xmm3, %xmm4
+; AVX-NEXT: vpblendvb %xmm2, %xmm4, %xmm3, %xmm3
+; AVX-NEXT: vpsraw $2, %xmm3, %xmm4
+; AVX-NEXT: vpaddw %xmm2, %xmm2, %xmm2
+; AVX-NEXT: vpblendvb %xmm2, %xmm4, %xmm3, %xmm3
+; AVX-NEXT: vpsraw $1, %xmm3, %xmm4
+; AVX-NEXT: vpaddw %xmm2, %xmm2, %xmm2
+; AVX-NEXT: vpblendvb %xmm2, %xmm4, %xmm3, %xmm2
+; AVX-NEXT: vpsrlw $8, %xmm2, %xmm2
; AVX-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
; AVX-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
-; AVX-NEXT: vpsraw $4, %xmm0, %xmm3
-; AVX-NEXT: vpblendvb %xmm1, %xmm3, %xmm0, %xmm0
-; AVX-NEXT: vpsraw $2, %xmm0, %xmm3
-; AVX-NEXT: vpaddw %xmm1, %xmm1, %xmm1
-; AVX-NEXT: vpblendvb %xmm1, %xmm3, %xmm0, %xmm0
-; AVX-NEXT: vpsraw $1, %xmm0, %xmm3
-; AVX-NEXT: vpaddw %xmm1, %xmm1, %xmm1
-; AVX-NEXT: vpblendvb %xmm1, %xmm3, %xmm0, %xmm0
-; AVX-NEXT: vpsrlw $8, %xmm0, %xmm0
-; AVX-NEXT: vpackuswb %xmm2, %xmm0, %xmm0
+; AVX-NEXT: vpsraw $4, %xmm0, %xmm3
+; AVX-NEXT: vpblendvb %xmm1, %xmm3, %xmm0, %xmm0
+; AVX-NEXT: vpsraw $2, %xmm0, %xmm3
+; AVX-NEXT: vpaddw %xmm1, %xmm1, %xmm1
+; AVX-NEXT: vpblendvb %xmm1, %xmm3, %xmm0, %xmm0
+; AVX-NEXT: vpsraw $1, %xmm0, %xmm3
+; AVX-NEXT: vpaddw %xmm1, %xmm1, %xmm1
+; AVX-NEXT: vpblendvb %xmm1, %xmm3, %xmm0, %xmm0
+; AVX-NEXT: vpsrlw $8, %xmm0, %xmm0
+; AVX-NEXT: vpackuswb %xmm2, %xmm0, %xmm0
; AVX-NEXT: retq
%shift = ashr <16 x i8> %a, <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0>
ret <16 x i8> %shift
@@ -954,38 +929,35 @@ define <16 x i8> @constant_shift_v16i8(<16 x i8> %a) {
define <2 x i64> @splatconstant_shift_v2i64(<2 x i64> %a) {
; SSE2-LABEL: splatconstant_shift_v2i64:
; SSE2: # BB#0:
-; SSE2-NEXT: movd %xmm0, %rax
-; SSE2-NEXT: sarq $7, %rax
-; SSE2-NEXT: movd %rax, %xmm1
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
-; SSE2-NEXT: movd %xmm0, %rax
-; SSE2-NEXT: sarq $7, %rax
-; SSE2-NEXT: movd %rax, %xmm0
-; SSE2-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0]
-; SSE2-NEXT: movdqa %xmm1, %xmm0
+; SSE2-NEXT: movdqa %xmm0, %xmm1
+; SSE2-NEXT: psrad $7, %xmm1
+; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,3,2,3]
+; SSE2-NEXT: psrlq $7, %xmm0
+; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; SSE2-NEXT: retq
;
; SSE41-LABEL: splatconstant_shift_v2i64:
; SSE41: # BB#0:
-; SSE41-NEXT: pextrq $1, %xmm0, %rax
-; SSE41-NEXT: sarq $7, %rax
-; SSE41-NEXT: movd %rax, %xmm1
-; SSE41-NEXT: movd %xmm0, %rax
-; SSE41-NEXT: sarq $7, %rax
-; SSE41-NEXT: movd %rax, %xmm0
-; SSE41-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; SSE41-NEXT: movdqa %xmm0, %xmm1
+; SSE41-NEXT: psrad $7, %xmm1
+; SSE41-NEXT: psrlq $7, %xmm0
+; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7]
; SSE41-NEXT: retq
;
-; AVX-LABEL: splatconstant_shift_v2i64:
-; AVX: # BB#0:
-; AVX-NEXT: vpextrq $1, %xmm0, %rax
-; AVX-NEXT: sarq $7, %rax
-; AVX-NEXT: vmovq %rax, %xmm1
-; AVX-NEXT: vmovq %xmm0, %rax
-; AVX-NEXT: sarq $7, %rax
-; AVX-NEXT: vmovq %rax, %xmm0
-; AVX-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
-; AVX-NEXT: retq
+; AVX1-LABEL: splatconstant_shift_v2i64:
+; AVX1: # BB#0:
+; AVX1-NEXT: vpsrad $7, %xmm0, %xmm1
+; AVX1-NEXT: vpsrlq $7, %xmm0, %xmm0
+; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7]
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: splatconstant_shift_v2i64:
+; AVX2: # BB#0:
+; AVX2-NEXT: vpsrad $7, %xmm0, %xmm1
+; AVX2-NEXT: vpsrlq $7, %xmm0, %xmm0
+; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3]
+; AVX2-NEXT: retq
%shift = ashr <2 x i64> %a, <i64 7, i64 7>
ret <2 x i64> %shift
}
@@ -1021,20 +993,20 @@ define <8 x i16> @splatconstant_shift_v8i16(<8 x i16> %a) {
define <16 x i8> @splatconstant_shift_v16i8(<16 x i8> %a) {
; SSE-LABEL: splatconstant_shift_v16i8:
; SSE: # BB#0:
-; SSE-NEXT: psrlw $3, %xmm0
-; SSE-NEXT: pand {{.*}}(%rip), %xmm0
+; SSE-NEXT: psrlw $3, %xmm0
+; SSE-NEXT: pand {{.*}}(%rip), %xmm0
; SSE-NEXT: movdqa {{.*#+}} xmm1 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
-; SSE-NEXT: pxor %xmm1, %xmm0
-; SSE-NEXT: psubb %xmm1, %xmm0
+; SSE-NEXT: pxor %xmm1, %xmm0
+; SSE-NEXT: psubb %xmm1, %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: splatconstant_shift_v16i8:
; AVX: # BB#0:
-; AVX-NEXT: vpsrlw $3, %xmm0, %xmm0
-; AVX-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0
+; AVX-NEXT: vpsrlw $3, %xmm0, %xmm0
+; AVX-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0
; AVX-NEXT: vmovdqa {{.*#+}} xmm1 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
-; AVX-NEXT: vpxor %xmm1, %xmm0, %xmm0
-; AVX-NEXT: vpsubb %xmm1, %xmm0, %xmm0
+; AVX-NEXT: vpxor %xmm1, %xmm0, %xmm0
+; AVX-NEXT: vpsubb %xmm1, %xmm0, %xmm0
; AVX-NEXT: retq
%shift = ashr <16 x i8> %a, <i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3>
ret <16 x i8> %shift
diff --git a/test/CodeGen/X86/vector-shift-ashr-256.ll b/test/CodeGen/X86/vector-shift-ashr-256.ll
index 3fc377af5650..e4642558e0e4 100644
--- a/test/CodeGen/X86/vector-shift-ashr-256.ll
+++ b/test/CodeGen/X86/vector-shift-ashr-256.ll
@@ -63,39 +63,30 @@ define <8 x i32> @var_shift_v8i32(<8 x i32> %a, <8 x i32> %b) {
; AVX1-LABEL: var_shift_v8i32:
; AVX1: # BB#0:
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
-; AVX1-NEXT: vpextrd $1, %xmm2, %eax
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3
-; AVX1-NEXT: vpextrd $1, %xmm3, %ecx
-; AVX1-NEXT: sarl %cl, %eax
-; AVX1-NEXT: vmovd %xmm2, %edx
-; AVX1-NEXT: vmovd %xmm3, %ecx
-; AVX1-NEXT: sarl %cl, %edx
-; AVX1-NEXT: vmovd %edx, %xmm4
-; AVX1-NEXT: vpinsrd $1, %eax, %xmm4, %xmm4
-; AVX1-NEXT: vpextrd $2, %xmm2, %eax
-; AVX1-NEXT: vpextrd $2, %xmm3, %ecx
-; AVX1-NEXT: sarl %cl, %eax
-; AVX1-NEXT: vpinsrd $2, %eax, %xmm4, %xmm4
-; AVX1-NEXT: vpextrd $3, %xmm2, %eax
-; AVX1-NEXT: vpextrd $3, %xmm3, %ecx
-; AVX1-NEXT: sarl %cl, %eax
-; AVX1-NEXT: vpinsrd $3, %eax, %xmm4, %xmm2
-; AVX1-NEXT: vpextrd $1, %xmm0, %eax
-; AVX1-NEXT: vpextrd $1, %xmm1, %ecx
-; AVX1-NEXT: sarl %cl, %eax
-; AVX1-NEXT: vmovd %xmm0, %edx
-; AVX1-NEXT: vmovd %xmm1, %ecx
-; AVX1-NEXT: sarl %cl, %edx
-; AVX1-NEXT: vmovd %edx, %xmm3
-; AVX1-NEXT: vpinsrd $1, %eax, %xmm3, %xmm3
-; AVX1-NEXT: vpextrd $2, %xmm0, %eax
-; AVX1-NEXT: vpextrd $2, %xmm1, %ecx
-; AVX1-NEXT: sarl %cl, %eax
-; AVX1-NEXT: vpinsrd $2, %eax, %xmm3, %xmm3
-; AVX1-NEXT: vpextrd $3, %xmm0, %eax
-; AVX1-NEXT: vpextrd $3, %xmm1, %ecx
-; AVX1-NEXT: sarl %cl, %eax
-; AVX1-NEXT: vpinsrd $3, %eax, %xmm3, %xmm0
+; AVX1-NEXT: vpsrldq {{.*#+}} xmm4 = xmm3[12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; AVX1-NEXT: vpsrad %xmm4, %xmm2, %xmm4
+; AVX1-NEXT: vpsrlq $32, %xmm3, %xmm5
+; AVX1-NEXT: vpsrad %xmm5, %xmm2, %xmm5
+; AVX1-NEXT: vpblendw {{.*#+}} xmm4 = xmm5[0,1,2,3],xmm4[4,5,6,7]
+; AVX1-NEXT: vpxor %xmm5, %xmm5, %xmm5
+; AVX1-NEXT: vpunpckhdq {{.*#+}} xmm6 = xmm3[2],xmm5[2],xmm3[3],xmm5[3]
+; AVX1-NEXT: vpsrad %xmm6, %xmm2, %xmm6
+; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm3 = xmm3[0],zero,xmm3[1],zero
+; AVX1-NEXT: vpsrad %xmm3, %xmm2, %xmm2
+; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm6[4,5,6,7]
+; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm4[2,3],xmm2[4,5],xmm4[6,7]
+; AVX1-NEXT: vpsrldq {{.*#+}} xmm3 = xmm1[12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; AVX1-NEXT: vpsrad %xmm3, %xmm0, %xmm3
+; AVX1-NEXT: vpsrlq $32, %xmm1, %xmm4
+; AVX1-NEXT: vpsrad %xmm4, %xmm0, %xmm4
+; AVX1-NEXT: vpblendw {{.*#+}} xmm3 = xmm4[0,1,2,3],xmm3[4,5,6,7]
+; AVX1-NEXT: vpunpckhdq {{.*#+}} xmm4 = xmm1[2],xmm5[2],xmm1[3],xmm5[3]
+; AVX1-NEXT: vpsrad %xmm4, %xmm0, %xmm4
+; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero
+; AVX1-NEXT: vpsrad %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm4[4,5,6,7]
+; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm3[2,3],xmm0[4,5],xmm3[6,7]
; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
; AVX1-NEXT: retq
;
@@ -489,32 +480,20 @@ define <4 x i64> @constant_shift_v4i64(<4 x i64> %a) {
define <8 x i32> @constant_shift_v8i32(<8 x i32> %a) {
; AVX1-LABEL: constant_shift_v8i32:
; AVX1: # BB#0:
-; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
-; AVX1-NEXT: vpextrd $1, %xmm1, %eax
-; AVX1-NEXT: sarl $9, %eax
-; AVX1-NEXT: vmovd %xmm1, %ecx
-; AVX1-NEXT: sarl $8, %ecx
-; AVX1-NEXT: vmovd %ecx, %xmm2
-; AVX1-NEXT: vpinsrd $1, %eax, %xmm2, %xmm2
-; AVX1-NEXT: vpextrd $2, %xmm1, %eax
-; AVX1-NEXT: sarl $8, %eax
-; AVX1-NEXT: vpinsrd $2, %eax, %xmm2, %xmm2
-; AVX1-NEXT: vpextrd $3, %xmm1, %eax
-; AVX1-NEXT: sarl $7, %eax
-; AVX1-NEXT: vpinsrd $3, %eax, %xmm2, %xmm1
-; AVX1-NEXT: vpextrd $1, %xmm0, %eax
-; AVX1-NEXT: sarl $5, %eax
-; AVX1-NEXT: vmovd %xmm0, %ecx
-; AVX1-NEXT: sarl $4, %ecx
-; AVX1-NEXT: vmovd %ecx, %xmm2
-; AVX1-NEXT: vpinsrd $1, %eax, %xmm2, %xmm2
-; AVX1-NEXT: vpextrd $2, %xmm0, %eax
-; AVX1-NEXT: sarl $6, %eax
-; AVX1-NEXT: vpinsrd $2, %eax, %xmm2, %xmm2
-; AVX1-NEXT: vpextrd $3, %xmm0, %eax
-; AVX1-NEXT: sarl $7, %eax
-; AVX1-NEXT: vpinsrd $3, %eax, %xmm2, %xmm0
-; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX1-NEXT: vpsrad $7, %xmm0, %xmm1
+; AVX1-NEXT: vpsrad $5, %xmm0, %xmm2
+; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1,2,3],xmm1[4,5,6,7]
+; AVX1-NEXT: vpsrad $6, %xmm0, %xmm2
+; AVX1-NEXT: vpsrad $4, %xmm0, %xmm3
+; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0,1,2,3],xmm2[4,5,6,7]
+; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3],xmm2[4,5],xmm1[6,7]
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
+; AVX1-NEXT: vpsrad $7, %xmm0, %xmm2
+; AVX1-NEXT: vpsrad $9, %xmm0, %xmm3
+; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0,1,2,3],xmm2[4,5,6,7]
+; AVX1-NEXT: vpsrad $8, %xmm0, %xmm0
+; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7]
+; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: constant_shift_v8i32:
@@ -663,41 +642,20 @@ define <4 x i64> @splatconstant_shift_v4i64(<4 x i64> %a) {
; AVX1-LABEL: splatconstant_shift_v4i64:
; AVX1: # BB#0:
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
-; AVX1-NEXT: vpextrq $1, %xmm1, %rax
-; AVX1-NEXT: sarq $7, %rax
-; AVX1-NEXT: vmovq %rax, %xmm2
-; AVX1-NEXT: vmovq %xmm1, %rax
-; AVX1-NEXT: sarq $7, %rax
-; AVX1-NEXT: vmovq %rax, %xmm1
-; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0]
-; AVX1-NEXT: vpextrq $1, %xmm0, %rax
-; AVX1-NEXT: sarq $7, %rax
-; AVX1-NEXT: vmovq %rax, %xmm2
-; AVX1-NEXT: vmovq %xmm0, %rax
-; AVX1-NEXT: sarq $7, %rax
-; AVX1-NEXT: vmovq %rax, %xmm0
-; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0]
+; AVX1-NEXT: vpsrad $7, %xmm1, %xmm2
+; AVX1-NEXT: vpsrlq $7, %xmm1, %xmm1
+; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7]
+; AVX1-NEXT: vpsrad $7, %xmm0, %xmm2
+; AVX1-NEXT: vpsrlq $7, %xmm0, %xmm0
+; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7]
; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: splatconstant_shift_v4i64:
; AVX2: # BB#0:
-; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
-; AVX2-NEXT: vpextrq $1, %xmm1, %rax
-; AVX2-NEXT: sarq $7, %rax
-; AVX2-NEXT: vmovq %rax, %xmm2
-; AVX2-NEXT: vmovq %xmm1, %rax
-; AVX2-NEXT: sarq $7, %rax
-; AVX2-NEXT: vmovq %rax, %xmm1
-; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0]
-; AVX2-NEXT: vpextrq $1, %xmm0, %rax
-; AVX2-NEXT: sarq $7, %rax
-; AVX2-NEXT: vmovq %rax, %xmm2
-; AVX2-NEXT: vmovq %xmm0, %rax
-; AVX2-NEXT: sarq $7, %rax
-; AVX2-NEXT: vmovq %rax, %xmm0
-; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0]
-; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
+; AVX2-NEXT: vpsrad $7, %ymm0, %ymm1
+; AVX2-NEXT: vpsrlq $7, %ymm0, %ymm0
+; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2],ymm1[3],ymm0[4],ymm1[5],ymm0[6],ymm1[7]
; AVX2-NEXT: retq
%shift = ashr <4 x i64> %a, <i64 7, i64 7, i64 7, i64 7>
ret <4 x i64> %shift
@@ -756,11 +714,11 @@ define <32 x i8> @splatconstant_shift_v32i8(<32 x i8> %a) {
;
; AVX2-LABEL: splatconstant_shift_v32i8:
; AVX2: # BB#0:
-; AVX2-NEXT: vpsrlw $3, %ymm0, %ymm0
-; AVX2-NEXT: vpand {{.*}}(%rip), %ymm0, %ymm0
+; AVX2-NEXT: vpsrlw $3, %ymm0, %ymm0
+; AVX2-NEXT: vpand {{.*}}(%rip), %ymm0, %ymm0
; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
-; AVX2-NEXT: vpxor %ymm1, %ymm0, %ymm0
-; AVX2-NEXT: vpsubb %ymm1, %ymm0, %ymm0
+; AVX2-NEXT: vpxor %ymm1, %ymm0, %ymm0
+; AVX2-NEXT: vpsubb %ymm1, %ymm0, %ymm0
; AVX2-NEXT: retq
%shift = ashr <32 x i8> %a, <i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3>
ret <32 x i8> %shift
diff --git a/test/CodeGen/X86/vector-shift-lshr-128.ll b/test/CodeGen/X86/vector-shift-lshr-128.ll
index f5a7e28383fe..ca55800e2713 100644
--- a/test/CodeGen/X86/vector-shift-lshr-128.ll
+++ b/test/CodeGen/X86/vector-shift-lshr-128.ll
@@ -12,26 +12,26 @@ define <2 x i64> @var_shift_v2i64(<2 x i64> %a, <2 x i64> %b) {
; SSE2: # BB#0:
; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm1[2,3,0,1]
; SSE2-NEXT: movdqa %xmm0, %xmm2
-; SSE2-NEXT: psrlq %xmm3, %xmm2
-; SSE2-NEXT: psrlq %xmm1, %xmm0
-; SSE2-NEXT: movsd {{.*#+}} xmm2 = xmm0[0],xmm2[1]
+; SSE2-NEXT: psrlq %xmm3, %xmm2
+; SSE2-NEXT: psrlq %xmm1, %xmm0
+; SSE2-NEXT: movsd {{.*#+}} xmm2 = xmm0[0],xmm2[1]
; SSE2-NEXT: movapd %xmm2, %xmm0
; SSE2-NEXT: retq
;
; SSE41-LABEL: var_shift_v2i64:
; SSE41: # BB#0:
-; SSE41-NEXT: movdqa %xmm0, %xmm2
-; SSE41-NEXT: psrlq %xmm1, %xmm2
-; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
-; SSE41-NEXT: psrlq %xmm1, %xmm0
+; SSE41-NEXT: movdqa %xmm0, %xmm2
+; SSE41-NEXT: psrlq %xmm1, %xmm2
+; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
+; SSE41-NEXT: psrlq %xmm1, %xmm0
; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm2[0,1,2,3],xmm0[4,5,6,7]
; SSE41-NEXT: retq
;
; AVX1-LABEL: var_shift_v2i64:
; AVX1: # BB#0:
-; AVX1-NEXT: vpsrlq %xmm1, %xmm0, %xmm2
-; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
-; AVX1-NEXT: vpsrlq %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: vpsrlq %xmm1, %xmm0, %xmm2
+; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
+; AVX1-NEXT: vpsrlq %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm2[0,1,2,3],xmm0[4,5,6,7]
; AVX1-NEXT: retq
;
@@ -46,73 +46,63 @@ define <2 x i64> @var_shift_v2i64(<2 x i64> %a, <2 x i64> %b) {
define <4 x i32> @var_shift_v4i32(<4 x i32> %a, <4 x i32> %b) {
; SSE2-LABEL: var_shift_v4i32:
; SSE2: # BB#0:
-; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[3,1,2,3]
-; SSE2-NEXT: movd %xmm2, %eax
-; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm1[3,1,2,3]
-; SSE2-NEXT: movd %xmm2, %ecx
-; SSE2-NEXT: shrl %cl, %eax
-; SSE2-NEXT: movd %eax, %xmm2
-; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,1,2,3]
-; SSE2-NEXT: movd %xmm3, %eax
-; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm1[1,1,2,3]
-; SSE2-NEXT: movd %xmm3, %ecx
-; SSE2-NEXT: shrl %cl, %eax
-; SSE2-NEXT: movd %eax, %xmm3
-; SSE2-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1]
-; SSE2-NEXT: movd %xmm0, %eax
-; SSE2-NEXT: movd %xmm1, %ecx
-; SSE2-NEXT: shrl %cl, %eax
-; SSE2-NEXT: movd %eax, %xmm2
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
-; SSE2-NEXT: movd %xmm0, %eax
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1]
-; SSE2-NEXT: movd %xmm0, %ecx
-; SSE2-NEXT: shrl %cl, %eax
-; SSE2-NEXT: movd %eax, %xmm0
-; SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1]
-; SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]
-; SSE2-NEXT: movdqa %xmm2, %xmm0
+; SSE2-NEXT: movdqa %xmm1, %xmm2
+; SSE2-NEXT: psrldq {{.*#+}} xmm2 = xmm2[12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; SSE2-NEXT: movdqa %xmm0, %xmm3
+; SSE2-NEXT: psrld %xmm2, %xmm3
+; SSE2-NEXT: movdqa %xmm1, %xmm2
+; SSE2-NEXT: psrlq $32, %xmm2
+; SSE2-NEXT: movdqa %xmm0, %xmm4
+; SSE2-NEXT: psrld %xmm2, %xmm4
+; SSE2-NEXT: movsd {{.*#+}} xmm3 = xmm4[0],xmm3[1]
+; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm3[1,3,2,3]
+; SSE2-NEXT: pxor %xmm3, %xmm3
+; SSE2-NEXT: movdqa %xmm1, %xmm4
+; SSE2-NEXT: punpckhdq {{.*#+}} xmm4 = xmm4[2],xmm3[2],xmm4[3],xmm3[3]
+; SSE2-NEXT: movdqa %xmm0, %xmm5
+; SSE2-NEXT: psrld %xmm4, %xmm5
+; SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1]
+; SSE2-NEXT: psrld %xmm1, %xmm0
+; SSE2-NEXT: movsd {{.*#+}} xmm5 = xmm0[0],xmm5[1]
+; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm5[0,2,2,3]
+; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
; SSE2-NEXT: retq
;
; SSE41-LABEL: var_shift_v4i32:
; SSE41: # BB#0:
-; SSE41-NEXT: pextrd $1, %xmm0, %eax
-; SSE41-NEXT: pextrd $1, %xmm1, %ecx
-; SSE41-NEXT: shrl %cl, %eax
-; SSE41-NEXT: movd %xmm0, %edx
-; SSE41-NEXT: movd %xmm1, %ecx
-; SSE41-NEXT: shrl %cl, %edx
-; SSE41-NEXT: movd %edx, %xmm2
-; SSE41-NEXT: pinsrd $1, %eax, %xmm2
-; SSE41-NEXT: pextrd $2, %xmm0, %eax
-; SSE41-NEXT: pextrd $2, %xmm1, %ecx
-; SSE41-NEXT: shrl %cl, %eax
-; SSE41-NEXT: pinsrd $2, %eax, %xmm2
-; SSE41-NEXT: pextrd $3, %xmm0, %eax
-; SSE41-NEXT: pextrd $3, %xmm1, %ecx
-; SSE41-NEXT: shrl %cl, %eax
-; SSE41-NEXT: pinsrd $3, %eax, %xmm2
-; SSE41-NEXT: movdqa %xmm2, %xmm0
+; SSE41-NEXT: movdqa %xmm1, %xmm2
+; SSE41-NEXT: psrldq {{.*#+}} xmm2 = xmm2[12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; SSE41-NEXT: movdqa %xmm0, %xmm3
+; SSE41-NEXT: psrld %xmm2, %xmm3
+; SSE41-NEXT: movdqa %xmm1, %xmm2
+; SSE41-NEXT: psrlq $32, %xmm2
+; SSE41-NEXT: movdqa %xmm0, %xmm4
+; SSE41-NEXT: psrld %xmm2, %xmm4
+; SSE41-NEXT: pblendw {{.*#+}} xmm4 = xmm4[0,1,2,3],xmm3[4,5,6,7]
+; SSE41-NEXT: pxor %xmm2, %xmm2
+; SSE41-NEXT: pmovzxdq {{.*#+}} xmm3 = xmm1[0],zero,xmm1[1],zero
+; SSE41-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm2[2],xmm1[3],xmm2[3]
+; SSE41-NEXT: movdqa %xmm0, %xmm2
+; SSE41-NEXT: psrld %xmm1, %xmm2
+; SSE41-NEXT: psrld %xmm3, %xmm0
+; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm2[4,5,6,7]
+; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm4[2,3],xmm0[4,5],xmm4[6,7]
; SSE41-NEXT: retq
;
; AVX1-LABEL: var_shift_v4i32:
; AVX1: # BB#0:
-; AVX1-NEXT: vpextrd $1, %xmm0, %eax
-; AVX1-NEXT: vpextrd $1, %xmm1, %ecx
-; AVX1-NEXT: shrl %cl, %eax
-; AVX1-NEXT: vmovd %xmm0, %edx
-; AVX1-NEXT: vmovd %xmm1, %ecx
-; AVX1-NEXT: shrl %cl, %edx
-; AVX1-NEXT: vmovd %edx, %xmm2
-; AVX1-NEXT: vpinsrd $1, %eax, %xmm2, %xmm2
-; AVX1-NEXT: vpextrd $2, %xmm0, %eax
-; AVX1-NEXT: vpextrd $2, %xmm1, %ecx
-; AVX1-NEXT: shrl %cl, %eax
-; AVX1-NEXT: vpinsrd $2, %eax, %xmm2, %xmm2
-; AVX1-NEXT: vpextrd $3, %xmm0, %eax
-; AVX1-NEXT: vpextrd $3, %xmm1, %ecx
-; AVX1-NEXT: shrl %cl, %eax
-; AVX1-NEXT: vpinsrd $3, %eax, %xmm2, %xmm0
+; AVX1-NEXT: vpsrldq {{.*#+}} xmm2 = xmm1[12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; AVX1-NEXT: vpsrld %xmm2, %xmm0, %xmm2
+; AVX1-NEXT: vpsrlq $32, %xmm1, %xmm3
+; AVX1-NEXT: vpsrld %xmm3, %xmm0, %xmm3
+; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0,1,2,3],xmm2[4,5,6,7]
+; AVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3
+; AVX1-NEXT: vpunpckhdq {{.*#+}} xmm3 = xmm1[2],xmm3[2],xmm1[3],xmm3[3]
+; AVX1-NEXT: vpsrld %xmm3, %xmm0, %xmm3
+; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero
+; AVX1-NEXT: vpsrld %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm3[4,5,6,7]
+; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7]
; AVX1-NEXT: retq
;
; AVX2-LABEL: var_shift_v4i32:
@@ -126,84 +116,84 @@ define <4 x i32> @var_shift_v4i32(<4 x i32> %a, <4 x i32> %b) {
define <8 x i16> @var_shift_v8i16(<8 x i16> %a, <8 x i16> %b) {
; SSE2-LABEL: var_shift_v8i16:
; SSE2: # BB#0:
-; SSE2-NEXT: psllw $12, %xmm1
+; SSE2-NEXT: psllw $12, %xmm1
; SSE2-NEXT: movdqa %xmm1, %xmm2
-; SSE2-NEXT: psraw $15, %xmm2
+; SSE2-NEXT: psraw $15, %xmm2
; SSE2-NEXT: movdqa %xmm2, %xmm3
-; SSE2-NEXT: pandn %xmm0, %xmm3
-; SSE2-NEXT: psrlw $8, %xmm0
-; SSE2-NEXT: pand %xmm2, %xmm0
-; SSE2-NEXT: por %xmm3, %xmm0
-; SSE2-NEXT: paddw %xmm1, %xmm1
+; SSE2-NEXT: pandn %xmm0, %xmm3
+; SSE2-NEXT: psrlw $8, %xmm0
+; SSE2-NEXT: pand %xmm2, %xmm0
+; SSE2-NEXT: por %xmm3, %xmm0
+; SSE2-NEXT: paddw %xmm1, %xmm1
; SSE2-NEXT: movdqa %xmm1, %xmm2
-; SSE2-NEXT: psraw $15, %xmm2
+; SSE2-NEXT: psraw $15, %xmm2
; SSE2-NEXT: movdqa %xmm2, %xmm3
-; SSE2-NEXT: pandn %xmm0, %xmm3
-; SSE2-NEXT: psrlw $4, %xmm0
-; SSE2-NEXT: pand %xmm2, %xmm0
-; SSE2-NEXT: por %xmm3, %xmm0
-; SSE2-NEXT: paddw %xmm1, %xmm1
+; SSE2-NEXT: pandn %xmm0, %xmm3
+; SSE2-NEXT: psrlw $4, %xmm0
+; SSE2-NEXT: pand %xmm2, %xmm0
+; SSE2-NEXT: por %xmm3, %xmm0
+; SSE2-NEXT: paddw %xmm1, %xmm1
; SSE2-NEXT: movdqa %xmm1, %xmm2
-; SSE2-NEXT: psraw $15, %xmm2
+; SSE2-NEXT: psraw $15, %xmm2
; SSE2-NEXT: movdqa %xmm2, %xmm3
-; SSE2-NEXT: pandn %xmm0, %xmm3
-; SSE2-NEXT: psrlw $2, %xmm0
-; SSE2-NEXT: pand %xmm2, %xmm0
-; SSE2-NEXT: por %xmm3, %xmm0
-; SSE2-NEXT: paddw %xmm1, %xmm1
-; SSE2-NEXT: psraw $15, %xmm1
+; SSE2-NEXT: pandn %xmm0, %xmm3
+; SSE2-NEXT: psrlw $2, %xmm0
+; SSE2-NEXT: pand %xmm2, %xmm0
+; SSE2-NEXT: por %xmm3, %xmm0
+; SSE2-NEXT: paddw %xmm1, %xmm1
+; SSE2-NEXT: psraw $15, %xmm1
; SSE2-NEXT: movdqa %xmm1, %xmm2
-; SSE2-NEXT: pandn %xmm0, %xmm2
-; SSE2-NEXT: psrlw $1, %xmm0
-; SSE2-NEXT: pand %xmm1, %xmm0
-; SSE2-NEXT: por %xmm2, %xmm0
+; SSE2-NEXT: pandn %xmm0, %xmm2
+; SSE2-NEXT: psrlw $1, %xmm0
+; SSE2-NEXT: pand %xmm1, %xmm0
+; SSE2-NEXT: por %xmm2, %xmm0
; SSE2-NEXT: retq
;
; SSE41-LABEL: var_shift_v8i16:
; SSE41: # BB#0:
-; SSE41-NEXT: movdqa %xmm0, %xmm2
-; SSE41-NEXT: movdqa %xmm1, %xmm0
-; SSE41-NEXT: psllw $12, %xmm0
-; SSE41-NEXT: psllw $4, %xmm1
-; SSE41-NEXT: por %xmm0, %xmm1
-; SSE41-NEXT: movdqa %xmm1, %xmm3
-; SSE41-NEXT: paddw %xmm3, %xmm3
-; SSE41-NEXT: movdqa %xmm2, %xmm4
-; SSE41-NEXT: psrlw $8, %xmm4
-; SSE41-NEXT: movdqa %xmm1, %xmm0
+; SSE41-NEXT: movdqa %xmm0, %xmm2
+; SSE41-NEXT: movdqa %xmm1, %xmm0
+; SSE41-NEXT: psllw $12, %xmm0
+; SSE41-NEXT: psllw $4, %xmm1
+; SSE41-NEXT: por %xmm0, %xmm1
+; SSE41-NEXT: movdqa %xmm1, %xmm3
+; SSE41-NEXT: paddw %xmm3, %xmm3
+; SSE41-NEXT: movdqa %xmm2, %xmm4
+; SSE41-NEXT: psrlw $8, %xmm4
+; SSE41-NEXT: movdqa %xmm1, %xmm0
; SSE41-NEXT: pblendvb %xmm4, %xmm2
-; SSE41-NEXT: movdqa %xmm2, %xmm1
-; SSE41-NEXT: psrlw $4, %xmm1
-; SSE41-NEXT: movdqa %xmm3, %xmm0
+; SSE41-NEXT: movdqa %xmm2, %xmm1
+; SSE41-NEXT: psrlw $4, %xmm1
+; SSE41-NEXT: movdqa %xmm3, %xmm0
; SSE41-NEXT: pblendvb %xmm1, %xmm2
-; SSE41-NEXT: movdqa %xmm2, %xmm1
-; SSE41-NEXT: psrlw $2, %xmm1
-; SSE41-NEXT: paddw %xmm3, %xmm3
-; SSE41-NEXT: movdqa %xmm3, %xmm0
+; SSE41-NEXT: movdqa %xmm2, %xmm1
+; SSE41-NEXT: psrlw $2, %xmm1
+; SSE41-NEXT: paddw %xmm3, %xmm3
+; SSE41-NEXT: movdqa %xmm3, %xmm0
; SSE41-NEXT: pblendvb %xmm1, %xmm2
-; SSE41-NEXT: movdqa %xmm2, %xmm1
-; SSE41-NEXT: psrlw $1, %xmm1
-; SSE41-NEXT: paddw %xmm3, %xmm3
-; SSE41-NEXT: movdqa %xmm3, %xmm0
+; SSE41-NEXT: movdqa %xmm2, %xmm1
+; SSE41-NEXT: psrlw $1, %xmm1
+; SSE41-NEXT: paddw %xmm3, %xmm3
+; SSE41-NEXT: movdqa %xmm3, %xmm0
; SSE41-NEXT: pblendvb %xmm1, %xmm2
-; SSE41-NEXT: movdqa %xmm2, %xmm0
+; SSE41-NEXT: movdqa %xmm2, %xmm0
; SSE41-NEXT: retq
;
; AVX1-LABEL: var_shift_v8i16:
; AVX1: # BB#0:
-; AVX1-NEXT: vpsllw $12, %xmm1, %xmm2
-; AVX1-NEXT: vpsllw $4, %xmm1, %xmm1
-; AVX1-NEXT: vpor %xmm2, %xmm1, %xmm1
-; AVX1-NEXT: vpaddw %xmm1, %xmm1, %xmm2
-; AVX1-NEXT: vpsrlw $8, %xmm0, %xmm3
+; AVX1-NEXT: vpsllw $12, %xmm1, %xmm2
+; AVX1-NEXT: vpsllw $4, %xmm1, %xmm1
+; AVX1-NEXT: vpor %xmm2, %xmm1, %xmm1
+; AVX1-NEXT: vpaddw %xmm1, %xmm1, %xmm2
+; AVX1-NEXT: vpsrlw $8, %xmm0, %xmm3
; AVX1-NEXT: vpblendvb %xmm1, %xmm3, %xmm0, %xmm0
-; AVX1-NEXT: vpsrlw $4, %xmm0, %xmm1
+; AVX1-NEXT: vpsrlw $4, %xmm0, %xmm1
; AVX1-NEXT: vpblendvb %xmm2, %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vpsrlw $2, %xmm0, %xmm1
-; AVX1-NEXT: vpaddw %xmm2, %xmm2, %xmm2
+; AVX1-NEXT: vpsrlw $2, %xmm0, %xmm1
+; AVX1-NEXT: vpaddw %xmm2, %xmm2, %xmm2
; AVX1-NEXT: vpblendvb %xmm2, %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vpsrlw $1, %xmm0, %xmm1
-; AVX1-NEXT: vpaddw %xmm2, %xmm2, %xmm2
+; AVX1-NEXT: vpsrlw $1, %xmm0, %xmm1
+; AVX1-NEXT: vpaddw %xmm2, %xmm2, %xmm2
; AVX1-NEXT: vpblendvb %xmm2, %xmm1, %xmm0, %xmm0
; AVX1-NEXT: retq
;
@@ -211,9 +201,9 @@ define <8 x i16> @var_shift_v8i16(<8 x i16> %a, <8 x i16> %b) {
; AVX2: # BB#0:
; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
-; AVX2-NEXT: vpsrlvd %ymm1, %ymm0, %ymm0
-; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13],zero,zero,zero,zero,zero,zero,zero,zero,ymm0[16,17,20,21,24,25,28,29],zero,zero,zero,zero,zero,zero,zero,zero
-; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
+; AVX2-NEXT: vpsrlvd %ymm1, %ymm0, %ymm0
+; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13],zero,zero,zero,zero,zero,zero,zero,zero,ymm0[16,17,20,21,24,25,28,29],zero,zero,zero,zero,zero,zero,zero,zero
+; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
%shift = lshr <8 x i16> %a, %b
@@ -223,72 +213,72 @@ define <8 x i16> @var_shift_v8i16(<8 x i16> %a, <8 x i16> %b) {
define <16 x i8> @var_shift_v16i8(<16 x i8> %a, <16 x i8> %b) {
; SSE2-LABEL: var_shift_v16i8:
; SSE2: # BB#0:
-; SSE2-NEXT: psllw $5, %xmm1
-; SSE2-NEXT: pxor %xmm2, %xmm2
-; SSE2-NEXT: pxor %xmm3, %xmm3
-; SSE2-NEXT: pcmpgtb %xmm1, %xmm3
-; SSE2-NEXT: movdqa %xmm3, %xmm4
-; SSE2-NEXT: pandn %xmm0, %xmm4
-; SSE2-NEXT: psrlw $4, %xmm0
-; SSE2-NEXT: pand {{.*}}(%rip), %xmm0
-; SSE2-NEXT: pand %xmm3, %xmm0
-; SSE2-NEXT: por %xmm4, %xmm0
-; SSE2-NEXT: paddb %xmm1, %xmm1
-; SSE2-NEXT: pxor %xmm3, %xmm3
-; SSE2-NEXT: pcmpgtb %xmm1, %xmm3
-; SSE2-NEXT: movdqa %xmm3, %xmm4
-; SSE2-NEXT: pandn %xmm0, %xmm4
-; SSE2-NEXT: psrlw $2, %xmm0
-; SSE2-NEXT: pand {{.*}}(%rip), %xmm0
-; SSE2-NEXT: pand %xmm3, %xmm0
-; SSE2-NEXT: por %xmm4, %xmm0
-; SSE2-NEXT: paddb %xmm1, %xmm1
-; SSE2-NEXT: pcmpgtb %xmm1, %xmm2
-; SSE2-NEXT: movdqa %xmm2, %xmm1
-; SSE2-NEXT: pandn %xmm0, %xmm1
-; SSE2-NEXT: psrlw $1, %xmm0
-; SSE2-NEXT: pand {{.*}}(%rip), %xmm0
-; SSE2-NEXT: pand %xmm2, %xmm0
-; SSE2-NEXT: por %xmm1, %xmm0
-; SSE2-NEXT: retq
+; SSE2-NEXT: psllw $5, %xmm1
+; SSE2-NEXT: pxor %xmm2, %xmm2
+; SSE2-NEXT: pxor %xmm3, %xmm3
+; SSE2-NEXT: pcmpgtb %xmm1, %xmm3
+; SSE2-NEXT: movdqa %xmm3, %xmm4
+; SSE2-NEXT: pandn %xmm0, %xmm4
+; SSE2-NEXT: psrlw $4, %xmm0
+; SSE2-NEXT: pand {{.*}}(%rip), %xmm0
+; SSE2-NEXT: pand %xmm3, %xmm0
+; SSE2-NEXT: por %xmm4, %xmm0
+; SSE2-NEXT: paddb %xmm1, %xmm1
+; SSE2-NEXT: pxor %xmm3, %xmm3
+; SSE2-NEXT: pcmpgtb %xmm1, %xmm3
+; SSE2-NEXT: movdqa %xmm3, %xmm4
+; SSE2-NEXT: pandn %xmm0, %xmm4
+; SSE2-NEXT: psrlw $2, %xmm0
+; SSE2-NEXT: pand {{.*}}(%rip), %xmm0
+; SSE2-NEXT: pand %xmm3, %xmm0
+; SSE2-NEXT: por %xmm4, %xmm0
+; SSE2-NEXT: paddb %xmm1, %xmm1
+; SSE2-NEXT: pcmpgtb %xmm1, %xmm2
+; SSE2-NEXT: movdqa %xmm2, %xmm1
+; SSE2-NEXT: pandn %xmm0, %xmm1
+; SSE2-NEXT: psrlw $1, %xmm0
+; SSE2-NEXT: pand {{.*}}(%rip), %xmm0
+; SSE2-NEXT: pand %xmm2, %xmm0
+; SSE2-NEXT: por %xmm1, %xmm0
+; SSE2-NEXT: retq
;
; SSE41-LABEL: var_shift_v16i8:
; SSE41: # BB#0:
-; SSE41-NEXT: movdqa %xmm0, %xmm2
-; SSE41-NEXT: psllw $5, %xmm1
-; SSE41-NEXT: movdqa %xmm2, %xmm3
-; SSE41-NEXT: psrlw $4, %xmm3
-; SSE41-NEXT: pand {{.*}}(%rip), %xmm3
-; SSE41-NEXT: movdqa %xmm1, %xmm0
+; SSE41-NEXT: movdqa %xmm0, %xmm2
+; SSE41-NEXT: psllw $5, %xmm1
+; SSE41-NEXT: movdqa %xmm2, %xmm3
+; SSE41-NEXT: psrlw $4, %xmm3
+; SSE41-NEXT: pand {{.*}}(%rip), %xmm3
+; SSE41-NEXT: movdqa %xmm1, %xmm0
; SSE41-NEXT: pblendvb %xmm3, %xmm2
-; SSE41-NEXT: movdqa %xmm2, %xmm3
-; SSE41-NEXT: psrlw $2, %xmm3
-; SSE41-NEXT: pand {{.*}}(%rip), %xmm3
-; SSE41-NEXT: paddb %xmm1, %xmm1
-; SSE41-NEXT: movdqa %xmm1, %xmm0
+; SSE41-NEXT: movdqa %xmm2, %xmm3
+; SSE41-NEXT: psrlw $2, %xmm3
+; SSE41-NEXT: pand {{.*}}(%rip), %xmm3
+; SSE41-NEXT: paddb %xmm1, %xmm1
+; SSE41-NEXT: movdqa %xmm1, %xmm0
; SSE41-NEXT: pblendvb %xmm3, %xmm2
-; SSE41-NEXT: movdqa %xmm2, %xmm3
-; SSE41-NEXT: psrlw $1, %xmm3
-; SSE41-NEXT: pand {{.*}}(%rip), %xmm3
-; SSE41-NEXT: paddb %xmm1, %xmm1
-; SSE41-NEXT: movdqa %xmm1, %xmm0
+; SSE41-NEXT: movdqa %xmm2, %xmm3
+; SSE41-NEXT: psrlw $1, %xmm3
+; SSE41-NEXT: pand {{.*}}(%rip), %xmm3
+; SSE41-NEXT: paddb %xmm1, %xmm1
+; SSE41-NEXT: movdqa %xmm1, %xmm0
; SSE41-NEXT: pblendvb %xmm3, %xmm2
-; SSE41-NEXT: movdqa %xmm2, %xmm0
+; SSE41-NEXT: movdqa %xmm2, %xmm0
; SSE41-NEXT: retq
;
; AVX-LABEL: var_shift_v16i8:
; AVX: # BB#0:
-; AVX-NEXT: vpsllw $5, %xmm1, %xmm1
-; AVX-NEXT: vpsrlw $4, %xmm0, %xmm2
-; AVX-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2
+; AVX-NEXT: vpsllw $5, %xmm1, %xmm1
+; AVX-NEXT: vpsrlw $4, %xmm0, %xmm2
+; AVX-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2
; AVX-NEXT: vpblendvb %xmm1, %xmm2, %xmm0, %xmm0
-; AVX-NEXT: vpsrlw $2, %xmm0, %xmm2
-; AVX-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2
-; AVX-NEXT: vpaddb %xmm1, %xmm1, %xmm1
+; AVX-NEXT: vpsrlw $2, %xmm0, %xmm2
+; AVX-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2
+; AVX-NEXT: vpaddb %xmm1, %xmm1, %xmm1
; AVX-NEXT: vpblendvb %xmm1, %xmm2, %xmm0, %xmm0
-; AVX-NEXT: vpsrlw $1, %xmm0, %xmm2
-; AVX-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2
-; AVX-NEXT: vpaddb %xmm1, %xmm1, %xmm1
+; AVX-NEXT: vpsrlw $1, %xmm0, %xmm2
+; AVX-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2
+; AVX-NEXT: vpaddb %xmm1, %xmm1, %xmm1
; AVX-NEXT: vpblendvb %xmm1, %xmm2, %xmm0, %xmm0
; AVX-NEXT: retq
%shift = lshr <16 x i8> %a, %b
@@ -343,10 +333,10 @@ define <4 x i32> @splatvar_shift_v4i32(<4 x i32> %a, <4 x i32> %b) {
define <8 x i16> @splatvar_shift_v8i16(<8 x i16> %a, <8 x i16> %b) {
; SSE2-LABEL: splatvar_shift_v8i16:
; SSE2: # BB#0:
-; SSE2-NEXT: movd %xmm1, %eax
+; SSE2-NEXT: movd %xmm1, %eax
; SSE2-NEXT: movzwl %ax, %eax
-; SSE2-NEXT: movd %eax, %xmm1
-; SSE2-NEXT: psrlw %xmm1, %xmm0
+; SSE2-NEXT: movd %eax, %xmm1
+; SSE2-NEXT: psrlw %xmm1, %xmm0
; SSE2-NEXT: retq
;
; SSE41-LABEL: splatvar_shift_v8i16:
@@ -370,99 +360,99 @@ define <8 x i16> @splatvar_shift_v8i16(<8 x i16> %a, <8 x i16> %b) {
define <16 x i8> @splatvar_shift_v16i8(<16 x i8> %a, <16 x i8> %b) {
; SSE2-LABEL: splatvar_shift_v16i8:
; SSE2: # BB#0:
-; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
-; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,0,3]
-; SSE2-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,0,0,0,4,5,6,7]
-; SSE2-NEXT: pshufhw {{.*#+}} xmm2 = xmm1[0,1,2,3,4,4,4,4]
-; SSE2-NEXT: psllw $5, %xmm2
-; SSE2-NEXT: pxor %xmm1, %xmm1
-; SSE2-NEXT: pxor %xmm3, %xmm3
-; SSE2-NEXT: pcmpgtb %xmm2, %xmm3
-; SSE2-NEXT: movdqa %xmm3, %xmm4
-; SSE2-NEXT: pandn %xmm0, %xmm4
-; SSE2-NEXT: psrlw $4, %xmm0
-; SSE2-NEXT: pand {{.*}}(%rip), %xmm0
-; SSE2-NEXT: pand %xmm3, %xmm0
-; SSE2-NEXT: por %xmm4, %xmm0
-; SSE2-NEXT: paddb %xmm2, %xmm2
-; SSE2-NEXT: pxor %xmm3, %xmm3
-; SSE2-NEXT: pcmpgtb %xmm2, %xmm3
-; SSE2-NEXT: movdqa %xmm3, %xmm4
-; SSE2-NEXT: pandn %xmm0, %xmm4
-; SSE2-NEXT: psrlw $2, %xmm0
-; SSE2-NEXT: pand {{.*}}(%rip), %xmm0
-; SSE2-NEXT: pand %xmm3, %xmm0
-; SSE2-NEXT: por %xmm4, %xmm0
-; SSE2-NEXT: paddb %xmm2, %xmm2
-; SSE2-NEXT: pcmpgtb %xmm2, %xmm1
-; SSE2-NEXT: movdqa %xmm1, %xmm2
-; SSE2-NEXT: pandn %xmm0, %xmm2
-; SSE2-NEXT: psrlw $1, %xmm0
-; SSE2-NEXT: pand {{.*}}(%rip), %xmm0
-; SSE2-NEXT: pand %xmm1, %xmm0
-; SSE2-NEXT: por %xmm2, %xmm0
-; SSE2-NEXT: retq
+; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
+; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,0,3]
+; SSE2-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,0,0,0,4,5,6,7]
+; SSE2-NEXT: pshufhw {{.*#+}} xmm2 = xmm1[0,1,2,3,4,4,4,4]
+; SSE2-NEXT: psllw $5, %xmm2
+; SSE2-NEXT: pxor %xmm1, %xmm1
+; SSE2-NEXT: pxor %xmm3, %xmm3
+; SSE2-NEXT: pcmpgtb %xmm2, %xmm3
+; SSE2-NEXT: movdqa %xmm3, %xmm4
+; SSE2-NEXT: pandn %xmm0, %xmm4
+; SSE2-NEXT: psrlw $4, %xmm0
+; SSE2-NEXT: pand {{.*}}(%rip), %xmm0
+; SSE2-NEXT: pand %xmm3, %xmm0
+; SSE2-NEXT: por %xmm4, %xmm0
+; SSE2-NEXT: paddb %xmm2, %xmm2
+; SSE2-NEXT: pxor %xmm3, %xmm3
+; SSE2-NEXT: pcmpgtb %xmm2, %xmm3
+; SSE2-NEXT: movdqa %xmm3, %xmm4
+; SSE2-NEXT: pandn %xmm0, %xmm4
+; SSE2-NEXT: psrlw $2, %xmm0
+; SSE2-NEXT: pand {{.*}}(%rip), %xmm0
+; SSE2-NEXT: pand %xmm3, %xmm0
+; SSE2-NEXT: por %xmm4, %xmm0
+; SSE2-NEXT: paddb %xmm2, %xmm2
+; SSE2-NEXT: pcmpgtb %xmm2, %xmm1
+; SSE2-NEXT: movdqa %xmm1, %xmm2
+; SSE2-NEXT: pandn %xmm0, %xmm2
+; SSE2-NEXT: psrlw $1, %xmm0
+; SSE2-NEXT: pand {{.*}}(%rip), %xmm0
+; SSE2-NEXT: pand %xmm1, %xmm0
+; SSE2-NEXT: por %xmm2, %xmm0
+; SSE2-NEXT: retq
;
; SSE41-LABEL: splatvar_shift_v16i8:
; SSE41: # BB#0:
-; SSE41-NEXT: movdqa %xmm0, %xmm2
-; SSE41-NEXT: pxor %xmm0, %xmm0
-; SSE41-NEXT: pshufb %xmm0, %xmm1
-; SSE41-NEXT: psllw $5, %xmm1
-; SSE41-NEXT: movdqa %xmm1, %xmm3
-; SSE41-NEXT: paddb %xmm3, %xmm3
-; SSE41-NEXT: movdqa %xmm2, %xmm4
-; SSE41-NEXT: psrlw $4, %xmm4
-; SSE41-NEXT: pand {{.*}}(%rip), %xmm4
-; SSE41-NEXT: movdqa %xmm1, %xmm0
+; SSE41-NEXT: movdqa %xmm0, %xmm2
+; SSE41-NEXT: pxor %xmm0, %xmm0
+; SSE41-NEXT: pshufb %xmm0, %xmm1
+; SSE41-NEXT: psllw $5, %xmm1
+; SSE41-NEXT: movdqa %xmm1, %xmm3
+; SSE41-NEXT: paddb %xmm3, %xmm3
+; SSE41-NEXT: movdqa %xmm2, %xmm4
+; SSE41-NEXT: psrlw $4, %xmm4
+; SSE41-NEXT: pand {{.*}}(%rip), %xmm4
+; SSE41-NEXT: movdqa %xmm1, %xmm0
; SSE41-NEXT: pblendvb %xmm4, %xmm2
-; SSE41-NEXT: movdqa %xmm2, %xmm1
-; SSE41-NEXT: psrlw $2, %xmm1
-; SSE41-NEXT: pand {{.*}}(%rip), %xmm1
-; SSE41-NEXT: movdqa %xmm3, %xmm0
+; SSE41-NEXT: movdqa %xmm2, %xmm1
+; SSE41-NEXT: psrlw $2, %xmm1
+; SSE41-NEXT: pand {{.*}}(%rip), %xmm1
+; SSE41-NEXT: movdqa %xmm3, %xmm0
; SSE41-NEXT: pblendvb %xmm1, %xmm2
-; SSE41-NEXT: movdqa %xmm2, %xmm1
-; SSE41-NEXT: psrlw $1, %xmm1
-; SSE41-NEXT: pand {{.*}}(%rip), %xmm1
-; SSE41-NEXT: paddb %xmm3, %xmm3
-; SSE41-NEXT: movdqa %xmm3, %xmm0
+; SSE41-NEXT: movdqa %xmm2, %xmm1
+; SSE41-NEXT: psrlw $1, %xmm1
+; SSE41-NEXT: pand {{.*}}(%rip), %xmm1
+; SSE41-NEXT: paddb %xmm3, %xmm3
+; SSE41-NEXT: movdqa %xmm3, %xmm0
; SSE41-NEXT: pblendvb %xmm1, %xmm2
-; SSE41-NEXT: movdqa %xmm2, %xmm0
+; SSE41-NEXT: movdqa %xmm2, %xmm0
; SSE41-NEXT: retq
;
; AVX1-LABEL: splatvar_shift_v16i8:
; AVX1: # BB#0:
-; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1
-; AVX1-NEXT: vpsllw $5, %xmm1, %xmm1
-; AVX1-NEXT: vpaddb %xmm1, %xmm1, %xmm2
-; AVX1-NEXT: vpsrlw $4, %xmm0, %xmm3
-; AVX1-NEXT: vpand {{.*}}(%rip), %xmm3, %xmm3
+; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1
+; AVX1-NEXT: vpsllw $5, %xmm1, %xmm1
+; AVX1-NEXT: vpaddb %xmm1, %xmm1, %xmm2
+; AVX1-NEXT: vpsrlw $4, %xmm0, %xmm3
+; AVX1-NEXT: vpand {{.*}}(%rip), %xmm3, %xmm3
; AVX1-NEXT: vpblendvb %xmm1, %xmm3, %xmm0, %xmm0
-; AVX1-NEXT: vpsrlw $2, %xmm0, %xmm1
-; AVX1-NEXT: vpand {{.*}}(%rip), %xmm1, %xmm1
+; AVX1-NEXT: vpsrlw $2, %xmm0, %xmm1
+; AVX1-NEXT: vpand {{.*}}(%rip), %xmm1, %xmm1
; AVX1-NEXT: vpblendvb %xmm2, %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vpsrlw $1, %xmm0, %xmm1
-; AVX1-NEXT: vpand {{.*}}(%rip), %xmm1, %xmm1
-; AVX1-NEXT: vpaddb %xmm2, %xmm2, %xmm2
+; AVX1-NEXT: vpsrlw $1, %xmm0, %xmm1
+; AVX1-NEXT: vpand {{.*}}(%rip), %xmm1, %xmm1
+; AVX1-NEXT: vpaddb %xmm2, %xmm2, %xmm2
; AVX1-NEXT: vpblendvb %xmm2, %xmm1, %xmm0, %xmm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: splatvar_shift_v16i8:
; AVX2: # BB#0:
; AVX2-NEXT: vpbroadcastb %xmm1, %xmm1
-; AVX2-NEXT: vpsllw $5, %xmm1, %xmm1
-; AVX2-NEXT: vpsrlw $4, %xmm0, %xmm2
-; AVX2-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2
-; AVX2-NEXT: vpblendvb %xmm1, %xmm2, %xmm0, %xmm0
-; AVX2-NEXT: vpsrlw $2, %xmm0, %xmm2
-; AVX2-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2
-; AVX2-NEXT: vpaddb %xmm1, %xmm1, %xmm1
-; AVX2-NEXT: vpblendvb %xmm1, %xmm2, %xmm0, %xmm0
-; AVX2-NEXT: vpsrlw $1, %xmm0, %xmm2
-; AVX2-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2
-; AVX2-NEXT: vpaddb %xmm1, %xmm1, %xmm1
-; AVX2-NEXT: vpblendvb %xmm1, %xmm2, %xmm0, %xmm0
+; AVX2-NEXT: vpsllw $5, %xmm1, %xmm1
+; AVX2-NEXT: vpsrlw $4, %xmm0, %xmm2
+; AVX2-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2
+; AVX2-NEXT: vpblendvb %xmm1, %xmm2, %xmm0, %xmm0
+; AVX2-NEXT: vpsrlw $2, %xmm0, %xmm2
+; AVX2-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2
+; AVX2-NEXT: vpaddb %xmm1, %xmm1, %xmm1
+; AVX2-NEXT: vpblendvb %xmm1, %xmm2, %xmm0, %xmm0
+; AVX2-NEXT: vpsrlw $1, %xmm0, %xmm2
+; AVX2-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2
+; AVX2-NEXT: vpaddb %xmm1, %xmm1, %xmm1
+; AVX2-NEXT: vpblendvb %xmm1, %xmm2, %xmm0, %xmm0
; AVX2-NEXT: retq
%splat = shufflevector <16 x i8> %b, <16 x i8> undef, <16 x i32> zeroinitializer
%shift = lshr <16 x i8> %a, %splat
@@ -477,24 +467,24 @@ define <2 x i64> @constant_shift_v2i64(<2 x i64> %a) {
; SSE2-LABEL: constant_shift_v2i64:
; SSE2: # BB#0:
; SSE2-NEXT: movdqa %xmm0, %xmm1
-; SSE2-NEXT: psrlq $7, %xmm1
-; SSE2-NEXT: psrlq $1, %xmm0
-; SSE2-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1]
+; SSE2-NEXT: psrlq $7, %xmm1
+; SSE2-NEXT: psrlq $1, %xmm0
+; SSE2-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1]
; SSE2-NEXT: movapd %xmm1, %xmm0
; SSE2-NEXT: retq
;
; SSE41-LABEL: constant_shift_v2i64:
; SSE41: # BB#0:
-; SSE41-NEXT: movdqa %xmm0, %xmm1
-; SSE41-NEXT: psrlq $7, %xmm1
-; SSE41-NEXT: psrlq $1, %xmm0
+; SSE41-NEXT: movdqa %xmm0, %xmm1
+; SSE41-NEXT: psrlq $7, %xmm1
+; SSE41-NEXT: psrlq $1, %xmm0
; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4,5,6,7]
; SSE41-NEXT: retq
;
; AVX1-LABEL: constant_shift_v2i64:
; AVX1: # BB#0:
-; AVX1-NEXT: vpsrlq $7, %xmm0, %xmm1
-; AVX1-NEXT: vpsrlq $1, %xmm0, %xmm0
+; AVX1-NEXT: vpsrlq $7, %xmm0, %xmm1
+; AVX1-NEXT: vpsrlq $1, %xmm0, %xmm0
; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4,5,6,7]
; AVX1-NEXT: retq
;
@@ -509,59 +499,44 @@ define <2 x i64> @constant_shift_v2i64(<2 x i64> %a) {
define <4 x i32> @constant_shift_v4i32(<4 x i32> %a) {
; SSE2-LABEL: constant_shift_v4i32:
; SSE2: # BB#0:
-; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[3,1,2,3]
-; SSE2-NEXT: movd %xmm1, %eax
-; SSE2-NEXT: shrl $7, %eax
-; SSE2-NEXT: movd %eax, %xmm1
-; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,2,3]
-; SSE2-NEXT: movd %xmm2, %eax
-; SSE2-NEXT: shrl $5, %eax
-; SSE2-NEXT: movd %eax, %xmm2
-; SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
-; SSE2-NEXT: movd %xmm0, %eax
-; SSE2-NEXT: shrl $4, %eax
-; SSE2-NEXT: movd %eax, %xmm1
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
-; SSE2-NEXT: movd %xmm0, %eax
-; SSE2-NEXT: shrl $6, %eax
-; SSE2-NEXT: movd %eax, %xmm0
-; SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
-; SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
-; SSE2-NEXT: movdqa %xmm1, %xmm0
-; SSE2-NEXT: retq
+; SSE2-NEXT: movdqa %xmm0, %xmm1
+; SSE2-NEXT: psrld $7, %xmm1
+; SSE2-NEXT: movdqa %xmm0, %xmm2
+; SSE2-NEXT: psrld $5, %xmm2
+; SSE2-NEXT: movsd {{.*#+}} xmm1 = xmm2[0],xmm1[1]
+; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,3,2,3]
+; SSE2-NEXT: movdqa %xmm0, %xmm2
+; SSE2-NEXT: psrld $6, %xmm2
+; SSE2-NEXT: psrld $4, %xmm0
+; SSE2-NEXT: movsd {{.*#+}} xmm2 = xmm0[0],xmm2[1]
+; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[0,2,2,3]
+; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; SSE2-NEXT: retq
;
; SSE41-LABEL: constant_shift_v4i32:
-; SSE41: # BB#0:
-; SSE41-NEXT: pextrd $1, %xmm0, %eax
-; SSE41-NEXT: shrl $5, %eax
-; SSE41-NEXT: movd %xmm0, %ecx
-; SSE41-NEXT: shrl $4, %ecx
-; SSE41-NEXT: movd %ecx, %xmm1
-; SSE41-NEXT: pinsrd $1, %eax, %xmm1
-; SSE41-NEXT: pextrd $2, %xmm0, %eax
-; SSE41-NEXT: shrl $6, %eax
-; SSE41-NEXT: pinsrd $2, %eax, %xmm1
-; SSE41-NEXT: pextrd $3, %xmm0, %eax
-; SSE41-NEXT: shrl $7, %eax
-; SSE41-NEXT: pinsrd $3, %eax, %xmm1
-; SSE41-NEXT: movdqa %xmm1, %xmm0
-; SSE41-NEXT: retq
+; SSE41: # BB#0:
+; SSE41-NEXT: movdqa %xmm0, %xmm1
+; SSE41-NEXT: psrld $7, %xmm1
+; SSE41-NEXT: movdqa %xmm0, %xmm2
+; SSE41-NEXT: psrld $5, %xmm2
+; SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm1[4,5,6,7]
+; SSE41-NEXT: movdqa %xmm0, %xmm1
+; SSE41-NEXT: psrld $6, %xmm1
+; SSE41-NEXT: psrld $4, %xmm0
+; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4,5,6,7]
+; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7]
+; SSE41-NEXT: retq
;
; AVX1-LABEL: constant_shift_v4i32:
-; AVX1: # BB#0:
-; AVX1-NEXT: vpextrd $1, %xmm0, %eax
-; AVX1-NEXT: shrl $5, %eax
-; AVX1-NEXT: vmovd %xmm0, %ecx
-; AVX1-NEXT: shrl $4, %ecx
-; AVX1-NEXT: vmovd %ecx, %xmm1
-; AVX1-NEXT: vpinsrd $1, %eax, %xmm1, %xmm1
-; AVX1-NEXT: vpextrd $2, %xmm0, %eax
-; AVX1-NEXT: shrl $6, %eax
-; AVX1-NEXT: vpinsrd $2, %eax, %xmm1, %xmm1
-; AVX1-NEXT: vpextrd $3, %xmm0, %eax
-; AVX1-NEXT: shrl $7, %eax
-; AVX1-NEXT: vpinsrd $3, %eax, %xmm1, %xmm0
-; AVX1-NEXT: retq
+; AVX1: # BB#0:
+; AVX1-NEXT: vpsrld $7, %xmm0, %xmm1
+; AVX1-NEXT: vpsrld $5, %xmm0, %xmm2
+; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1,2,3],xmm1[4,5,6,7]
+; AVX1-NEXT: vpsrld $6, %xmm0, %xmm2
+; AVX1-NEXT: vpsrld $4, %xmm0, %xmm0
+; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm2[4,5,6,7]
+; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7]
+; AVX1-NEXT: retq
;
; AVX2-LABEL: constant_shift_v4i32:
; AVX2: # BB#0:
@@ -574,56 +549,56 @@ define <4 x i32> @constant_shift_v4i32(<4 x i32> %a) {
define <8 x i16> @constant_shift_v8i16(<8 x i16> %a) {
; SSE2-LABEL: constant_shift_v8i16:
; SSE2: # BB#0:
-; SSE2-NEXT: movdqa %xmm0, %xmm1
-; SSE2-NEXT: psrlw $4, %xmm1
-; SSE2-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1]
-; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm1[0,2,2,3]
-; SSE2-NEXT: psrlw $2, %xmm1
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,3,2,3]
+; SSE2-NEXT: movdqa %xmm0, %xmm1
+; SSE2-NEXT: psrlw $4, %xmm1
+; SSE2-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1]
+; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm1[0,2,2,3]
+; SSE2-NEXT: psrlw $2, %xmm1
+; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,3,2,3]
; SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1]
-; SSE2-NEXT: movdqa {{.*#+}} xmm0 = [65535,0,65535,0,65535,0,65535,0]
-; SSE2-NEXT: movdqa %xmm2, %xmm1
-; SSE2-NEXT: pand %xmm0, %xmm1
-; SSE2-NEXT: psrlw $1, %xmm2
-; SSE2-NEXT: pandn %xmm2, %xmm0
-; SSE2-NEXT: por %xmm1, %xmm0
+; SSE2-NEXT: movdqa {{.*#+}} xmm0 = [65535,0,65535,0,65535,0,65535,0]
+; SSE2-NEXT: movdqa %xmm2, %xmm1
+; SSE2-NEXT: pand %xmm0, %xmm1
+; SSE2-NEXT: psrlw $1, %xmm2
+; SSE2-NEXT: pandn %xmm2, %xmm0
+; SSE2-NEXT: por %xmm1, %xmm0
; SSE2-NEXT: retq
;
; SSE41-LABEL: constant_shift_v8i16:
; SSE41: # BB#0:
-; SSE41-NEXT: movdqa %xmm0, %xmm1
-; SSE41-NEXT: movdqa %xmm1, %xmm2
-; SSE41-NEXT: psrlw $8, %xmm2
-; SSE41-NEXT: movaps {{.*#+}} xmm0 = [0,4112,8224,12336,16448,20560,24672,28784]
+; SSE41-NEXT: movdqa %xmm0, %xmm1
+; SSE41-NEXT: movdqa %xmm1, %xmm2
+; SSE41-NEXT: psrlw $8, %xmm2
+; SSE41-NEXT: movaps {{.*#+}} xmm0 = [0,4112,8224,12336,16448,20560,24672,28784]
; SSE41-NEXT: pblendvb %xmm2, %xmm1
-; SSE41-NEXT: movdqa %xmm1, %xmm2
-; SSE41-NEXT: psrlw $4, %xmm2
-; SSE41-NEXT: movaps {{.*#+}} xmm0 = [0,8224,16448,24672,32896,41120,49344,57568]
+; SSE41-NEXT: movdqa %xmm1, %xmm2
+; SSE41-NEXT: psrlw $4, %xmm2
+; SSE41-NEXT: movaps {{.*#+}} xmm0 = [0,8224,16448,24672,32896,41120,49344,57568]
; SSE41-NEXT: pblendvb %xmm2, %xmm1
-; SSE41-NEXT: movdqa %xmm1, %xmm2
-; SSE41-NEXT: psrlw $2, %xmm2
-; SSE41-NEXT: movaps {{.*#+}} xmm0 = [0,16448,32896,49344,256,16704,33152,49600]
+; SSE41-NEXT: movdqa %xmm1, %xmm2
+; SSE41-NEXT: psrlw $2, %xmm2
+; SSE41-NEXT: movaps {{.*#+}} xmm0 = [0,16448,32896,49344,256,16704,33152,49600]
; SSE41-NEXT: pblendvb %xmm2, %xmm1
-; SSE41-NEXT: movdqa %xmm1, %xmm2
-; SSE41-NEXT: psrlw $1, %xmm2
-; SSE41-NEXT: movaps {{.*#+}} xmm0 = [0,32896,256,33152,512,33408,768,33664]
+; SSE41-NEXT: movdqa %xmm1, %xmm2
+; SSE41-NEXT: psrlw $1, %xmm2
+; SSE41-NEXT: movaps {{.*#+}} xmm0 = [0,32896,256,33152,512,33408,768,33664]
; SSE41-NEXT: pblendvb %xmm2, %xmm1
-; SSE41-NEXT: movdqa %xmm1, %xmm0
+; SSE41-NEXT: movdqa %xmm1, %xmm0
; SSE41-NEXT: retq
;
; AVX1-LABEL: constant_shift_v8i16:
; AVX1: # BB#0:
-; AVX1-NEXT: vpsrlw $8, %xmm0, %xmm1
-; AVX1-NEXT: vmovdqa {{.*}}(%rip), %xmm2 # xmm2 = [0,4112,8224,12336,16448,20560,24672,28784]
+; AVX1-NEXT: vpsrlw $8, %xmm0, %xmm1
+; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [0,4112,8224,12336,16448,20560,24672,28784]
; AVX1-NEXT: vpblendvb %xmm2, %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vpsrlw $4, %xmm0, %xmm1
-; AVX1-NEXT: vmovdqa {{.*}}(%rip), %xmm2 # xmm2 = [0,8224,16448,24672,32896,41120,49344,57568]
+; AVX1-NEXT: vpsrlw $4, %xmm0, %xmm1
+; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [0,8224,16448,24672,32896,41120,49344,57568]
; AVX1-NEXT: vpblendvb %xmm2, %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vpsrlw $2, %xmm0, %xmm1
-; AVX1-NEXT: vmovdqa {{.*}}(%rip), %xmm2 # xmm2 = [0,16448,32896,49344,256,16704,33152,49600]
+; AVX1-NEXT: vpsrlw $2, %xmm0, %xmm1
+; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [0,16448,32896,49344,256,16704,33152,49600]
; AVX1-NEXT: vpblendvb %xmm2, %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vpsrlw $1, %xmm0, %xmm1
-; AVX1-NEXT: vmovdqa {{.*}}(%rip), %xmm2 # xmm2 = [0,32896,256,33152,512,33408,768,33664]
+; AVX1-NEXT: vpsrlw $1, %xmm0, %xmm1
+; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [0,32896,256,33152,512,33408,768,33664]
; AVX1-NEXT: vpblendvb %xmm2, %xmm1, %xmm0, %xmm0
; AVX1-NEXT: retq
;
@@ -631,9 +606,9 @@ define <8 x i16> @constant_shift_v8i16(<8 x i16> %a) {
; AVX2: # BB#0:
; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero
-; AVX2-NEXT: vpsrlvd %ymm1, %ymm0, %ymm0
-; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13],zero,zero,zero,zero,zero,zero,zero,zero,ymm0[16,17,20,21,24,25,28,29],zero,zero,zero,zero,zero,zero,zero,zero
-; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
+; AVX2-NEXT: vpsrlvd %ymm1, %ymm0, %ymm0
+; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13],zero,zero,zero,zero,zero,zero,zero,zero,ymm0[16,17,20,21,24,25,28,29],zero,zero,zero,zero,zero,zero,zero,zero
+; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
%shift = lshr <8 x i16> %a, <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7>
@@ -643,72 +618,72 @@ define <8 x i16> @constant_shift_v8i16(<8 x i16> %a) {
define <16 x i8> @constant_shift_v16i8(<16 x i8> %a) {
; SSE2-LABEL: constant_shift_v16i8:
; SSE2: # BB#0:
-; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [0,1,2,3,4,5,6,7,7,6,5,4,3,2,1,0]
-; SSE2-NEXT: psllw $5, %xmm2
-; SSE2-NEXT: pxor %xmm1, %xmm1
-; SSE2-NEXT: pxor %xmm3, %xmm3
+; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [0,1,2,3,4,5,6,7,7,6,5,4,3,2,1,0]
+; SSE2-NEXT: psllw $5, %xmm2
+; SSE2-NEXT: pxor %xmm1, %xmm1
+; SSE2-NEXT: pxor %xmm3, %xmm3
; SSE2-NEXT: pcmpgtb %xmm2, %xmm3
-; SSE2-NEXT: movdqa %xmm3, %xmm4
-; SSE2-NEXT: pandn %xmm0, %xmm4
-; SSE2-NEXT: psrlw $4, %xmm0
-; SSE2-NEXT: pand {{.*}}(%rip), %xmm0
-; SSE2-NEXT: pand %xmm3, %xmm0
-; SSE2-NEXT: por %xmm4, %xmm0
-; SSE2-NEXT: paddb %xmm2, %xmm2
-; SSE2-NEXT: pxor %xmm3, %xmm3
+; SSE2-NEXT: movdqa %xmm3, %xmm4
+; SSE2-NEXT: pandn %xmm0, %xmm4
+; SSE2-NEXT: psrlw $4, %xmm0
+; SSE2-NEXT: pand {{.*}}(%rip), %xmm0
+; SSE2-NEXT: pand %xmm3, %xmm0
+; SSE2-NEXT: por %xmm4, %xmm0
+; SSE2-NEXT: paddb %xmm2, %xmm2
+; SSE2-NEXT: pxor %xmm3, %xmm3
; SSE2-NEXT: pcmpgtb %xmm2, %xmm3
-; SSE2-NEXT: movdqa %xmm3, %xmm4
-; SSE2-NEXT: pandn %xmm0, %xmm4
-; SSE2-NEXT: psrlw $2, %xmm0
-; SSE2-NEXT: pand {{.*}}(%rip), %xmm0
-; SSE2-NEXT: pand %xmm3, %xmm0
-; SSE2-NEXT: por %xmm4, %xmm0
-; SSE2-NEXT: paddb %xmm2, %xmm2
+; SSE2-NEXT: movdqa %xmm3, %xmm4
+; SSE2-NEXT: pandn %xmm0, %xmm4
+; SSE2-NEXT: psrlw $2, %xmm0
+; SSE2-NEXT: pand {{.*}}(%rip), %xmm0
+; SSE2-NEXT: pand %xmm3, %xmm0
+; SSE2-NEXT: por %xmm4, %xmm0
+; SSE2-NEXT: paddb %xmm2, %xmm2
; SSE2-NEXT: pcmpgtb %xmm2, %xmm1
-; SSE2-NEXT: movdqa %xmm1, %xmm2
-; SSE2-NEXT: pandn %xmm0, %xmm2
-; SSE2-NEXT: psrlw $1, %xmm0
-; SSE2-NEXT: pand {{.*}}(%rip), %xmm0
-; SSE2-NEXT: pand %xmm1, %xmm0
-; SSE2-NEXT: por %xmm2, %xmm0
+; SSE2-NEXT: movdqa %xmm1, %xmm2
+; SSE2-NEXT: pandn %xmm0, %xmm2
+; SSE2-NEXT: psrlw $1, %xmm0
+; SSE2-NEXT: pand {{.*}}(%rip), %xmm0
+; SSE2-NEXT: pand %xmm1, %xmm0
+; SSE2-NEXT: por %xmm2, %xmm0
; SSE2-NEXT: retq
;
; SSE41-LABEL: constant_shift_v16i8:
; SSE41: # BB#0:
-; SSE41-NEXT: movdqa %xmm0, %xmm1
-; SSE41-NEXT: movdqa {{.*#+}} xmm0 = [0,1,2,3,4,5,6,7,7,6,5,4,3,2,1,0]
-; SSE41-NEXT: psllw $5, %xmm0
-; SSE41-NEXT: movdqa %xmm1, %xmm2
-; SSE41-NEXT: psrlw $4, %xmm2
-; SSE41-NEXT: pand {{.*}}(%rip), %xmm2
+; SSE41-NEXT: movdqa %xmm0, %xmm1
+; SSE41-NEXT: movdqa {{.*#+}} xmm0 = [0,1,2,3,4,5,6,7,7,6,5,4,3,2,1,0]
+; SSE41-NEXT: psllw $5, %xmm0
+; SSE41-NEXT: movdqa %xmm1, %xmm2
+; SSE41-NEXT: psrlw $4, %xmm2
+; SSE41-NEXT: pand {{.*}}(%rip), %xmm2
; SSE41-NEXT: pblendvb %xmm2, %xmm1
-; SSE41-NEXT: movdqa %xmm1, %xmm2
-; SSE41-NEXT: psrlw $2, %xmm2
-; SSE41-NEXT: pand {{.*}}(%rip), %xmm2
-; SSE41-NEXT: paddb %xmm0, %xmm0
+; SSE41-NEXT: movdqa %xmm1, %xmm2
+; SSE41-NEXT: psrlw $2, %xmm2
+; SSE41-NEXT: pand {{.*}}(%rip), %xmm2
+; SSE41-NEXT: paddb %xmm0, %xmm0
; SSE41-NEXT: pblendvb %xmm2, %xmm1
-; SSE41-NEXT: movdqa %xmm1, %xmm2
-; SSE41-NEXT: psrlw $1, %xmm2
-; SSE41-NEXT: pand {{.*}}(%rip), %xmm2
-; SSE41-NEXT: paddb %xmm0, %xmm0
+; SSE41-NEXT: movdqa %xmm1, %xmm2
+; SSE41-NEXT: psrlw $1, %xmm2
+; SSE41-NEXT: pand {{.*}}(%rip), %xmm2
+; SSE41-NEXT: paddb %xmm0, %xmm0
; SSE41-NEXT: pblendvb %xmm2, %xmm1
-; SSE41-NEXT: movdqa %xmm1, %xmm0
+; SSE41-NEXT: movdqa %xmm1, %xmm0
; SSE41-NEXT: retq
;
; AVX-LABEL: constant_shift_v16i8:
; AVX: # BB#0:
-; AVX-NEXT: vmovdqa {{.*#+}} xmm1 = [0,1,2,3,4,5,6,7,7,6,5,4,3,2,1,0]
-; AVX-NEXT: vpsllw $5, %xmm1, %xmm1
-; AVX-NEXT: vpsrlw $4, %xmm0, %xmm2
-; AVX-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2
+; AVX-NEXT: vmovdqa {{.*#+}} xmm1 = [0,1,2,3,4,5,6,7,7,6,5,4,3,2,1,0]
+; AVX-NEXT: vpsllw $5, %xmm1, %xmm1
+; AVX-NEXT: vpsrlw $4, %xmm0, %xmm2
+; AVX-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2
; AVX-NEXT: vpblendvb %xmm1, %xmm2, %xmm0, %xmm0
-; AVX-NEXT: vpsrlw $2, %xmm0, %xmm2
-; AVX-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2
-; AVX-NEXT: vpaddb %xmm1, %xmm1, %xmm1
+; AVX-NEXT: vpsrlw $2, %xmm0, %xmm2
+; AVX-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2
+; AVX-NEXT: vpaddb %xmm1, %xmm1, %xmm1
; AVX-NEXT: vpblendvb %xmm1, %xmm2, %xmm0, %xmm0
-; AVX-NEXT: vpsrlw $1, %xmm0, %xmm2
-; AVX-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2
-; AVX-NEXT: vpaddb %xmm1, %xmm1, %xmm1
+; AVX-NEXT: vpsrlw $1, %xmm0, %xmm2
+; AVX-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2
+; AVX-NEXT: vpaddb %xmm1, %xmm1, %xmm1
; AVX-NEXT: vpblendvb %xmm1, %xmm2, %xmm0, %xmm0
; AVX-NEXT: retq
%shift = lshr <16 x i8> %a, <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0>
@@ -764,14 +739,14 @@ define <8 x i16> @splatconstant_shift_v8i16(<8 x i16> %a) {
define <16 x i8> @splatconstant_shift_v16i8(<16 x i8> %a) {
; SSE-LABEL: splatconstant_shift_v16i8:
; SSE: # BB#0:
-; SSE-NEXT: psrlw $3, %xmm0
-; SSE-NEXT: pand {{.*}}(%rip), %xmm0
+; SSE-NEXT: psrlw $3, %xmm0
+; SSE-NEXT: pand {{.*}}(%rip), %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: splatconstant_shift_v16i8:
; AVX: # BB#0:
-; AVX-NEXT: vpsrlw $3, %xmm0
-; AVX-NEXT: vpand {{.*}}(%rip), %xmm0
+; AVX-NEXT: vpsrlw $3, %xmm0, %xmm0
+; AVX-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0
; AVX-NEXT: retq
%shift = lshr <16 x i8> %a, <i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3>
ret <16 x i8> %shift
diff --git a/test/CodeGen/X86/vector-shift-lshr-256.ll b/test/CodeGen/X86/vector-shift-lshr-256.ll
index d200abd5f875..bb0cceed7720 100644
--- a/test/CodeGen/X86/vector-shift-lshr-256.ll
+++ b/test/CodeGen/X86/vector-shift-lshr-256.ll
@@ -33,39 +33,30 @@ define <8 x i32> @var_shift_v8i32(<8 x i32> %a, <8 x i32> %b) {
; AVX1-LABEL: var_shift_v8i32:
; AVX1: # BB#0:
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
-; AVX1-NEXT: vpextrd $1, %xmm2, %eax
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3
-; AVX1-NEXT: vpextrd $1, %xmm3, %ecx
-; AVX1-NEXT: shrl %cl, %eax
-; AVX1-NEXT: vmovd %xmm2, %edx
-; AVX1-NEXT: vmovd %xmm3, %ecx
-; AVX1-NEXT: shrl %cl, %edx
-; AVX1-NEXT: vmovd %edx, %xmm4
-; AVX1-NEXT: vpinsrd $1, %eax, %xmm4, %xmm4
-; AVX1-NEXT: vpextrd $2, %xmm2, %eax
-; AVX1-NEXT: vpextrd $2, %xmm3, %ecx
-; AVX1-NEXT: shrl %cl, %eax
-; AVX1-NEXT: vpinsrd $2, %eax, %xmm4, %xmm4
-; AVX1-NEXT: vpextrd $3, %xmm2, %eax
-; AVX1-NEXT: vpextrd $3, %xmm3, %ecx
-; AVX1-NEXT: shrl %cl, %eax
-; AVX1-NEXT: vpinsrd $3, %eax, %xmm4, %xmm2
-; AVX1-NEXT: vpextrd $1, %xmm0, %eax
-; AVX1-NEXT: vpextrd $1, %xmm1, %ecx
-; AVX1-NEXT: shrl %cl, %eax
-; AVX1-NEXT: vmovd %xmm0, %edx
-; AVX1-NEXT: vmovd %xmm1, %ecx
-; AVX1-NEXT: shrl %cl, %edx
-; AVX1-NEXT: vmovd %edx, %xmm3
-; AVX1-NEXT: vpinsrd $1, %eax, %xmm3, %xmm3
-; AVX1-NEXT: vpextrd $2, %xmm0, %eax
-; AVX1-NEXT: vpextrd $2, %xmm1, %ecx
-; AVX1-NEXT: shrl %cl, %eax
-; AVX1-NEXT: vpinsrd $2, %eax, %xmm3, %xmm3
-; AVX1-NEXT: vpextrd $3, %xmm0, %eax
-; AVX1-NEXT: vpextrd $3, %xmm1, %ecx
-; AVX1-NEXT: shrl %cl, %eax
-; AVX1-NEXT: vpinsrd $3, %eax, %xmm3, %xmm0
+; AVX1-NEXT: vpsrldq {{.*#+}} xmm4 = xmm3[12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; AVX1-NEXT: vpsrld %xmm4, %xmm2, %xmm4
+; AVX1-NEXT: vpsrlq $32, %xmm3, %xmm5
+; AVX1-NEXT: vpsrld %xmm5, %xmm2, %xmm5
+; AVX1-NEXT: vpblendw {{.*#+}} xmm4 = xmm5[0,1,2,3],xmm4[4,5,6,7]
+; AVX1-NEXT: vpxor %xmm5, %xmm5, %xmm5
+; AVX1-NEXT: vpunpckhdq {{.*#+}} xmm6 = xmm3[2],xmm5[2],xmm3[3],xmm5[3]
+; AVX1-NEXT: vpsrld %xmm6, %xmm2, %xmm6
+; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm3 = xmm3[0],zero,xmm3[1],zero
+; AVX1-NEXT: vpsrld %xmm3, %xmm2, %xmm2
+; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm6[4,5,6,7]
+; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm4[2,3],xmm2[4,5],xmm4[6,7]
+; AVX1-NEXT: vpsrldq {{.*#+}} xmm3 = xmm1[12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; AVX1-NEXT: vpsrld %xmm3, %xmm0, %xmm3
+; AVX1-NEXT: vpsrlq $32, %xmm1, %xmm4
+; AVX1-NEXT: vpsrld %xmm4, %xmm0, %xmm4
+; AVX1-NEXT: vpblendw {{.*#+}} xmm3 = xmm4[0,1,2,3],xmm3[4,5,6,7]
+; AVX1-NEXT: vpunpckhdq {{.*#+}} xmm4 = xmm1[2],xmm5[2],xmm1[3],xmm5[3]
+; AVX1-NEXT: vpsrld %xmm4, %xmm0, %xmm4
+; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero
+; AVX1-NEXT: vpsrld %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm4[4,5,6,7]
+; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm3[2,3],xmm0[4,5],xmm3[6,7]
; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
; AVX1-NEXT: retq
;
@@ -167,17 +158,17 @@ define <32 x i8> @var_shift_v32i8(<32 x i8> %a, <32 x i8> %b) {
;
; AVX2-LABEL: var_shift_v32i8:
; AVX2: # BB#0:
-; AVX2-NEXT: vpsllw $5, %ymm1, %ymm1
-; AVX2-NEXT: vpsrlw $4, %ymm0, %ymm2
-; AVX2-NEXT: vpand {{.*}}(%rip), %ymm2, %ymm2
+; AVX2-NEXT: vpsllw $5, %ymm1, %ymm1
+; AVX2-NEXT: vpsrlw $4, %ymm0, %ymm2
+; AVX2-NEXT: vpand {{.*}}(%rip), %ymm2, %ymm2
; AVX2-NEXT: vpblendvb %ymm1, %ymm2, %ymm0, %ymm0
-; AVX2-NEXT: vpsrlw $2, %ymm0, %ymm2
-; AVX2-NEXT: vpand {{.*}}(%rip), %ymm2, %ymm2
-; AVX2-NEXT: vpaddb %ymm1, %ymm1, %ymm1
+; AVX2-NEXT: vpsrlw $2, %ymm0, %ymm2
+; AVX2-NEXT: vpand {{.*}}(%rip), %ymm2, %ymm2
+; AVX2-NEXT: vpaddb %ymm1, %ymm1, %ymm1
; AVX2-NEXT: vpblendvb %ymm1, %ymm2, %ymm0, %ymm0
-; AVX2-NEXT: vpsrlw $1, %ymm0, %ymm2
-; AVX2-NEXT: vpand {{.*}}(%rip), %ymm2, %ymm2
-; AVX2-NEXT: vpaddb %ymm1, %ymm1, %ymm1
+; AVX2-NEXT: vpsrlw $1, %ymm0, %ymm2
+; AVX2-NEXT: vpand {{.*}}(%rip), %ymm2, %ymm2
+; AVX2-NEXT: vpaddb %ymm1, %ymm1, %ymm1
; AVX2-NEXT: vpblendvb %ymm1, %ymm2, %ymm0, %ymm0
; AVX2-NEXT: retq
%shift = lshr <32 x i8> %a, %b
@@ -334,32 +325,20 @@ define <4 x i64> @constant_shift_v4i64(<4 x i64> %a) {
define <8 x i32> @constant_shift_v8i32(<8 x i32> %a) {
; AVX1-LABEL: constant_shift_v8i32:
; AVX1: # BB#0:
-; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
-; AVX1-NEXT: vpextrd $1, %xmm1, %eax
-; AVX1-NEXT: shrl $9, %eax
-; AVX1-NEXT: vmovd %xmm1, %ecx
-; AVX1-NEXT: shrl $8, %ecx
-; AVX1-NEXT: vmovd %ecx, %xmm2
-; AVX1-NEXT: vpinsrd $1, %eax, %xmm2, %xmm2
-; AVX1-NEXT: vpextrd $2, %xmm1, %eax
-; AVX1-NEXT: shrl $8, %eax
-; AVX1-NEXT: vpinsrd $2, %eax, %xmm2, %xmm2
-; AVX1-NEXT: vpextrd $3, %xmm1, %eax
-; AVX1-NEXT: shrl $7, %eax
-; AVX1-NEXT: vpinsrd $3, %eax, %xmm2, %xmm1
-; AVX1-NEXT: vpextrd $1, %xmm0, %eax
-; AVX1-NEXT: shrl $5, %eax
-; AVX1-NEXT: vmovd %xmm0, %ecx
-; AVX1-NEXT: shrl $4, %ecx
-; AVX1-NEXT: vmovd %ecx, %xmm2
-; AVX1-NEXT: vpinsrd $1, %eax, %xmm2, %xmm2
-; AVX1-NEXT: vpextrd $2, %xmm0, %eax
-; AVX1-NEXT: shrl $6, %eax
-; AVX1-NEXT: vpinsrd $2, %eax, %xmm2, %xmm2
-; AVX1-NEXT: vpextrd $3, %xmm0, %eax
-; AVX1-NEXT: shrl $7, %eax
-; AVX1-NEXT: vpinsrd $3, %eax, %xmm2, %xmm0
-; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX1-NEXT: vpsrld $7, %xmm0, %xmm1
+; AVX1-NEXT: vpsrld $5, %xmm0, %xmm2
+; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1,2,3],xmm1[4,5,6,7]
+; AVX1-NEXT: vpsrld $6, %xmm0, %xmm2
+; AVX1-NEXT: vpsrld $4, %xmm0, %xmm3
+; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0,1,2,3],xmm2[4,5,6,7]
+; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3],xmm2[4,5],xmm1[6,7]
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
+; AVX1-NEXT: vpsrld $7, %xmm0, %xmm2
+; AVX1-NEXT: vpsrld $9, %xmm0, %xmm3
+; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0,1,2,3],xmm2[4,5,6,7]
+; AVX1-NEXT: vpsrld $8, %xmm0, %xmm0
+; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7]
+; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: constant_shift_v8i32:
@@ -453,18 +432,18 @@ define <32 x i8> @constant_shift_v32i8(<32 x i8> %a) {
;
; AVX2-LABEL: constant_shift_v32i8:
; AVX2: # BB#0:
-; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [0,1,2,3,4,5,6,7,7,6,5,4,3,2,1,0,0,1,2,3,4,5,6,7,7,6,5,4,3,2,1,0]
-; AVX2-NEXT: vpsllw $5, %ymm1, %ymm1
-; AVX2-NEXT: vpsrlw $4, %ymm0, %ymm2
-; AVX2-NEXT: vpand {{.*}}(%rip), %ymm2, %ymm2
+; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [0,1,2,3,4,5,6,7,7,6,5,4,3,2,1,0,0,1,2,3,4,5,6,7,7,6,5,4,3,2,1,0]
+; AVX2-NEXT: vpsllw $5, %ymm1, %ymm1
+; AVX2-NEXT: vpsrlw $4, %ymm0, %ymm2
+; AVX2-NEXT: vpand {{.*}}(%rip), %ymm2, %ymm2
; AVX2-NEXT: vpblendvb %ymm1, %ymm2, %ymm0, %ymm0
-; AVX2-NEXT: vpsrlw $2, %ymm0, %ymm2
-; AVX2-NEXT: vpand {{.*}}(%rip), %ymm2, %ymm2
-; AVX2-NEXT: vpaddb %ymm1, %ymm1, %ymm1
+; AVX2-NEXT: vpsrlw $2, %ymm0, %ymm2
+; AVX2-NEXT: vpand {{.*}}(%rip), %ymm2, %ymm2
+; AVX2-NEXT: vpaddb %ymm1, %ymm1, %ymm1
; AVX2-NEXT: vpblendvb %ymm1, %ymm2, %ymm0, %ymm0
-; AVX2-NEXT: vpsrlw $1, %ymm0, %ymm2
-; AVX2-NEXT: vpand {{.*}}(%rip), %ymm2, %ymm2
-; AVX2-NEXT: vpaddb %ymm1, %ymm1, %ymm1
+; AVX2-NEXT: vpsrlw $1, %ymm0, %ymm2
+; AVX2-NEXT: vpand {{.*}}(%rip), %ymm2, %ymm2
+; AVX2-NEXT: vpaddb %ymm1, %ymm1, %ymm1
; AVX2-NEXT: vpblendvb %ymm1, %ymm2, %ymm0, %ymm0
; AVX2-NEXT: retq
%shift = lshr <32 x i8> %a, <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0>
@@ -540,8 +519,8 @@ define <32 x i8> @splatconstant_shift_v32i8(<32 x i8> %a) {
;
; AVX2-LABEL: splatconstant_shift_v32i8:
; AVX2: # BB#0:
-; AVX2-NEXT: vpsrlw $3, %ymm0
-; AVX2-NEXT: vpand {{.*}}(%rip), %ymm0
+; AVX2-NEXT: vpsrlw $3, %ymm0, %ymm0
+; AVX2-NEXT: vpand {{.*}}(%rip), %ymm0, %ymm0
; AVX2-NEXT: retq
%shift = lshr <32 x i8> %a, <i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3>
ret <32 x i8> %shift
diff --git a/test/CodeGen/X86/vector-shift-shl-128.ll b/test/CodeGen/X86/vector-shift-shl-128.ll
index 3ac31ea63676..6dbd9eab2a72 100644
--- a/test/CodeGen/X86/vector-shift-shl-128.ll
+++ b/test/CodeGen/X86/vector-shift-shl-128.ll
@@ -12,26 +12,26 @@ define <2 x i64> @var_shift_v2i64(<2 x i64> %a, <2 x i64> %b) {
; SSE2: # BB#0:
; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm1[2,3,0,1]
; SSE2-NEXT: movdqa %xmm0, %xmm2
-; SSE2-NEXT: psllq %xmm3, %xmm2
-; SSE2-NEXT: psllq %xmm1, %xmm0
-; SSE2-NEXT: movsd {{.*#+}} xmm2 = xmm0[0],xmm2[1]
+; SSE2-NEXT: psllq %xmm3, %xmm2
+; SSE2-NEXT: psllq %xmm1, %xmm0
+; SSE2-NEXT: movsd {{.*#+}} xmm2 = xmm0[0],xmm2[1]
; SSE2-NEXT: movapd %xmm2, %xmm0
; SSE2-NEXT: retq
;
; SSE41-LABEL: var_shift_v2i64:
; SSE41: # BB#0:
-; SSE41-NEXT: movdqa %xmm0, %xmm2
-; SSE41-NEXT: psllq %xmm1, %xmm2
-; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
-; SSE41-NEXT: psllq %xmm1, %xmm0
+; SSE41-NEXT: movdqa %xmm0, %xmm2
+; SSE41-NEXT: psllq %xmm1, %xmm2
+; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
+; SSE41-NEXT: psllq %xmm1, %xmm0
; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm2[0,1,2,3],xmm0[4,5,6,7]
; SSE41-NEXT: retq
;
; AVX1-LABEL: var_shift_v2i64:
; AVX1: # BB#0:
-; AVX1-NEXT: vpsllq %xmm1, %xmm0, %xmm2
-; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
-; AVX1-NEXT: vpsllq %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: vpsllq %xmm1, %xmm0, %xmm2
+; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
+; AVX1-NEXT: vpsllq %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm2[0,1,2,3],xmm0[4,5,6,7]
; AVX1-NEXT: retq
;
@@ -46,33 +46,33 @@ define <2 x i64> @var_shift_v2i64(<2 x i64> %a, <2 x i64> %b) {
define <4 x i32> @var_shift_v4i32(<4 x i32> %a, <4 x i32> %b) {
; SSE2-LABEL: var_shift_v4i32:
; SSE2: # BB#0:
-; SSE2-NEXT: pslld $23, %xmm1
-; SSE2-NEXT: paddd {{.*}}(%rip), %xmm1
+; SSE2-NEXT: pslld $23, %xmm1
+; SSE2-NEXT: paddd {{.*}}(%rip), %xmm1
; SSE2-NEXT: cvttps2dq %xmm1, %xmm1
-; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm1[1,1,3,3]
-; SSE2-NEXT: pmuludq %xmm0, %xmm1
-; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
-; SSE2-NEXT: pmuludq %xmm2, %xmm0
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm1[1,1,3,3]
+; SSE2-NEXT: pmuludq %xmm0, %xmm1
+; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
+; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
+; SSE2-NEXT: pmuludq %xmm2, %xmm0
+; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
-; SSE2-NEXT: movdqa %xmm1, %xmm0
+; SSE2-NEXT: movdqa %xmm1, %xmm0
; SSE2-NEXT: retq
;
; SSE41-LABEL: var_shift_v4i32:
; SSE41: # BB#0:
-; SSE41-NEXT: pslld $23, %xmm1
-; SSE41-NEXT: paddd {{.*}}(%rip), %xmm1
+; SSE41-NEXT: pslld $23, %xmm1
+; SSE41-NEXT: paddd {{.*}}(%rip), %xmm1
; SSE41-NEXT: cvttps2dq %xmm1, %xmm1
-; SSE41-NEXT: pmulld %xmm1, %xmm0
+; SSE41-NEXT: pmulld %xmm1, %xmm0
; SSE41-NEXT: retq
;
; AVX1-LABEL: var_shift_v4i32:
; AVX1: # BB#0:
-; AVX1-NEXT: vpslld $23, %xmm1, %xmm1
-; AVX1-NEXT: vpaddd {{.*}}(%rip), %xmm1, %xmm1
+; AVX1-NEXT: vpslld $23, %xmm1, %xmm1
+; AVX1-NEXT: vpaddd {{.*}}(%rip), %xmm1, %xmm1
; AVX1-NEXT: vcvttps2dq %xmm1, %xmm1
-; AVX1-NEXT: vpmulld %xmm0, %xmm1, %xmm0
+; AVX1-NEXT: vpmulld %xmm0, %xmm1, %xmm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: var_shift_v4i32:
@@ -86,84 +86,84 @@ define <4 x i32> @var_shift_v4i32(<4 x i32> %a, <4 x i32> %b) {
define <8 x i16> @var_shift_v8i16(<8 x i16> %a, <8 x i16> %b) {
; SSE2-LABEL: var_shift_v8i16:
; SSE2: # BB#0:
-; SSE2-NEXT: psllw $12, %xmm1
+; SSE2-NEXT: psllw $12, %xmm1
; SSE2-NEXT: movdqa %xmm1, %xmm2
-; SSE2-NEXT: psraw $15, %xmm2
+; SSE2-NEXT: psraw $15, %xmm2
; SSE2-NEXT: movdqa %xmm2, %xmm3
-; SSE2-NEXT: pandn %xmm0, %xmm3
-; SSE2-NEXT: psllw $8, %xmm0
-; SSE2-NEXT: pand %xmm2, %xmm0
-; SSE2-NEXT: por %xmm3, %xmm0
-; SSE2-NEXT: paddw %xmm1, %xmm1
+; SSE2-NEXT: pandn %xmm0, %xmm3
+; SSE2-NEXT: psllw $8, %xmm0
+; SSE2-NEXT: pand %xmm2, %xmm0
+; SSE2-NEXT: por %xmm3, %xmm0
+; SSE2-NEXT: paddw %xmm1, %xmm1
; SSE2-NEXT: movdqa %xmm1, %xmm2
-; SSE2-NEXT: psraw $15, %xmm2
+; SSE2-NEXT: psraw $15, %xmm2
; SSE2-NEXT: movdqa %xmm2, %xmm3
-; SSE2-NEXT: pandn %xmm0, %xmm3
-; SSE2-NEXT: psllw $4, %xmm0
-; SSE2-NEXT: pand %xmm2, %xmm0
-; SSE2-NEXT: por %xmm3, %xmm0
-; SSE2-NEXT: paddw %xmm1, %xmm1
+; SSE2-NEXT: pandn %xmm0, %xmm3
+; SSE2-NEXT: psllw $4, %xmm0
+; SSE2-NEXT: pand %xmm2, %xmm0
+; SSE2-NEXT: por %xmm3, %xmm0
+; SSE2-NEXT: paddw %xmm1, %xmm1
; SSE2-NEXT: movdqa %xmm1, %xmm2
-; SSE2-NEXT: psraw $15, %xmm2
+; SSE2-NEXT: psraw $15, %xmm2
; SSE2-NEXT: movdqa %xmm2, %xmm3
-; SSE2-NEXT: pandn %xmm0, %xmm3
-; SSE2-NEXT: psllw $2, %xmm0
-; SSE2-NEXT: pand %xmm2, %xmm0
-; SSE2-NEXT: por %xmm3, %xmm0
-; SSE2-NEXT: paddw %xmm1, %xmm1
-; SSE2-NEXT: psraw $15, %xmm1
+; SSE2-NEXT: pandn %xmm0, %xmm3
+; SSE2-NEXT: psllw $2, %xmm0
+; SSE2-NEXT: pand %xmm2, %xmm0
+; SSE2-NEXT: por %xmm3, %xmm0
+; SSE2-NEXT: paddw %xmm1, %xmm1
+; SSE2-NEXT: psraw $15, %xmm1
; SSE2-NEXT: movdqa %xmm1, %xmm2
-; SSE2-NEXT: pandn %xmm0, %xmm2
-; SSE2-NEXT: psllw $1, %xmm0
-; SSE2-NEXT: pand %xmm1, %xmm0
-; SSE2-NEXT: por %xmm2, %xmm0
+; SSE2-NEXT: pandn %xmm0, %xmm2
+; SSE2-NEXT: psllw $1, %xmm0
+; SSE2-NEXT: pand %xmm1, %xmm0
+; SSE2-NEXT: por %xmm2, %xmm0
; SSE2-NEXT: retq
;
; SSE41-LABEL: var_shift_v8i16:
; SSE41: # BB#0:
-; SSE41-NEXT: movdqa %xmm0, %xmm2
-; SSE41-NEXT: movdqa %xmm1, %xmm0
-; SSE41-NEXT: psllw $12, %xmm0
-; SSE41-NEXT: psllw $4, %xmm1
-; SSE41-NEXT: por %xmm0, %xmm1
-; SSE41-NEXT: movdqa %xmm1, %xmm3
-; SSE41-NEXT: paddw %xmm3, %xmm3
-; SSE41-NEXT: movdqa %xmm2, %xmm4
-; SSE41-NEXT: psllw $8, %xmm4
-; SSE41-NEXT: movdqa %xmm1, %xmm0
+; SSE41-NEXT: movdqa %xmm0, %xmm2
+; SSE41-NEXT: movdqa %xmm1, %xmm0
+; SSE41-NEXT: psllw $12, %xmm0
+; SSE41-NEXT: psllw $4, %xmm1
+; SSE41-NEXT: por %xmm0, %xmm1
+; SSE41-NEXT: movdqa %xmm1, %xmm3
+; SSE41-NEXT: paddw %xmm3, %xmm3
+; SSE41-NEXT: movdqa %xmm2, %xmm4
+; SSE41-NEXT: psllw $8, %xmm4
+; SSE41-NEXT: movdqa %xmm1, %xmm0
; SSE41-NEXT: pblendvb %xmm4, %xmm2
-; SSE41-NEXT: movdqa %xmm2, %xmm1
-; SSE41-NEXT: psllw $4, %xmm1
-; SSE41-NEXT: movdqa %xmm3, %xmm0
+; SSE41-NEXT: movdqa %xmm2, %xmm1
+; SSE41-NEXT: psllw $4, %xmm1
+; SSE41-NEXT: movdqa %xmm3, %xmm0
; SSE41-NEXT: pblendvb %xmm1, %xmm2
-; SSE41-NEXT: movdqa %xmm2, %xmm1
-; SSE41-NEXT: psllw $2, %xmm1
-; SSE41-NEXT: paddw %xmm3, %xmm3
-; SSE41-NEXT: movdqa %xmm3, %xmm0
+; SSE41-NEXT: movdqa %xmm2, %xmm1
+; SSE41-NEXT: psllw $2, %xmm1
+; SSE41-NEXT: paddw %xmm3, %xmm3
+; SSE41-NEXT: movdqa %xmm3, %xmm0
; SSE41-NEXT: pblendvb %xmm1, %xmm2
-; SSE41-NEXT: movdqa %xmm2, %xmm1
-; SSE41-NEXT: psllw $1, %xmm1
-; SSE41-NEXT: paddw %xmm3, %xmm3
-; SSE41-NEXT: movdqa %xmm3, %xmm0
+; SSE41-NEXT: movdqa %xmm2, %xmm1
+; SSE41-NEXT: psllw $1, %xmm1
+; SSE41-NEXT: paddw %xmm3, %xmm3
+; SSE41-NEXT: movdqa %xmm3, %xmm0
; SSE41-NEXT: pblendvb %xmm1, %xmm2
-; SSE41-NEXT: movdqa %xmm2, %xmm0
+; SSE41-NEXT: movdqa %xmm2, %xmm0
; SSE41-NEXT: retq
;
; AVX1-LABEL: var_shift_v8i16:
; AVX1: # BB#0:
-; AVX1-NEXT: vpsllw $12, %xmm1, %xmm2
-; AVX1-NEXT: vpsllw $4, %xmm1, %xmm1
-; AVX1-NEXT: vpor %xmm2, %xmm1, %xmm1
-; AVX1-NEXT: vpaddw %xmm1, %xmm1, %xmm2
-; AVX1-NEXT: vpsllw $8, %xmm0, %xmm3
+; AVX1-NEXT: vpsllw $12, %xmm1, %xmm2
+; AVX1-NEXT: vpsllw $4, %xmm1, %xmm1
+; AVX1-NEXT: vpor %xmm2, %xmm1, %xmm1
+; AVX1-NEXT: vpaddw %xmm1, %xmm1, %xmm2
+; AVX1-NEXT: vpsllw $8, %xmm0, %xmm3
; AVX1-NEXT: vpblendvb %xmm1, %xmm3, %xmm0, %xmm0
-; AVX1-NEXT: vpsllw $4, %xmm0, %xmm1
+; AVX1-NEXT: vpsllw $4, %xmm0, %xmm1
; AVX1-NEXT: vpblendvb %xmm2, %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vpsllw $2, %xmm0, %xmm1
-; AVX1-NEXT: vpaddw %xmm2, %xmm2, %xmm2
+; AVX1-NEXT: vpsllw $2, %xmm0, %xmm1
+; AVX1-NEXT: vpaddw %xmm2, %xmm2, %xmm2
; AVX1-NEXT: vpblendvb %xmm2, %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vpsllw $1, %xmm0, %xmm1
-; AVX1-NEXT: vpaddw %xmm2, %xmm2, %xmm2
+; AVX1-NEXT: vpsllw $1, %xmm0, %xmm1
+; AVX1-NEXT: vpaddw %xmm2, %xmm2, %xmm2
; AVX1-NEXT: vpblendvb %xmm2, %xmm1, %xmm0, %xmm0
; AVX1-NEXT: retq
;
@@ -171,9 +171,9 @@ define <8 x i16> @var_shift_v8i16(<8 x i16> %a, <8 x i16> %b) {
; AVX2: # BB#0:
; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
-; AVX2-NEXT: vpsllvd %ymm1, %ymm0, %ymm0
-; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13],zero,zero,zero,zero,zero,zero,zero,zero,ymm0[16,17,20,21,24,25,28,29],zero,zero,zero,zero,zero,zero,zero,zero
-; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
+; AVX2-NEXT: vpsllvd %ymm1, %ymm0, %ymm0
+; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13],zero,zero,zero,zero,zero,zero,zero,zero,ymm0[16,17,20,21,24,25,28,29],zero,zero,zero,zero,zero,zero,zero,zero
+; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
%shift = shl <8 x i16> %a, %b
@@ -183,69 +183,69 @@ define <8 x i16> @var_shift_v8i16(<8 x i16> %a, <8 x i16> %b) {
define <16 x i8> @var_shift_v16i8(<16 x i8> %a, <16 x i8> %b) {
; SSE2-LABEL: var_shift_v16i8:
; SSE2: # BB#0:
-; SSE2-NEXT: psllw $5, %xmm1
-; SSE2-NEXT: pxor %xmm2, %xmm2
-; SSE2-NEXT: pxor %xmm3, %xmm3
-; SSE2-NEXT: pcmpgtb %xmm1, %xmm3
-; SSE2-NEXT: movdqa %xmm3, %xmm4
-; SSE2-NEXT: pandn %xmm0, %xmm4
-; SSE2-NEXT: psllw $4, %xmm0
-; SSE2-NEXT: pand {{.*}}(%rip), %xmm0
-; SSE2-NEXT: pand %xmm3, %xmm0
-; SSE2-NEXT: por %xmm4, %xmm0
-; SSE2-NEXT: paddb %xmm1, %xmm1
-; SSE2-NEXT: pxor %xmm3, %xmm3
-; SSE2-NEXT: pcmpgtb %xmm1, %xmm3
-; SSE2-NEXT: movdqa %xmm3, %xmm4
-; SSE2-NEXT: pandn %xmm0, %xmm4
-; SSE2-NEXT: psllw $2, %xmm0
-; SSE2-NEXT: pand {{.*}}(%rip), %xmm0
-; SSE2-NEXT: pand %xmm3, %xmm0
-; SSE2-NEXT: por %xmm4, %xmm0
-; SSE2-NEXT: paddb %xmm1, %xmm1
-; SSE2-NEXT: pcmpgtb %xmm1, %xmm2
-; SSE2-NEXT: movdqa %xmm2, %xmm1
-; SSE2-NEXT: pandn %xmm0, %xmm1
-; SSE2-NEXT: paddb %xmm0, %xmm0
-; SSE2-NEXT: pand %xmm2, %xmm0
-; SSE2-NEXT: por %xmm1, %xmm0
-; SSE2-NEXT: retq
+; SSE2-NEXT: psllw $5, %xmm1
+; SSE2-NEXT: pxor %xmm2, %xmm2
+; SSE2-NEXT: pxor %xmm3, %xmm3
+; SSE2-NEXT: pcmpgtb %xmm1, %xmm3
+; SSE2-NEXT: movdqa %xmm3, %xmm4
+; SSE2-NEXT: pandn %xmm0, %xmm4
+; SSE2-NEXT: psllw $4, %xmm0
+; SSE2-NEXT: pand {{.*}}(%rip), %xmm0
+; SSE2-NEXT: pand %xmm3, %xmm0
+; SSE2-NEXT: por %xmm4, %xmm0
+; SSE2-NEXT: paddb %xmm1, %xmm1
+; SSE2-NEXT: pxor %xmm3, %xmm3
+; SSE2-NEXT: pcmpgtb %xmm1, %xmm3
+; SSE2-NEXT: movdqa %xmm3, %xmm4
+; SSE2-NEXT: pandn %xmm0, %xmm4
+; SSE2-NEXT: psllw $2, %xmm0
+; SSE2-NEXT: pand {{.*}}(%rip), %xmm0
+; SSE2-NEXT: pand %xmm3, %xmm0
+; SSE2-NEXT: por %xmm4, %xmm0
+; SSE2-NEXT: paddb %xmm1, %xmm1
+; SSE2-NEXT: pcmpgtb %xmm1, %xmm2
+; SSE2-NEXT: movdqa %xmm2, %xmm1
+; SSE2-NEXT: pandn %xmm0, %xmm1
+; SSE2-NEXT: paddb %xmm0, %xmm0
+; SSE2-NEXT: pand %xmm2, %xmm0
+; SSE2-NEXT: por %xmm1, %xmm0
+; SSE2-NEXT: retq
;
; SSE41-LABEL: var_shift_v16i8:
; SSE41: # BB#0:
-; SSE41-NEXT: movdqa %xmm0, %xmm2
-; SSE41-NEXT: psllw $5, %xmm1
-; SSE41-NEXT: movdqa %xmm2, %xmm3
-; SSE41-NEXT: psllw $4, %xmm3
-; SSE41-NEXT: pand {{.*}}(%rip), %xmm3
-; SSE41-NEXT: movdqa %xmm1, %xmm0
+; SSE41-NEXT: movdqa %xmm0, %xmm2
+; SSE41-NEXT: psllw $5, %xmm1
+; SSE41-NEXT: movdqa %xmm2, %xmm3
+; SSE41-NEXT: psllw $4, %xmm3
+; SSE41-NEXT: pand {{.*}}(%rip), %xmm3
+; SSE41-NEXT: movdqa %xmm1, %xmm0
; SSE41-NEXT: pblendvb %xmm3, %xmm2
-; SSE41-NEXT: movdqa %xmm2, %xmm3
-; SSE41-NEXT: psllw $2, %xmm3
-; SSE41-NEXT: pand {{.*}}(%rip), %xmm3
-; SSE41-NEXT: paddb %xmm1, %xmm1
-; SSE41-NEXT: movdqa %xmm1, %xmm0
+; SSE41-NEXT: movdqa %xmm2, %xmm3
+; SSE41-NEXT: psllw $2, %xmm3
+; SSE41-NEXT: pand {{.*}}(%rip), %xmm3
+; SSE41-NEXT: paddb %xmm1, %xmm1
+; SSE41-NEXT: movdqa %xmm1, %xmm0
; SSE41-NEXT: pblendvb %xmm3, %xmm2
-; SSE41-NEXT: movdqa %xmm2, %xmm3
-; SSE41-NEXT: paddb %xmm3, %xmm3
-; SSE41-NEXT: paddb %xmm1, %xmm1
-; SSE41-NEXT: movdqa %xmm1, %xmm0
+; SSE41-NEXT: movdqa %xmm2, %xmm3
+; SSE41-NEXT: paddb %xmm3, %xmm3
+; SSE41-NEXT: paddb %xmm1, %xmm1
+; SSE41-NEXT: movdqa %xmm1, %xmm0
; SSE41-NEXT: pblendvb %xmm3, %xmm2
-; SSE41-NEXT: movdqa %xmm2, %xmm0
+; SSE41-NEXT: movdqa %xmm2, %xmm0
; SSE41-NEXT: retq
;
; AVX-LABEL: var_shift_v16i8:
; AVX: # BB#0:
-; AVX-NEXT: vpsllw $5, %xmm1, %xmm1
-; AVX-NEXT: vpsllw $4, %xmm0, %xmm2
-; AVX-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2
+; AVX-NEXT: vpsllw $5, %xmm1, %xmm1
+; AVX-NEXT: vpsllw $4, %xmm0, %xmm2
+; AVX-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2
; AVX-NEXT: vpblendvb %xmm1, %xmm2, %xmm0, %xmm0
-; AVX-NEXT: vpsllw $2, %xmm0, %xmm2
-; AVX-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2
-; AVX-NEXT: vpaddb %xmm1, %xmm1, %xmm1
+; AVX-NEXT: vpsllw $2, %xmm0, %xmm2
+; AVX-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2
+; AVX-NEXT: vpaddb %xmm1, %xmm1, %xmm1
; AVX-NEXT: vpblendvb %xmm1, %xmm2, %xmm0, %xmm0
-; AVX-NEXT: vpaddb %xmm0, %xmm0, %xmm2
-; AVX-NEXT: vpaddb %xmm1, %xmm1, %xmm1
+; AVX-NEXT: vpaddb %xmm0, %xmm0, %xmm2
+; AVX-NEXT: vpaddb %xmm1, %xmm1, %xmm1
; AVX-NEXT: vpblendvb %xmm1, %xmm2, %xmm0, %xmm0
; AVX-NEXT: retq
%shift = shl <16 x i8> %a, %b
@@ -300,10 +300,10 @@ define <4 x i32> @splatvar_shift_v4i32(<4 x i32> %a, <4 x i32> %b) {
define <8 x i16> @splatvar_shift_v8i16(<8 x i16> %a, <8 x i16> %b) {
; SSE2-LABEL: splatvar_shift_v8i16:
; SSE2: # BB#0:
-; SSE2-NEXT: movd %xmm1, %eax
+; SSE2-NEXT: movd %xmm1, %eax
; SSE2-NEXT: movzwl %ax, %eax
-; SSE2-NEXT: movd %eax, %xmm1
-; SSE2-NEXT: psllw %xmm1, %xmm0
+; SSE2-NEXT: movd %eax, %xmm1
+; SSE2-NEXT: psllw %xmm1, %xmm0
; SSE2-NEXT: retq
;
; SSE41-LABEL: splatvar_shift_v8i16:
@@ -327,95 +327,95 @@ define <8 x i16> @splatvar_shift_v8i16(<8 x i16> %a, <8 x i16> %b) {
define <16 x i8> @splatvar_shift_v16i8(<16 x i8> %a, <16 x i8> %b) {
; SSE2-LABEL: splatvar_shift_v16i8:
; SSE2: # BB#0:
-; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
-; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,0,3]
-; SSE2-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,0,0,0,4,5,6,7]
-; SSE2-NEXT: pshufhw {{.*#+}} xmm2 = xmm1[0,1,2,3,4,4,4,4]
-; SSE2-NEXT: psllw $5, %xmm2
-; SSE2-NEXT: pxor %xmm1, %xmm1
-; SSE2-NEXT: pxor %xmm3, %xmm3
-; SSE2-NEXT: pcmpgtb %xmm2, %xmm3
-; SSE2-NEXT: movdqa %xmm3, %xmm4
-; SSE2-NEXT: pandn %xmm0, %xmm4
-; SSE2-NEXT: psllw $4, %xmm0
-; SSE2-NEXT: pand {{.*}}(%rip), %xmm0
-; SSE2-NEXT: pand %xmm3, %xmm0
-; SSE2-NEXT: por %xmm4, %xmm0
-; SSE2-NEXT: paddb %xmm2, %xmm2
-; SSE2-NEXT: pxor %xmm3, %xmm3
-; SSE2-NEXT: pcmpgtb %xmm2, %xmm3
-; SSE2-NEXT: movdqa %xmm3, %xmm4
-; SSE2-NEXT: pandn %xmm0, %xmm4
-; SSE2-NEXT: psllw $2, %xmm0
-; SSE2-NEXT: pand {{.*}}(%rip), %xmm0
-; SSE2-NEXT: pand %xmm3, %xmm0
-; SSE2-NEXT: por %xmm4, %xmm0
-; SSE2-NEXT: paddb %xmm2, %xmm2
-; SSE2-NEXT: pcmpgtb %xmm2, %xmm1
-; SSE2-NEXT: movdqa %xmm1, %xmm2
-; SSE2-NEXT: pandn %xmm0, %xmm2
-; SSE2-NEXT: paddb %xmm0, %xmm0
-; SSE2-NEXT: pand %xmm1, %xmm0
-; SSE2-NEXT: por %xmm2, %xmm0
-; SSE2-NEXT: retq
+; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
+; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,0,3]
+; SSE2-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,0,0,0,4,5,6,7]
+; SSE2-NEXT: pshufhw {{.*#+}} xmm2 = xmm1[0,1,2,3,4,4,4,4]
+; SSE2-NEXT: psllw $5, %xmm2
+; SSE2-NEXT: pxor %xmm1, %xmm1
+; SSE2-NEXT: pxor %xmm3, %xmm3
+; SSE2-NEXT: pcmpgtb %xmm2, %xmm3
+; SSE2-NEXT: movdqa %xmm3, %xmm4
+; SSE2-NEXT: pandn %xmm0, %xmm4
+; SSE2-NEXT: psllw $4, %xmm0
+; SSE2-NEXT: pand {{.*}}(%rip), %xmm0
+; SSE2-NEXT: pand %xmm3, %xmm0
+; SSE2-NEXT: por %xmm4, %xmm0
+; SSE2-NEXT: paddb %xmm2, %xmm2
+; SSE2-NEXT: pxor %xmm3, %xmm3
+; SSE2-NEXT: pcmpgtb %xmm2, %xmm3
+; SSE2-NEXT: movdqa %xmm3, %xmm4
+; SSE2-NEXT: pandn %xmm0, %xmm4
+; SSE2-NEXT: psllw $2, %xmm0
+; SSE2-NEXT: pand {{.*}}(%rip), %xmm0
+; SSE2-NEXT: pand %xmm3, %xmm0
+; SSE2-NEXT: por %xmm4, %xmm0
+; SSE2-NEXT: paddb %xmm2, %xmm2
+; SSE2-NEXT: pcmpgtb %xmm2, %xmm1
+; SSE2-NEXT: movdqa %xmm1, %xmm2
+; SSE2-NEXT: pandn %xmm0, %xmm2
+; SSE2-NEXT: paddb %xmm0, %xmm0
+; SSE2-NEXT: pand %xmm1, %xmm0
+; SSE2-NEXT: por %xmm2, %xmm0
+; SSE2-NEXT: retq
;
; SSE41-LABEL: splatvar_shift_v16i8:
; SSE41: # BB#0:
-; SSE41-NEXT: movdqa %xmm0, %xmm2
-; SSE41-NEXT: pxor %xmm0, %xmm0
-; SSE41-NEXT: pshufb %xmm0, %xmm1
-; SSE41-NEXT: psllw $5, %xmm1
-; SSE41-NEXT: movdqa %xmm1, %xmm3
-; SSE41-NEXT: paddb %xmm3, %xmm3
-; SSE41-NEXT: movdqa %xmm2, %xmm4
-; SSE41-NEXT: psllw $4, %xmm4
-; SSE41-NEXT: pand {{.*}}(%rip), %xmm4
-; SSE41-NEXT: movdqa %xmm1, %xmm0
+; SSE41-NEXT: movdqa %xmm0, %xmm2
+; SSE41-NEXT: pxor %xmm0, %xmm0
+; SSE41-NEXT: pshufb %xmm0, %xmm1
+; SSE41-NEXT: psllw $5, %xmm1
+; SSE41-NEXT: movdqa %xmm1, %xmm3
+; SSE41-NEXT: paddb %xmm3, %xmm3
+; SSE41-NEXT: movdqa %xmm2, %xmm4
+; SSE41-NEXT: psllw $4, %xmm4
+; SSE41-NEXT: pand {{.*}}(%rip), %xmm4
+; SSE41-NEXT: movdqa %xmm1, %xmm0
; SSE41-NEXT: pblendvb %xmm4, %xmm2
-; SSE41-NEXT: movdqa %xmm2, %xmm1
-; SSE41-NEXT: psllw $2, %xmm1
-; SSE41-NEXT: pand {{.*}}(%rip), %xmm1
-; SSE41-NEXT: movdqa %xmm3, %xmm0
+; SSE41-NEXT: movdqa %xmm2, %xmm1
+; SSE41-NEXT: psllw $2, %xmm1
+; SSE41-NEXT: pand {{.*}}(%rip), %xmm1
+; SSE41-NEXT: movdqa %xmm3, %xmm0
; SSE41-NEXT: pblendvb %xmm1, %xmm2
-; SSE41-NEXT: movdqa %xmm2, %xmm1
-; SSE41-NEXT: paddb %xmm1, %xmm1
-; SSE41-NEXT: paddb %xmm3, %xmm3
-; SSE41-NEXT: movdqa %xmm3, %xmm0
+; SSE41-NEXT: movdqa %xmm2, %xmm1
+; SSE41-NEXT: paddb %xmm1, %xmm1
+; SSE41-NEXT: paddb %xmm3, %xmm3
+; SSE41-NEXT: movdqa %xmm3, %xmm0
; SSE41-NEXT: pblendvb %xmm1, %xmm2
-; SSE41-NEXT: movdqa %xmm2, %xmm0
+; SSE41-NEXT: movdqa %xmm2, %xmm0
; SSE41-NEXT: retq
;
; AVX1-LABEL: splatvar_shift_v16i8:
; AVX1: # BB#0:
-; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1
-; AVX1-NEXT: vpsllw $5, %xmm1, %xmm1
-; AVX1-NEXT: vpaddb %xmm1, %xmm1, %xmm2
-; AVX1-NEXT: vpsllw $4, %xmm0, %xmm3
-; AVX1-NEXT: vpand {{.*}}(%rip), %xmm3, %xmm3
+; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1
+; AVX1-NEXT: vpsllw $5, %xmm1, %xmm1
+; AVX1-NEXT: vpaddb %xmm1, %xmm1, %xmm2
+; AVX1-NEXT: vpsllw $4, %xmm0, %xmm3
+; AVX1-NEXT: vpand {{.*}}(%rip), %xmm3, %xmm3
; AVX1-NEXT: vpblendvb %xmm1, %xmm3, %xmm0, %xmm0
-; AVX1-NEXT: vpsllw $2, %xmm0, %xmm1
-; AVX1-NEXT: vpand {{.*}}(%rip), %xmm1, %xmm1
+; AVX1-NEXT: vpsllw $2, %xmm0, %xmm1
+; AVX1-NEXT: vpand {{.*}}(%rip), %xmm1, %xmm1
; AVX1-NEXT: vpblendvb %xmm2, %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vpaddb %xmm0, %xmm0, %xmm1
-; AVX1-NEXT: vpaddb %xmm2, %xmm2, %xmm2
+; AVX1-NEXT: vpaddb %xmm0, %xmm0, %xmm1
+; AVX1-NEXT: vpaddb %xmm2, %xmm2, %xmm2
; AVX1-NEXT: vpblendvb %xmm2, %xmm1, %xmm0, %xmm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: splatvar_shift_v16i8:
; AVX2: # BB#0:
; AVX2-NEXT: vpbroadcastb %xmm1, %xmm1
-; AVX2-NEXT: vpsllw $5, %xmm1, %xmm1
-; AVX2-NEXT: vpsllw $4, %xmm0, %xmm2
-; AVX2-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2
-; AVX2-NEXT: vpblendvb %xmm1, %xmm2, %xmm0, %xmm0
-; AVX2-NEXT: vpsllw $2, %xmm0, %xmm2
-; AVX2-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2
-; AVX2-NEXT: vpaddb %xmm1, %xmm1, %xmm1
-; AVX2-NEXT: vpblendvb %xmm1, %xmm2, %xmm0, %xmm0
-; AVX2-NEXT: vpaddb %xmm0, %xmm0, %xmm2
-; AVX2-NEXT: vpaddb %xmm1, %xmm1, %xmm1
-; AVX2-NEXT: vpblendvb %xmm1, %xmm2, %xmm0, %xmm0
+; AVX2-NEXT: vpsllw $5, %xmm1, %xmm1
+; AVX2-NEXT: vpsllw $4, %xmm0, %xmm2
+; AVX2-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2
+; AVX2-NEXT: vpblendvb %xmm1, %xmm2, %xmm0, %xmm0
+; AVX2-NEXT: vpsllw $2, %xmm0, %xmm2
+; AVX2-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2
+; AVX2-NEXT: vpaddb %xmm1, %xmm1, %xmm1
+; AVX2-NEXT: vpblendvb %xmm1, %xmm2, %xmm0, %xmm0
+; AVX2-NEXT: vpaddb %xmm0, %xmm0, %xmm2
+; AVX2-NEXT: vpaddb %xmm1, %xmm1, %xmm1
+; AVX2-NEXT: vpblendvb %xmm1, %xmm2, %xmm0, %xmm0
; AVX2-NEXT: retq
%splat = shufflevector <16 x i8> %b, <16 x i8> undef, <16 x i32> zeroinitializer
%shift = shl <16 x i8> %a, %splat
@@ -430,24 +430,24 @@ define <2 x i64> @constant_shift_v2i64(<2 x i64> %a) {
; SSE2-LABEL: constant_shift_v2i64:
; SSE2: # BB#0:
; SSE2-NEXT: movdqa %xmm0, %xmm1
-; SSE2-NEXT: psllq $7, %xmm1
-; SSE2-NEXT: psllq $1, %xmm0
-; SSE2-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1]
+; SSE2-NEXT: psllq $7, %xmm1
+; SSE2-NEXT: psllq $1, %xmm0
+; SSE2-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1]
; SSE2-NEXT: movapd %xmm1, %xmm0
; SSE2-NEXT: retq
;
; SSE41-LABEL: constant_shift_v2i64:
; SSE41: # BB#0:
-; SSE41-NEXT: movdqa %xmm0, %xmm1
-; SSE41-NEXT: psllq $7, %xmm1
-; SSE41-NEXT: psllq $1, %xmm0
+; SSE41-NEXT: movdqa %xmm0, %xmm1
+; SSE41-NEXT: psllq $7, %xmm1
+; SSE41-NEXT: psllq $1, %xmm0
; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4,5,6,7]
; SSE41-NEXT: retq
;
; AVX1-LABEL: constant_shift_v2i64:
; AVX1: # BB#0:
-; AVX1-NEXT: vpsllq $7, %xmm0, %xmm1
-; AVX1-NEXT: vpsllq $1, %xmm0, %xmm0
+; AVX1-NEXT: vpsllq $7, %xmm0, %xmm1
+; AVX1-NEXT: vpsllq $1, %xmm0, %xmm0
; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4,5,6,7]
; AVX1-NEXT: retq
;
@@ -462,13 +462,13 @@ define <2 x i64> @constant_shift_v2i64(<2 x i64> %a) {
define <4 x i32> @constant_shift_v4i32(<4 x i32> %a) {
; SSE2-LABEL: constant_shift_v4i32:
; SSE2: # BB#0:
-; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [16,32,64,128]
-; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3]
-; SSE2-NEXT: pmuludq %xmm1, %xmm0
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
-; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
-; SSE2-NEXT: pmuludq %xmm2, %xmm1
-; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
+; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [16,32,64,128]
+; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3]
+; SSE2-NEXT: pmuludq %xmm1, %xmm0
+; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
+; SSE2-NEXT: pmuludq %xmm2, %xmm1
+; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; SSE2-NEXT: retq
;
@@ -507,69 +507,69 @@ define <8 x i16> @constant_shift_v8i16(<8 x i16> %a) {
define <16 x i8> @constant_shift_v16i8(<16 x i8> %a) {
; SSE2-LABEL: constant_shift_v16i8:
; SSE2: # BB#0:
-; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [0,1,2,3,4,5,6,7,7,6,5,4,3,2,1,0]
-; SSE2-NEXT: psllw $5, %xmm2
-; SSE2-NEXT: pxor %xmm1, %xmm1
-; SSE2-NEXT: pxor %xmm3, %xmm3
+; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [0,1,2,3,4,5,6,7,7,6,5,4,3,2,1,0]
+; SSE2-NEXT: psllw $5, %xmm2
+; SSE2-NEXT: pxor %xmm1, %xmm1
+; SSE2-NEXT: pxor %xmm3, %xmm3
; SSE2-NEXT: pcmpgtb %xmm2, %xmm3
-; SSE2-NEXT: movdqa %xmm3, %xmm4
-; SSE2-NEXT: pandn %xmm0, %xmm4
-; SSE2-NEXT: psllw $4, %xmm0
-; SSE2-NEXT: pand {{.*}}(%rip), %xmm0
-; SSE2-NEXT: pand %xmm3, %xmm0
-; SSE2-NEXT: por %xmm4, %xmm0
-; SSE2-NEXT: paddb %xmm2, %xmm2
-; SSE2-NEXT: pxor %xmm3, %xmm3
+; SSE2-NEXT: movdqa %xmm3, %xmm4
+; SSE2-NEXT: pandn %xmm0, %xmm4
+; SSE2-NEXT: psllw $4, %xmm0
+; SSE2-NEXT: pand {{.*}}(%rip), %xmm0
+; SSE2-NEXT: pand %xmm3, %xmm0
+; SSE2-NEXT: por %xmm4, %xmm0
+; SSE2-NEXT: paddb %xmm2, %xmm2
+; SSE2-NEXT: pxor %xmm3, %xmm3
; SSE2-NEXT: pcmpgtb %xmm2, %xmm3
-; SSE2-NEXT: movdqa %xmm3, %xmm4
-; SSE2-NEXT: pandn %xmm0, %xmm4
-; SSE2-NEXT: psllw $2, %xmm0
-; SSE2-NEXT: pand {{.*}}(%rip), %xmm0
-; SSE2-NEXT: pand %xmm3, %xmm0
-; SSE2-NEXT: por %xmm4, %xmm0
-; SSE2-NEXT: paddb %xmm2, %xmm2
+; SSE2-NEXT: movdqa %xmm3, %xmm4
+; SSE2-NEXT: pandn %xmm0, %xmm4
+; SSE2-NEXT: psllw $2, %xmm0
+; SSE2-NEXT: pand {{.*}}(%rip), %xmm0
+; SSE2-NEXT: pand %xmm3, %xmm0
+; SSE2-NEXT: por %xmm4, %xmm0
+; SSE2-NEXT: paddb %xmm2, %xmm2
; SSE2-NEXT: pcmpgtb %xmm2, %xmm1
-; SSE2-NEXT: movdqa %xmm1, %xmm2
-; SSE2-NEXT: pandn %xmm0, %xmm2
-; SSE2-NEXT: paddb %xmm0, %xmm0
-; SSE2-NEXT: pand %xmm1, %xmm0
-; SSE2-NEXT: por %xmm2, %xmm0
+; SSE2-NEXT: movdqa %xmm1, %xmm2
+; SSE2-NEXT: pandn %xmm0, %xmm2
+; SSE2-NEXT: paddb %xmm0, %xmm0
+; SSE2-NEXT: pand %xmm1, %xmm0
+; SSE2-NEXT: por %xmm2, %xmm0
; SSE2-NEXT: retq
;
; SSE41-LABEL: constant_shift_v16i8:
; SSE41: # BB#0:
-; SSE41-NEXT: movdqa %xmm0, %xmm1
-; SSE41-NEXT: movdqa {{.*#+}} xmm0 = [0,1,2,3,4,5,6,7,7,6,5,4,3,2,1,0]
-; SSE41-NEXT: psllw $5, %xmm0
-; SSE41-NEXT: movdqa %xmm1, %xmm2
-; SSE41-NEXT: psllw $4, %xmm2
-; SSE41-NEXT: pand {{.*}}(%rip), %xmm2
+; SSE41-NEXT: movdqa %xmm0, %xmm1
+; SSE41-NEXT: movdqa {{.*#+}} xmm0 = [0,1,2,3,4,5,6,7,7,6,5,4,3,2,1,0]
+; SSE41-NEXT: psllw $5, %xmm0
+; SSE41-NEXT: movdqa %xmm1, %xmm2
+; SSE41-NEXT: psllw $4, %xmm2
+; SSE41-NEXT: pand {{.*}}(%rip), %xmm2
; SSE41-NEXT: pblendvb %xmm2, %xmm1
-; SSE41-NEXT: movdqa %xmm1, %xmm2
-; SSE41-NEXT: psllw $2, %xmm2
-; SSE41-NEXT: pand {{.*}}(%rip), %xmm2
-; SSE41-NEXT: paddb %xmm0, %xmm0
+; SSE41-NEXT: movdqa %xmm1, %xmm2
+; SSE41-NEXT: psllw $2, %xmm2
+; SSE41-NEXT: pand {{.*}}(%rip), %xmm2
+; SSE41-NEXT: paddb %xmm0, %xmm0
; SSE41-NEXT: pblendvb %xmm2, %xmm1
-; SSE41-NEXT: movdqa %xmm1, %xmm2
-; SSE41-NEXT: paddb %xmm2, %xmm2
-; SSE41-NEXT: paddb %xmm0, %xmm0
+; SSE41-NEXT: movdqa %xmm1, %xmm2
+; SSE41-NEXT: paddb %xmm2, %xmm2
+; SSE41-NEXT: paddb %xmm0, %xmm0
; SSE41-NEXT: pblendvb %xmm2, %xmm1
-; SSE41-NEXT: movdqa %xmm1, %xmm0
+; SSE41-NEXT: movdqa %xmm1, %xmm0
; SSE41-NEXT: retq
;
; AVX-LABEL: constant_shift_v16i8:
; AVX: # BB#0:
-; AVX-NEXT: vmovdqa {{.*#+}} xmm1 = [0,1,2,3,4,5,6,7,7,6,5,4,3,2,1,0]
-; AVX-NEXT: vpsllw $5, %xmm1, %xmm1
-; AVX-NEXT: vpsllw $4, %xmm0, %xmm2
-; AVX-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2
+; AVX-NEXT: vmovdqa {{.*#+}} xmm1 = [0,1,2,3,4,5,6,7,7,6,5,4,3,2,1,0]
+; AVX-NEXT: vpsllw $5, %xmm1, %xmm1
+; AVX-NEXT: vpsllw $4, %xmm0, %xmm2
+; AVX-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2
; AVX-NEXT: vpblendvb %xmm1, %xmm2, %xmm0, %xmm0
-; AVX-NEXT: vpsllw $2, %xmm0, %xmm2
-; AVX-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2
-; AVX-NEXT: vpaddb %xmm1, %xmm1, %xmm1
+; AVX-NEXT: vpsllw $2, %xmm0, %xmm2
+; AVX-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2
+; AVX-NEXT: vpaddb %xmm1, %xmm1, %xmm1
; AVX-NEXT: vpblendvb %xmm1, %xmm2, %xmm0, %xmm0
-; AVX-NEXT: vpaddb %xmm0, %xmm0, %xmm2
-; AVX-NEXT: vpaddb %xmm1, %xmm1, %xmm1
+; AVX-NEXT: vpaddb %xmm0, %xmm0, %xmm2
+; AVX-NEXT: vpaddb %xmm1, %xmm1, %xmm1
; AVX-NEXT: vpblendvb %xmm1, %xmm2, %xmm0, %xmm0
; AVX-NEXT: retq
%shift = shl <16 x i8> %a, <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0>
@@ -625,14 +625,14 @@ define <8 x i16> @splatconstant_shift_v8i16(<8 x i16> %a) {
define <16 x i8> @splatconstant_shift_v16i8(<16 x i8> %a) {
; SSE-LABEL: splatconstant_shift_v16i8:
; SSE: # BB#0:
-; SSE-NEXT: psllw $3, %xmm0
-; SSE-NEXT: pand {{.*}}(%rip), %xmm0
+; SSE-NEXT: psllw $3, %xmm0
+; SSE-NEXT: pand {{.*}}(%rip), %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: splatconstant_shift_v16i8:
; AVX: # BB#0:
-; AVX-NEXT: vpsllw $3, %xmm0
-; AVX-NEXT: vpand {{.*}}(%rip), %xmm0
+; AVX-NEXT: vpsllw $3, %xmm0, %xmm0
+; AVX-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0
; AVX-NEXT: retq
%shift = shl <16 x i8> %a, <i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3>
ret <16 x i8> %shift
diff --git a/test/CodeGen/X86/vector-shift-shl-256.ll b/test/CodeGen/X86/vector-shift-shl-256.ll
index 7c13c0ae4716..b287875f6541 100644
--- a/test/CodeGen/X86/vector-shift-shl-256.ll
+++ b/test/CodeGen/X86/vector-shift-shl-256.ll
@@ -193,7 +193,7 @@ define <8 x i32> @splatvar_shift_v8i32(<8 x i32> %a, <8 x i32> %b) {
; AVX2-LABEL: splatvar_shift_v8i32:
; AVX2: # BB#0:
; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; AVX2-NEXT: vpblendw $3, %xmm1, %xmm2, %xmm1 # xmm1 = xmm1[0,1],xmm2[2,3,4,5,6,7]
+; AVX2-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3,4,5,6,7]
; AVX2-NEXT: vpslld %xmm1, %ymm0, %ymm0
; AVX2-NEXT: retq
%splat = shufflevector <8 x i32> %b, <8 x i32> undef, <8 x i32> zeroinitializer
@@ -341,7 +341,7 @@ define <32 x i8> @constant_shift_v32i8(<32 x i8> %a) {
; AVX1-NEXT: vpsllw $4, %xmm1, %xmm2
; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240]
; AVX1-NEXT: vpand %xmm3, %xmm2, %xmm2
-; AVX1-NEXT: vmovdqa {{.*}}(%rip), %xmm4 # xmm4 = [0,1,2,3,4,5,6,7,7,6,5,4,3,2,1,0]
+; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [0,1,2,3,4,5,6,7,7,6,5,4,3,2,1,0]
; AVX1-NEXT: vpsllw $5, %xmm4, %xmm4
; AVX1-NEXT: vpblendvb %xmm4, %xmm2, %xmm1, %xmm1
; AVX1-NEXT: vpsllw $2, %xmm1, %xmm2
diff --git a/test/CodeGen/X86/vector-shuffle-sse4a.ll b/test/CodeGen/X86/vector-shuffle-sse4a.ll
new file mode 100644
index 000000000000..26062335cc16
--- /dev/null
+++ b/test/CodeGen/X86/vector-shuffle-sse4a.ll
@@ -0,0 +1,221 @@
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+ssse3,+sse4a | FileCheck %s --check-prefix=ALL --check-prefix=BTVER1
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx,+sse4a | FileCheck %s --check-prefix=ALL --check-prefix=BTVER2
+
+;
+; EXTRQI
+;
+
+define <16 x i8> @shuf_0zzzuuuuuuuuuuuu(<16 x i8> %a0) {
+; BTVER1-LABEL: shuf_0zzzuuuuuuuuuuuu:
+; BTVER1: # BB#0:
+; BTVER1-NEXT: extrq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[u,u,u,u,u,u,u,u]
+; BTVER1-NEXT: retq
+;
+; BTVER2-LABEL: shuf_0zzzuuuuuuuuuuuu:
+; BTVER2: # BB#0:
+; BTVER2-NEXT: vpmovzxbq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero
+; BTVER2-NEXT: retq
+ %s = shufflevector <16 x i8> %a0, <16 x i8> zeroinitializer, <16 x i32> <i32 0, i32 16, i32 16, i32 16, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+ ret <16 x i8> %s
+}
+
+define <16 x i8> @shuf_0zzzzzzz1zzzzzzz(<16 x i8> %a0) {
+; BTVER1-LABEL: shuf_0zzzzzzz1zzzzzzz:
+; BTVER1: # BB#0:
+; BTVER1-NEXT: movaps %xmm0, %xmm1
+; BTVER1-NEXT: extrq {{.*#+}} xmm1 = xmm1[1],zero,zero,zero,zero,zero,zero,zero,xmm1[u,u,u,u,u,u,u,u]
+; BTVER1-NEXT: extrq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[u,u,u,u,u,u,u,u]
+; BTVER1-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; BTVER1-NEXT: retq
+;
+; BTVER2-LABEL: shuf_0zzzzzzz1zzzzzzz:
+; BTVER2: # BB#0:
+; BTVER2-NEXT: vpmovzxbq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero
+; BTVER2-NEXT: retq
+ %s = shufflevector <16 x i8> %a0, <16 x i8> zeroinitializer, <16 x i32> <i32 0, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 1, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16>
+ ret <16 x i8> %s
+}
+
+define <16 x i8> @shuf_01zzuuuuuuuuuuuu(<16 x i8> %a0) {
+; BTVER1-LABEL: shuf_01zzuuuuuuuuuuuu:
+; BTVER1: # BB#0:
+; BTVER1-NEXT: extrq {{.*#+}} xmm0 = xmm0[0,1],zero,zero,zero,zero,zero,zero,xmm0[u,u,u,u,u,u,u,u]
+; BTVER1-NEXT: retq
+;
+; BTVER2-LABEL: shuf_01zzuuuuuuuuuuuu:
+; BTVER2: # BB#0:
+; BTVER2-NEXT: vpmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero
+; BTVER2-NEXT: retq
+ %s = shufflevector <16 x i8> %a0, <16 x i8> zeroinitializer, <16 x i32> <i32 0, i32 1, i32 16, i32 16, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+ ret <16 x i8> %s
+}
+
+define <16 x i8> @shuf_01zzzzzz23zzzzzz(<16 x i8> %a0) {
+; BTVER1-LABEL: shuf_01zzzzzz23zzzzzz:
+; BTVER1: # BB#0:
+; BTVER1-NEXT: movaps %xmm0, %xmm1
+; BTVER1-NEXT: extrq {{.*#+}} xmm1 = xmm1[2,3],zero,zero,zero,zero,zero,zero,xmm1[u,u,u,u,u,u,u,u]
+; BTVER1-NEXT: extrq {{.*#+}} xmm0 = xmm0[0,1],zero,zero,zero,zero,zero,zero,xmm0[u,u,u,u,u,u,u,u]
+; BTVER1-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; BTVER1-NEXT: retq
+;
+; BTVER2-LABEL: shuf_01zzzzzz23zzzzzz:
+; BTVER2: # BB#0:
+; BTVER2-NEXT: vpmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero
+; BTVER2-NEXT: retq
+ %s = shufflevector <16 x i8> %a0, <16 x i8> zeroinitializer, <16 x i32> <i32 0, i32 1, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 2, i32 3, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16>
+ ret <16 x i8> %s
+}
+
+define <16 x i8> @shuf_1zzzuuuuuuuuuuuu(<16 x i8> %a0) {
+; ALL-LABEL: shuf_1zzzuuuuuuuuuuuu:
+; ALL: # BB#0:
+; ALL-NEXT: extrq {{.*#+}} xmm0 = xmm0[1],zero,zero,zero,zero,zero,zero,zero,xmm0[u,u,u,u,u,u,u,u]
+; ALL-NEXT: retq
+ %s = shufflevector <16 x i8> %a0, <16 x i8> zeroinitializer, <16 x i32> <i32 1, i32 16, i32 16, i32 16, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+ ret <16 x i8> %s
+}
+
+define <8 x i16> @shuf_1zzzuuuu(<8 x i16> %a0) {
+; ALL-LABEL: shuf_1zzzuuuu:
+; ALL: # BB#0:
+; ALL-NEXT: extrq {{.*#+}} xmm0 = xmm0[2,3],zero,zero,zero,zero,zero,zero,xmm0[u,u,u,u,u,u,u,u]
+; ALL-NEXT: retq
+ %s = shufflevector <8 x i16> %a0, <8 x i16> zeroinitializer, <8 x i32> <i32 1, i32 8, i32 8, i32 8, i32 undef, i32 undef, i32 undef, i32 undef>
+ ret <8 x i16> %s
+}
+
+define <8 x i16> @shuf_12zzuuuu(<8 x i16> %a0) {
+; ALL-LABEL: shuf_12zzuuuu:
+; ALL: # BB#0:
+; ALL-NEXT: extrq {{.*#+}} xmm0 = xmm0[2,3,4,5],zero,zero,zero,zero,xmm0[u,u,u,u,u,u,u,u]
+; ALL-NEXT: retq
+ %s = shufflevector <8 x i16> %a0, <8 x i16> zeroinitializer, <8 x i32> <i32 1, i32 2, i32 8, i32 8, i32 undef, i32 undef, i32 undef, i32 undef>
+ ret <8 x i16> %s
+}
+
+define <8 x i16> @shuf_012zuuuu(<8 x i16> %a0) {
+; ALL-LABEL: shuf_012zuuuu:
+; ALL: # BB#0:
+; ALL-NEXT: extrq {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5],zero,zero,xmm0[u,u,u,u,u,u,u,u]
+; ALL-NEXT: retq
+ %s = shufflevector <8 x i16> %a0, <8 x i16> zeroinitializer, <8 x i32> <i32 0, i32 1, i32 2, i32 8, i32 undef, i32 undef, i32 undef, i32 undef>
+ ret <8 x i16> %s
+}
+
+define <8 x i16> @shuf_0zzz1zzz(<8 x i16> %a0) {
+; BTVER1-LABEL: shuf_0zzz1zzz:
+; BTVER1: # BB#0:
+; BTVER1-NEXT: movaps %xmm0, %xmm1
+; BTVER1-NEXT: extrq {{.*#+}} xmm1 = xmm1[2,3],zero,zero,zero,zero,zero,zero,xmm1[u,u,u,u,u,u,u,u]
+; BTVER1-NEXT: extrq {{.*#+}} xmm0 = xmm0[0,1],zero,zero,zero,zero,zero,zero,xmm0[u,u,u,u,u,u,u,u]
+; BTVER1-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; BTVER1-NEXT: retq
+;
+; BTVER2-LABEL: shuf_0zzz1zzz:
+; BTVER2: # BB#0:
+; BTVER2-NEXT: vpmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero
+; BTVER2-NEXT: retq
+ %s = shufflevector <8 x i16> %a0, <8 x i16> zeroinitializer, <8 x i32> <i32 0, i32 8, i32 8, i32 8, i32 1, i32 8, i32 8, i32 8>
+ ret <8 x i16> %s
+}
+
+define <4 x i32> @shuf_0z1z(<4 x i32> %a0) {
+; BTVER1-LABEL: shuf_0z1z:
+; BTVER1: # BB#0:
+; BTVER1-NEXT: pxor %xmm1, %xmm1
+; BTVER1-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; BTVER1-NEXT: retq
+;
+; BTVER2-LABEL: shuf_0z1z:
+; BTVER2: # BB#0:
+; BTVER2-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero
+; BTVER2-NEXT: retq
+ %s = shufflevector <4 x i32> %a0, <4 x i32> zeroinitializer, <4 x i32> <i32 0, i32 4, i32 1, i32 4>
+ ret <4 x i32> %s
+}
+
+;
+; INSERTQI
+;
+
+define <16 x i8> @shuf_0_0_2_3_uuuu_uuuu_uuuu(<16 x i8> %a0, <16 x i8> %a1) {
+; ALL-LABEL: shuf_0_0_2_3_uuuu_uuuu_uuuu:
+; ALL: # BB#0:
+; ALL-NEXT: insertq {{.*#+}} xmm0 = xmm0[0,0,2,3,4,5,6,7,u,u,u,u,u,u,u,u]
+; ALL-NEXT: retq
+ %s = shufflevector <16 x i8> %a0, <16 x i8> %a1, <16 x i32> <i32 0, i32 0, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+ ret <16 x i8> %s
+}
+
+define <16 x i8> @shuf_0_16_2_3_uuuu_uuuu_uuuu(<16 x i8> %a0, <16 x i8> %a1) {
+; ALL-LABEL: shuf_0_16_2_3_uuuu_uuuu_uuuu:
+; ALL: # BB#0:
+; ALL-NEXT: insertq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[2,3,4,5,6,7,u,u,u,u,u,u,u,u]
+; ALL-NEXT: retq
+ %s = shufflevector <16 x i8> %a0, <16 x i8> %a1, <16 x i32> <i32 0, i32 16, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+ ret <16 x i8> %s
+}
+
+define <16 x i8> @shuf_16_1_2_3_uuuu_uuuu_uuuu(<16 x i8> %a0, <16 x i8> %a1) {
+; ALL-LABEL: shuf_16_1_2_3_uuuu_uuuu_uuuu:
+; ALL: # BB#0:
+; ALL-NEXT: insertq {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3,4,5,6,7,u,u,u,u,u,u,u,u]
+; ALL-NEXT: retq
+ %s = shufflevector <16 x i8> %a0, <16 x i8> %a1, <16 x i32> <i32 16, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+ ret <16 x i8> %s
+}
+
+define <8 x i16> @shuf_0823uuuu(<8 x i16> %a0, <8 x i16> %a1) {
+; ALL-LABEL: shuf_0823uuuu:
+; ALL: # BB#0:
+; ALL-NEXT: insertq {{.*#+}} xmm0 = xmm0[0,1],xmm1[0,1],xmm0[4,5,6,7,u,u,u,u,u,u,u,u]
+; ALL-NEXT: retq
+ %s = shufflevector <8 x i16> %a0, <8 x i16> %a1, <8 x i32> <i32 0, i32 8, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef>
+ ret <8 x i16> %s
+}
+
+define <8 x i16> @shuf_0183uuuu(<8 x i16> %a0, <8 x i16> %a1) {
+; ALL-LABEL: shuf_0183uuuu:
+; ALL: # BB#0:
+; ALL-NEXT: insertq {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[0,1],xmm0[6,7,u,u,u,u,u,u,u,u]
+; ALL-NEXT: retq
+ %s = shufflevector <8 x i16> %a0, <8 x i16> %a1, <8 x i32> <i32 0, i32 1, i32 8, i32 3, i32 undef, i32 undef, i32 undef, i32 undef>
+ ret <8 x i16> %s
+}
+
+define <8 x i16> @shuf_0128uuuu(<8 x i16> %a0, <8 x i16> %a1) {
+; ALL-LABEL: shuf_0128uuuu:
+; ALL: # BB#0:
+; ALL-NEXT: insertq {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5],xmm1[0,1],xmm0[u,u,u,u,u,u,u,u]
+; ALL-NEXT: retq
+ %s = shufflevector <8 x i16> %a0, <8 x i16> %a1, <8 x i32> <i32 0, i32 1, i32 2, i32 8, i32 undef, i32 undef, i32 undef, i32 undef>
+ ret <8 x i16> %s
+}
+
+define <8 x i16> @shuf_0893uuuu(<8 x i16> %a0, <8 x i16> %a1) {
+; ALL-LABEL: shuf_0893uuuu:
+; ALL: # BB#0:
+; ALL-NEXT: insertq {{.*#+}} xmm0 = xmm0[0,1],xmm1[0,1,2,3],xmm0[6,7,u,u,u,u,u,u,u,u]
+; ALL-NEXT: retq
+ %s = shufflevector <8 x i16> %a0, <8 x i16> %a1, <8 x i32> <i32 0, i32 8, i32 9, i32 3, i32 undef, i32 undef, i32 undef, i32 undef>
+ ret <8 x i16> %s
+}
+
+define <8 x i16> @shuf_089Auuuu(<8 x i16> %a0, <8 x i16> %a1) {
+; ALL-LABEL: shuf_089Auuuu:
+; ALL: # BB#0:
+; ALL-NEXT: insertq {{.*#+}} xmm0 = xmm0[0,1],xmm1[0,1,2,3,4,5],xmm0[u,u,u,u,u,u,u,u]
+; ALL-NEXT: retq
+ %s = shufflevector <8 x i16> %a0, <8 x i16> %a1, <8 x i32> <i32 0, i32 8, i32 9, i32 10, i32 undef, i32 undef, i32 undef, i32 undef>
+ ret <8 x i16> %s
+}
+
+define <8 x i16> @shuf_089uuuuu(<8 x i16> %a0, <8 x i16> %a1) {
+; ALL-LABEL: shuf_089uuuuu:
+; ALL: # BB#0:
+; ALL-NEXT: insertq {{.*#+}} xmm0 = xmm0[0,1],xmm1[0,1,2,3],xmm0[6,7,u,u,u,u,u,u,u,u]
+; ALL-NEXT: retq
+ %s = shufflevector <8 x i16> %a0, <8 x i16> %a1, <8 x i32> <i32 0, i32 8, i32 9, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+ ret <8 x i16> %s
+}
diff --git a/test/CodeGen/X86/vector-trunc.ll b/test/CodeGen/X86/vector-trunc.ll
index d2eef9af2a25..2480e676cad0 100644
--- a/test/CodeGen/X86/vector-trunc.ll
+++ b/test/CodeGen/X86/vector-trunc.ll
@@ -223,15 +223,15 @@ entry:
}
define <16 x i8> @trunc16i64_const() {
-; SSE-LABEL: trunc16i64_const
-; SSE: # BB#0: # %entry
-; SSE-NEXT: xorps %xmm0, %xmm0
-; SSE-NEXT: retq
-;
-; AVX-LABEL: trunc16i64_const
-; AVX: # BB#0: # %entry
-; AVX-NEXT: vxorps %xmm0, %xmm0, %xmm0
-; AVX-NEXT: retq
+; SSE-LABEL: trunc16i64_const:
+; SSE: # BB#0: # %entry
+; SSE-NEXT: xorps %xmm0, %xmm0
+; SSE-NEXT: retq
+;
+; AVX-LABEL: trunc16i64_const:
+; AVX: # BB#0: # %entry
+; AVX-NEXT: vxorps %xmm0, %xmm0, %xmm0
+; AVX-NEXT: retq
entry:
%0 = trunc <16 x i64> zeroinitializer to <16 x i8>
diff --git a/test/CodeGen/X86/vector-zext.ll b/test/CodeGen/X86/vector-zext.ll
index c64e17442675..b119f5eb89f6 100644
--- a/test/CodeGen/X86/vector-zext.ll
+++ b/test/CodeGen/X86/vector-zext.ll
@@ -11,7 +11,7 @@ define <8 x i32> @zext_8i16_to_8i32(<8 x i16> %A) nounwind uwtable readnone ssp
; SSE2-NEXT: pxor %xmm2, %xmm2
; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
; SSE2-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4,4,5,5,6,6,7,7]
-; SSE2-NEXT: pand .LCPI0_0(%rip), %xmm1
+; SSE2-NEXT: pand {{.*}}(%rip), %xmm1
; SSE2-NEXT: retq
;
; SSSE3-LABEL: zext_8i16_to_8i32:
@@ -20,7 +20,7 @@ define <8 x i32> @zext_8i16_to_8i32(<8 x i16> %A) nounwind uwtable readnone ssp
; SSSE3-NEXT: pxor %xmm2, %xmm2
; SSSE3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
; SSSE3-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4,4,5,5,6,6,7,7]
-; SSSE3-NEXT: pand .LCPI0_0(%rip), %xmm1
+; SSSE3-NEXT: pand {{.*}}(%rip), %xmm1
; SSSE3-NEXT: retq
;
; SSE41-LABEL: zext_8i16_to_8i32:
@@ -28,7 +28,7 @@ define <8 x i32> @zext_8i16_to_8i32(<8 x i16> %A) nounwind uwtable readnone ssp
; SSE41-NEXT: movdqa %xmm0, %xmm1
; SSE41-NEXT: pmovzxwd {{.*#+}} xmm0 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero
; SSE41-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4,4,5,5,6,6,7,7]
-; SSE41-NEXT: pand .LCPI0_0(%rip), %xmm1
+; SSE41-NEXT: pand {{.*}}(%rip), %xmm1
; SSE41-NEXT: retq
;
; AVX1-LABEL: zext_8i16_to_8i32:
@@ -156,7 +156,7 @@ define <16 x i16> @zext_16i8_to_16i16(<16 x i8> %z) {
; SSE2-NEXT: pxor %xmm2, %xmm2
; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7]
; SSE2-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
-; SSE2-NEXT: pand .LCPI3_0(%rip), %xmm1
+; SSE2-NEXT: pand {{.*}}(%rip), %xmm1
; SSE2-NEXT: retq
;
; SSSE3-LABEL: zext_16i8_to_16i16:
@@ -165,15 +165,15 @@ define <16 x i16> @zext_16i8_to_16i16(<16 x i8> %z) {
; SSSE3-NEXT: pxor %xmm2, %xmm2
; SSSE3-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7]
; SSSE3-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
-; SSSE3-NEXT: pand .LCPI3_0(%rip), %xmm1
+; SSSE3-NEXT: pand {{.*}}(%rip), %xmm1
; SSSE3-NEXT: retq
;
; SSE41-LABEL: zext_16i8_to_16i16:
; SSE41: # BB#0: # %entry
; SSE41-NEXT: movdqa %xmm0, %xmm1
-; SSE41-NEXT: pmovzxbw %xmm1, %xmm0 {{.*#+}} xmm0 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
+; SSE41-NEXT: pmovzxbw {{.*#+}} xmm0 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
; SSE41-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
-; SSE41-NEXT: pand .LCPI3_0(%rip), %xmm1
+; SSE41-NEXT: pand {{.*}}(%rip), %xmm1
; SSE41-NEXT: retq
;
; AVX1-LABEL: zext_16i8_to_16i16:
@@ -195,24 +195,24 @@ entry:
define <16 x i16> @load_zext_16i8_to_16i16(<16 x i8> *%ptr) {
; SSE2-LABEL: load_zext_16i8_to_16i16:
-; SSE2: # BB#0: # %entry
-; SSE2-NEXT: movdqa (%rdi), %xmm1
-; SSE2-NEXT: pxor %xmm2, %xmm2
-; SSE2-NEXT: movdqa %xmm1, %xmm0
-; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7]
-; SSE2-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
-; SSE2-NEXT: pand .LCPI4_0(%rip), %xmm1
-; SSE2-NEXT: retq
+; SSE2: # BB#0: # %entry
+; SSE2-NEXT: movdqa (%rdi), %xmm1
+; SSE2-NEXT: pxor %xmm2, %xmm2
+; SSE2-NEXT: movdqa %xmm1, %xmm0
+; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7]
+; SSE2-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
+; SSE2-NEXT: pand {{.*}}(%rip), %xmm1
+; SSE2-NEXT: retq
;
; SSSE3-LABEL: load_zext_16i8_to_16i16:
-; SSSE3: # BB#0: # %entry
-; SSSE3-NEXT: movdqa (%rdi), %xmm1
-; SSSE3-NEXT: pxor %xmm2, %xmm2
-; SSSE3-NEXT: movdqa %xmm1, %xmm0
-; SSSE3-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7]
-; SSSE3-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
-; SSSE3-NEXT: pand .LCPI4_0(%rip), %xmm1
-; SSSE3-NEXT: retq
+; SSSE3: # BB#0: # %entry
+; SSSE3-NEXT: movdqa (%rdi), %xmm1
+; SSSE3-NEXT: pxor %xmm2, %xmm2
+; SSSE3-NEXT: movdqa %xmm1, %xmm0
+; SSSE3-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7]
+; SSSE3-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
+; SSSE3-NEXT: pand {{.*}}(%rip), %xmm1
+; SSSE3-NEXT: retq
;
; SSE41-LABEL: load_zext_16i8_to_16i16:
; SSE41: # BB#0: # %entry
@@ -239,24 +239,24 @@ entry:
define <8 x i32> @load_zext_8i16_to_8i32(<8 x i16> *%ptr) {
; SSE2-LABEL: load_zext_8i16_to_8i32:
-; SSE2: # BB#0: # %entry
-; SSE2-NEXT: movdqa (%rdi), %xmm1
-; SSE2-NEXT: pxor %xmm2, %xmm2
-; SSE2-NEXT: movdqa %xmm1, %xmm0
-; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
-; SSE2-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4,4,5,5,6,6,7,7]
-; SSE2-NEXT: pand .LCPI5_0(%rip), %xmm1
-; SSE2-NEXT: retq
+; SSE2: # BB#0: # %entry
+; SSE2-NEXT: movdqa (%rdi), %xmm1
+; SSE2-NEXT: pxor %xmm2, %xmm2
+; SSE2-NEXT: movdqa %xmm1, %xmm0
+; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
+; SSE2-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4,4,5,5,6,6,7,7]
+; SSE2-NEXT: pand {{.*}}(%rip), %xmm1
+; SSE2-NEXT: retq
;
; SSSE3-LABEL: load_zext_8i16_to_8i32:
-; SSSE3: # BB#0: # %entry
-; SSSE3-NEXT: movdqa (%rdi), %xmm1
-; SSSE3-NEXT: pxor %xmm2, %xmm2
-; SSSE3-NEXT: movdqa %xmm1, %xmm0
-; SSSE3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
-; SSSE3-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4,4,5,5,6,6,7,7]
-; SSSE3-NEXT: pand .LCPI5_0(%rip), %xmm1
-; SSSE3-NEXT: retq
+; SSSE3: # BB#0: # %entry
+; SSSE3-NEXT: movdqa (%rdi), %xmm1
+; SSSE3-NEXT: pxor %xmm2, %xmm2
+; SSSE3-NEXT: movdqa %xmm1, %xmm0
+; SSSE3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
+; SSSE3-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4,4,5,5,6,6,7,7]
+; SSSE3-NEXT: pand {{.*}}(%rip), %xmm1
+; SSSE3-NEXT: retq
;
; SSE41-LABEL: load_zext_8i16_to_8i32:
; SSE41: # BB#0: # %entry
@@ -415,7 +415,7 @@ entry:
define <8 x i32> @shuf_zext_8i8_to_8i32(<8 x i8> %A) {
; SSE2-LABEL: shuf_zext_8i8_to_8i32:
; SSE2: # BB#0: # %entry
-; SSE2-NEXT: pand .LCPI9_0(%rip), %xmm0
+; SSE2-NEXT: pand {{.*}}(%rip), %xmm0
; SSE2-NEXT: packuswb %xmm0, %xmm0
; SSE2-NEXT: pxor %xmm1, %xmm1
; SSE2-NEXT: movdqa %xmm0, %xmm2
diff --git a/test/CodeGen/X86/vector-zmov.ll b/test/CodeGen/X86/vector-zmov.ll
index cf592b1e9f42..298683559054 100644
--- a/test/CodeGen/X86/vector-zmov.ll
+++ b/test/CodeGen/X86/vector-zmov.ll
@@ -5,15 +5,16 @@
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=x86-64 -mattr=+avx2 | FileCheck %s --check-prefix=AVX --check-prefix=AVX2
define <4 x i32> @load_zmov_4i32_to_0zzz(<4 x i32> *%ptr) {
-; SSE-LABEL: load_zmov_4i32_to_0zzz:
-; SSE: # BB#0: # %entry
-; SSE-NEXT: movd (%rdi), %xmm0
-; SSE-NEXT: retq
+; SSE-LABEL: load_zmov_4i32_to_0zzz:
+; SSE: # BB#0: # %entry
+; SSE-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; SSE-NEXT: retq
+;
+; AVX-LABEL: load_zmov_4i32_to_0zzz:
+; AVX: # BB#0: # %entry
+; AVX-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; AVX-NEXT: retq
-; AVX-LABEL: load_zmov_4i32_to_0zzz:
-; AVX: # BB#0: # %entry
-; AVX-NEXT: vmovd (%rdi), %xmm0
-; AVX-NEXT: retq
entry:
%X = load <4 x i32>, <4 x i32>* %ptr
%Y = shufflevector <4 x i32> %X, <4 x i32> zeroinitializer, <4 x i32> <i32 0, i32 4, i32 4, i32 4>
@@ -21,15 +22,16 @@ entry:
}
define <2 x i64> @load_zmov_2i64_to_0z(<2 x i64> *%ptr) {
-; SSE-LABEL: load_zmov_2i64_to_0z:
-; SSE: # BB#0: # %entry
-; SSE-NEXT: movq (%rdi), %xmm0
-; SSE-NEXT: retq
+; SSE-LABEL: load_zmov_2i64_to_0z:
+; SSE: # BB#0: # %entry
+; SSE-NEXT: movq {{.*#+}} xmm0 = mem[0],zero
+; SSE-NEXT: retq
+;
+; AVX-LABEL: load_zmov_2i64_to_0z:
+; AVX: # BB#0: # %entry
+; AVX-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero
+; AVX-NEXT: retq
-; AVX-LABEL: load_zmov_2i64_to_0z:
-; AVX: # BB#0: # %entry
-; AVX-NEXT: vmovq (%rdi), %xmm0
-; AVX-NEXT: retq
entry:
%X = load <2 x i64>, <2 x i64>* %ptr
%Y = shufflevector <2 x i64> %X, <2 x i64> zeroinitializer, <2 x i32> <i32 0, i32 2>
diff --git a/test/CodeGen/X86/visibility.ll b/test/CodeGen/X86/visibility.ll
index 580c3dc9266d..be7fd96f2dd9 100644
--- a/test/CodeGen/X86/visibility.ll
+++ b/test/CodeGen/X86/visibility.ll
@@ -2,13 +2,19 @@
@zed = external hidden constant i32
+define available_externally hidden void @baz() {
+ ret void
+}
+
define hidden void @foo() nounwind {
entry:
call void @bar(i32* @zed)
+ call void @baz()
ret void
}
declare hidden void @bar(i32*)
;CHECK: .hidden zed
+;CHECK: .hidden baz
;CHECK: .hidden bar
diff --git a/test/CodeGen/X86/vshift-3.ll b/test/CodeGen/X86/vshift-3.ll
index 0bdb32fcb86e..f368029e4b49 100644
--- a/test/CodeGen/X86/vshift-3.ll
+++ b/test/CodeGen/X86/vshift-3.ll
@@ -3,13 +3,12 @@
; test vector shifts converted to proper SSE2 vector shifts when the shift
; amounts are the same.
-; Note that x86 does have ashr
+; Note that x86 does have ashr
-; shift1a can't use a packed shift
define void @shift1a(<2 x i64> %val, <2 x i64>* %dst) nounwind {
entry:
; CHECK-LABEL: shift1a:
-; CHECK: sarl
+; CHECK: psrad $31
%ashr = ashr <2 x i64> %val, < i64 32, i64 32 >
store <2 x i64> %ashr, <2 x i64>* %dst
ret void
diff --git a/test/CodeGen/X86/webkit-jscc.ll b/test/CodeGen/X86/webkit-jscc.ll
new file mode 100644
index 000000000000..a58c53e024ec
--- /dev/null
+++ b/test/CodeGen/X86/webkit-jscc.ll
@@ -0,0 +1,18 @@
+; RUN: llc -mtriple=x86_64-apple-darwin -mcpu=corei7 < %s | FileCheck %s
+; RUN: llc -mtriple=x86_64-linux-gnu -mcpu=corei7 < %s | FileCheck %s
+; RUN: llc -mtriple=x86_64-windows-gnu -mcpu=corei7 < %s | FileCheck %s
+; RUN: llc -mtriple=x86_64-windows-msvc -mcpu=corei7 < %s | FileCheck %s
+
+define webkit_jscc i32 @simple_jscall(i32 %a, i32 %b, i32 %c) {
+ %ab = add i32 %a, %b
+ %abc = add i32 %ab, %c
+ ret i32 %abc
+}
+
+; 32-bit integers are only aligned to 4 bytes, even on x64. They are *not*
+; promoted to i64.
+
+; CHECK: simple_jscall:
+; CHECK: addl 8(%rsp), %eax
+; CHECK-NEXT: addl 12(%rsp), %eax
+; CHECK-NEXT: retq
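+
+; The checks above rely on webkit_jscc leaving the second and third i32
+; arguments in 4-byte stack slots (8(%rsp) and 12(%rsp)) rather than widening
+; them to i64. As a sketch only, not part of this patch, a caller exercising
+; the same convention from IR could look like the following (the function name
+; call_simple_jscall is hypothetical):
+;
+;   define i32 @call_simple_jscall() {
+;     %r = call webkit_jscc i32 @simple_jscall(i32 1, i32 2, i32 3)
+;     ret i32 %r
+;   }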
diff --git a/test/CodeGen/X86/widen_conv-2.ll b/test/CodeGen/X86/widen_conv-2.ll
index 906f7cdafb95..c8646c6489a1 100644
--- a/test/CodeGen/X86/widen_conv-2.ll
+++ b/test/CodeGen/X86/widen_conv-2.ll
@@ -1,8 +1,9 @@
; RUN: llc < %s -march=x86 -mattr=+sse4.2 | FileCheck %s
-; CHECK: {{cwtl|movswl}}
-; CHECK: {{cwtl|movswl}}
+; CHECK: psllq $48, %xmm0
+; CHECK: psrad $16, %xmm0
+; CHECK: pshufd {{.*#+}} xmm0 = xmm0[1,3,2,3]
-; sign extension v2i32 to v2i16
+; sign extension v2i16 to v2i32
define void @convert(<2 x i32>* %dst.addr, <2 x i16> %src) nounwind {
entry:
diff --git a/test/CodeGen/X86/widen_load-2.ll b/test/CodeGen/X86/widen_load-2.ll
index f5ddc0eacc61..6f1bd7541231 100644
--- a/test/CodeGen/X86/widen_load-2.ll
+++ b/test/CodeGen/X86/widen_load-2.ll
@@ -194,17 +194,9 @@ define void @rot(%i8vec3pack* nocapture sret %result, %i8vec3pack* %X, %i8vec3pa
; CHECK-NEXT: movl (%[[PTR0]]), [[TMP1:%e[abcd]+x]]
; CHECK-NEXT: movl [[TMP1]], [[TMP2:.*]]
; CHECK-NEXT: pmovzxbd [[TMP2]], %[[X0:xmm[0-9]+]]
-; CHECK-NEXT: pextrd $1, %[[X0]], %e[[R0:[abcd]]]x
-; CHECK-NEXT: shrl %e[[R0]]x
-; CHECK-NEXT: movd %[[X0]], %e[[R1:[abcd]]]x
-; CHECK-NEXT: shrl %e[[R1]]x
-; CHECK-NEXT: movd %e[[R1]]x, %[[X1:xmm[0-9]+]]
-; CHECK-NEXT: pinsrd $1, %e[[R0]]x, %[[X1]]
-; CHECK-NEXT: pextrd $2, %[[X0]], %e[[R0:[abcd]]]x
-; CHECK-NEXT: shrl %e[[R0]]x
-; CHECK-NEXT: pinsrd $2, %e[[R0]]x, %[[X1]]
-; CHECK-NEXT: pextrd $3, %[[X0]], %e[[R0:[abcd]]]x
-; CHECK-NEXT: pinsrd $3, %e[[R0]]x, %[[X1]]
+; CHECK-NEXT: movdqa %[[X0]], %[[X1:xmm[0-9]+]]
+; CHECK-NEXT: psrld $1, %[[X1]]
+; CHECK-NEXT: pblendw $192, %[[X0]], %[[X1]]
; CHECK-NEXT: pextrb $8, %[[X1]], 2(%{{.*}})
; CHECK-NEXT: pshufb %[[SHUFFLE_MASK]], %[[X1]]
; CHECK-NEXT: pmovzxwq %[[X1]], %[[X3:xmm[0-9]+]]
diff --git a/test/CodeGen/X86/win32-eh.ll b/test/CodeGen/X86/win32-eh.ll
index f235d2884d03..3ee4723ce5f3 100644
--- a/test/CodeGen/X86/win32-eh.ll
+++ b/test/CodeGen/X86/win32-eh.ll
@@ -32,16 +32,19 @@ eh.resume:
; CHECK-LABEL: _use_except_handler3:
; CHECK: pushl %ebp
; CHECK: movl %esp, %ebp
+; CHECK: pushl %ebx
+; CHECK: pushl %edi
+; CHECK: pushl %esi
; CHECK: subl ${{[0-9]+}}, %esp
-; CHECK: movl $-1, -4(%ebp)
-; CHECK: movl $L__ehtable$use_except_handler3, -8(%ebp)
-; CHECK: leal -16(%ebp), %[[node:[^ ,]*]]
-; CHECK: movl $__except_handler3, -12(%ebp)
+; CHECK: movl $-1, -16(%ebp)
+; CHECK: movl $L__ehtable$use_except_handler3, -20(%ebp)
+; CHECK: leal -28(%ebp), %[[node:[^ ,]*]]
+; CHECK: movl $__except_handler3, -24(%ebp)
; CHECK: movl %fs:0, %[[next:[^ ,]*]]
-; CHECK: movl %[[next]], -16(%ebp)
+; CHECK: movl %[[next]], -28(%ebp)
; CHECK: movl %[[node]], %fs:0
; CHECK: calll _may_throw_or_crash
-; CHECK: movl -16(%ebp), %[[next:[^ ,]*]]
+; CHECK: movl -28(%ebp), %[[next:[^ ,]*]]
; CHECK: movl %[[next]], %fs:0
; CHECK: retl
@@ -72,18 +75,18 @@ eh.resume:
; CHECK: pushl %ebp
; CHECK: movl %esp, %ebp
; CHECK: subl ${{[0-9]+}}, %esp
-; CHECK: movl %esp, -24(%ebp)
-; CHECK: movl $-2, -4(%ebp)
+; CHECK: movl %esp, -36(%ebp)
+; CHECK: movl $-2, -16(%ebp)
; CHECK: movl $L__ehtable$use_except_handler4, %[[lsda:[^ ,]*]]
; CHECK: xorl ___security_cookie, %[[lsda]]
-; CHECK: movl %[[lsda]], -8(%ebp)
-; CHECK: leal -16(%ebp), %[[node:[^ ,]*]]
-; CHECK: movl $__except_handler4, -12(%ebp)
+; CHECK: movl %[[lsda]], -20(%ebp)
+; CHECK: leal -28(%ebp), %[[node:[^ ,]*]]
+; CHECK: movl $__except_handler4, -24(%ebp)
; CHECK: movl %fs:0, %[[next:[^ ,]*]]
-; CHECK: movl %[[next]], -16(%ebp)
+; CHECK: movl %[[next]], -28(%ebp)
; CHECK: movl %[[node]], %fs:0
; CHECK: calll _may_throw_or_crash
-; CHECK: movl -16(%ebp), %[[next:[^ ,]*]]
+; CHECK: movl -28(%ebp), %[[next:[^ ,]*]]
; CHECK: movl %[[next]], %fs:0
; CHECK: retl
@@ -115,20 +118,21 @@ catchall:
; CHECK: pushl %ebp
; CHECK: movl %esp, %ebp
; CHECK: subl ${{[0-9]+}}, %esp
-; CHECK: movl %esp, -16(%ebp)
-; CHECK: movl $-1, -4(%ebp)
-; CHECK: leal -12(%ebp), %[[node:[^ ,]*]]
-; CHECK: movl $___ehhandler$use_CxxFrameHandler3, -8(%ebp)
+; CHECK: movl %esp, -28(%ebp)
+; CHECK: movl $-1, -16(%ebp)
+; CHECK: leal -24(%ebp), %[[node:[^ ,]*]]
+; CHECK: movl $___ehhandler$use_CxxFrameHandler3, -20(%ebp)
; CHECK: movl %fs:0, %[[next:[^ ,]*]]
-; CHECK: movl %[[next]], -12(%ebp)
+; CHECK: movl %[[next]], -24(%ebp)
; CHECK: movl %[[node]], %fs:0
-; CHECK: movl $0, -4(%ebp)
+; CHECK: movl $0, -16(%ebp)
; CHECK: calll _may_throw_or_crash
-; CHECK: movl -12(%ebp), %[[next:[^ ,]*]]
+; CHECK: movl -24(%ebp), %[[next:[^ ,]*]]
; CHECK: movl %[[next]], %fs:0
; CHECK: retl
; CHECK: .section .xdata,"dr"
+; CHECK: .align 4
; CHECK-LABEL: L__ehtable$use_CxxFrameHandler3:
; CHECK-NEXT: .long 429065506
; CHECK-NEXT: .long 2
diff --git a/test/CodeGen/X86/win64_frame.ll b/test/CodeGen/X86/win64_frame.ll
index 2c62f4918a7f..477b3144d9e7 100644
--- a/test/CodeGen/X86/win64_frame.ll
+++ b/test/CodeGen/X86/win64_frame.ll
@@ -100,8 +100,9 @@ define i32 @f8(i32 %a, i32 %b, i32 %c, i32 %d, i32 %e) "no-frame-pointer-elim"="
alloca i32, i32 %a
; CHECK: movl %ecx, %eax
- ; CHECK: leaq 15(,%rax,4), %rax
- ; CHECK: andq $-16, %rax
+ ; CHECK: leaq 15(,%rax,4), %rcx
+ ; CHECK: movabsq $34359738352, %rax
+ ; CHECK: andq %rcx, %rax
; CHECK: callq __chkstk
; CHECK: subq %rax, %rsp
diff --git a/test/CodeGen/X86/x86-shrink-wrapping.ll b/test/CodeGen/X86/x86-shrink-wrapping.ll
index 5848eddf4375..8c91335d91a2 100644
--- a/test/CodeGen/X86/x86-shrink-wrapping.ll
+++ b/test/CodeGen/X86/x86-shrink-wrapping.ll
@@ -598,3 +598,42 @@ if.then.60: ; preds = %if.end.55
cleanup: ; preds = %if.then.60, %if.end.55, %lor.lhs.false, %lor.lhs.false, %lor.lhs.false, %lor.lhs.false, %lor.lhs.false, %lor.lhs.false, %lor.lhs.false, %lor.lhs.false, %if.end, %entry
ret void
}
+
+; Make sure we do not insert unreachable code after a noreturn function.
+; Although it is not incorrect to insert such code, it is useless
+; and it hurts the binary size.
+;
+; CHECK-LABEL: noreturn:
+; DISABLE: pushq
+;
+; CHECK: testb %dil, %dil
+; CHECK-NEXT: jne [[ABORT:LBB[0-9_]+]]
+;
+; CHECK: movl $42, %eax
+;
+; DISABLE-NEXT: popq
+;
+; CHECK-NEXT: retq
+;
+; CHECK: [[ABORT]]: ## %if.abort
+;
+; ENABLE: pushq
+;
+; CHECK: callq _abort
+; ENABLE-NOT: popq
+define i32 @noreturn(i8 signext %bad_thing) {
+entry:
+ %tobool = icmp eq i8 %bad_thing, 0
+ br i1 %tobool, label %if.end, label %if.abort
+
+if.abort:
+ tail call void @abort() #0
+ unreachable
+
+if.end:
+ ret i32 42
+}
+
+declare void @abort() #0
+
+attributes #0 = { noreturn nounwind }
diff --git a/test/DebugInfo/COFF/asm.ll b/test/DebugInfo/COFF/asm.ll
index 55a50002b8b9..bc2a11d066b5 100644
--- a/test/DebugInfo/COFF/asm.ll
+++ b/test/DebugInfo/COFF/asm.ll
@@ -51,7 +51,7 @@
; X86-NEXT: [[F2_START]]:
; X86-NEXT: .secrel32 _f
; X86-NEXT: .secidx _f
-; X86-NEXT: .short 0
+; X86-NEXT: .short 1
; X86-NEXT: .long [[END_OF_F]]-_f
; X86-NEXT: [[FILE_SEGMENT_START:[^:]*]]:
; X86-NEXT: .long 0
@@ -63,6 +63,12 @@
; X86-NEXT: .long 5
; X86-NEXT: .long [[RETURN_STMT]]-_f
; X86-NEXT: .long 6
+; X86-NEXT: .short 0
+; X86-NEXT: .short 0
+; X86-NEXT: .short 0
+; X86-NEXT: .short 0
+; X86-NEXT: .short 0
+; X86-NEXT: .short 0
; X86-NEXT: [[FILE_SEGMENT_END]]:
; X86-NEXT: [[F2_END]]:
; File index to string table offset subsection
@@ -101,6 +107,7 @@
; OBJ32-NEXT: ]
; OBJ32: FunctionLineTable [
; OBJ32-NEXT: Name: _f
+; OBJ32-NEXT: Flags: 0x1
; OBJ32-NEXT: CodeSize: 0x6
; OBJ32-NEXT: FilenameSegment [
; OBJ32-NEXT: Filename: D:\asm.c
@@ -110,6 +117,12 @@
; OBJ32-NEXT: +0x0: 4
; OBJ32-NEXT: +0x0: 5
; OBJ32-NEXT: +0x5: 6
+; OBJ32-NEXT: ColStart: 0
+; OBJ32-NEXT: ColEnd: 0
+; OBJ32-NEXT: ColStart: 0
+; OBJ32-NEXT: ColEnd: 0
+; OBJ32-NEXT: ColStart: 0
+; OBJ32-NEXT: ColEnd: 0
; OBJ32-NEXT: ]
; OBJ32-NEXT: ]
; OBJ32: }
@@ -157,7 +170,7 @@
; X64-NEXT: [[F2_START]]:
; X64-NEXT: .secrel32 f
; X64-NEXT: .secidx f
-; X64-NEXT: .short 0
+; X64-NEXT: .short 1
; X64-NEXT: .long [[END_OF_F]]-f
; X64-NEXT: [[FILE_SEGMENT_START:[^:]*]]:
; X64-NEXT: .long 0
@@ -171,6 +184,14 @@
; X64-NEXT: .long 5
; X64-NEXT: .long [[EPILOG_AND_RET]]-f
; X64-NEXT: .long 6
+; X64-NEXT: .short 0
+; X64-NEXT: .short 0
+; X64-NEXT: .short 0
+; X64-NEXT: .short 0
+; X64-NEXT: .short 0
+; X64-NEXT: .short 0
+; X64-NEXT: .short 0
+; X64-NEXT: .short 0
; X64-NEXT: [[FILE_SEGMENT_END]]:
; X64-NEXT: [[F2_END]]:
; File index to string table offset subsection
@@ -209,6 +230,7 @@
; OBJ64-NEXT: ]
; OBJ64: FunctionLineTable [
; OBJ64-NEXT: Name: f
+; OBJ64-NEXT: Flags: 0x1
; OBJ64-NEXT: CodeSize: 0xE
; OBJ64-NEXT: FilenameSegment [
; OBJ64-NEXT: Filename: D:\asm.c
@@ -218,6 +240,14 @@
; OBJ64-NEXT: +0x4: 4
; OBJ64-NEXT: +0x4: 5
; OBJ64-NEXT: +0x9: 6
+; OBJ64-NEXT: ColStart: 0
+; OBJ64-NEXT: ColEnd: 0
+; OBJ64-NEXT: ColStart: 0
+; OBJ64-NEXT: ColEnd: 0
+; OBJ64-NEXT: ColStart: 0
+; OBJ64-NEXT: ColEnd: 0
+; OBJ64-NEXT: ColStart: 0
+; OBJ64-NEXT: ColEnd: 0
; OBJ64-NEXT: ]
; OBJ64-NEXT: ]
; OBJ64: }
diff --git a/test/DebugInfo/COFF/multifile.ll b/test/DebugInfo/COFF/multifile.ll
index 0c9782aceec2..3dedacebc140 100644
--- a/test/DebugInfo/COFF/multifile.ll
+++ b/test/DebugInfo/COFF/multifile.ll
@@ -58,7 +58,7 @@
; X86-NEXT: [[F2_START]]:
; X86-NEXT: .secrel32 _f
; X86-NEXT: .secidx _f
-; X86-NEXT: .short 0
+; X86-NEXT: .short 1
; X86-NEXT: .long [[END_OF_F]]-_f
; Segment for file 'D:\\one.c' begins
; X86-NEXT: [[FILE_SEGMENT_START:[^:]*]]:
@@ -67,6 +67,8 @@
; X86-NEXT: .long [[FILE_SEGMENT_END:.*]]-[[FILE_SEGMENT_START]]
; X86-NEXT: .long [[CALL_LINE_1]]-_f
; X86-NEXT: .long 1
+; X86-NEXT: .short 0
+; X86-NEXT: .short 0
; X86-NEXT: [[FILE_SEGMENT_END]]:
; Segment for file 'D:\\two.c' begins
; X86-NEXT: [[FILE_SEGMENT_START:[^:]*]]:
@@ -75,6 +77,8 @@
; X86-NEXT: .long [[FILE_SEGMENT_END:.*]]-[[FILE_SEGMENT_START]]
; X86-NEXT: .long [[CALL_LINE_2]]-_f
; X86-NEXT: .long 2
+; X86-NEXT: .short 0
+; X86-NEXT: .short 0
; X86-NEXT: [[FILE_SEGMENT_END]]:
; A new segment for file 'D:\\one.c' begins
; X86-NEXT: [[FILE_SEGMENT_START:[^:]*]]:
@@ -85,6 +89,10 @@
; X86-NEXT: .long 7
; X86-NEXT: .long [[RETURN_STMT]]-_f
; X86-NEXT: .long 8
+; X86-NEXT: .short 0
+; X86-NEXT: .short 0
+; X86-NEXT: .short 0
+; X86-NEXT: .short 0
; X86-NEXT: [[FILE_SEGMENT_END]]:
; X86-NEXT: [[F2_END]]:
; File index to string table offset subsection
@@ -126,19 +134,28 @@
; OBJ32-NEXT: ]
; OBJ32: FunctionLineTable [
; OBJ32-NEXT: Name: _f
+; OBJ32-NEXT: Flags: 0x1
; OBJ32-NEXT: CodeSize: 0x10
; OBJ32-NEXT: FilenameSegment [
; OBJ32-NEXT: Filename: D:\one.c
; OBJ32-NEXT: +0x0: 1
+; OBJ32-NEXT: ColStart: 0
+; OBJ32-NEXT: ColEnd: 0
; OBJ32-NEXT: ]
; OBJ32-NEXT: FilenameSegment [
; OBJ32-NEXT: Filename: D:\two.c
; OBJ32-NEXT: +0x5: 2
+; OBJ32-NEXT: ColStart: 0
+; OBJ32-NEXT: ColEnd: 0
; OBJ32-NEXT: ]
; OBJ32-NEXT: FilenameSegment [
; OBJ32-NEXT: Filename: D:\one.c
; OBJ32-NEXT: +0xA: 7
; OBJ32-NEXT: +0xF: 8
+; OBJ32-NEXT: ColStart: 0
+; OBJ32-NEXT: ColEnd: 0
+; OBJ32-NEXT: ColStart: 0
+; OBJ32-NEXT: ColEnd: 0
; OBJ32-NEXT: ]
; OBJ32-NEXT: ]
; OBJ32: }
@@ -189,7 +206,7 @@
; X64-NEXT: [[F2_START]]:
; X64-NEXT: .secrel32 f
; X64-NEXT: .secidx f
-; X64-NEXT: .short 0
+; X64-NEXT: .short 1
; X64-NEXT: .long [[END_OF_F]]-f
; Segment for file 'D:\\input.c' begins
; X64-NEXT: [[FILE_SEGMENT_START:[^:]*]]:
@@ -198,6 +215,8 @@
; X64-NEXT: .long [[FILE_SEGMENT_END:.*]]-[[FILE_SEGMENT_START]]
; X64-NEXT: .long [[START]]-f
; X64-NEXT: .long 3
+; X64-NEXT: .short 0
+; X64-NEXT: .short 0
; X64-NEXT: [[FILE_SEGMENT_END]]:
; Segment for file 'D:\\one.c' begins
; X64-NEXT: [[FILE_SEGMENT_START:[^:]*]]:
@@ -206,6 +225,8 @@
; X64-NEXT: .long [[FILE_SEGMENT_END:.*]]-[[FILE_SEGMENT_START]]
; X64-NEXT: .long [[CALL_LINE_1]]-f
; X64-NEXT: .long 1
+; X64-NEXT: .short 0
+; X64-NEXT: .short 0
; X64-NEXT: [[FILE_SEGMENT_END]]:
; Segment for file 'D:\\two.c' begins
; X64-NEXT: [[FILE_SEGMENT_START:[^:]*]]:
@@ -214,6 +235,8 @@
; X64-NEXT: .long [[FILE_SEGMENT_END:.*]]-[[FILE_SEGMENT_START]]
; X64-NEXT: .long [[CALL_LINE_2]]-f
; X64-NEXT: .long 2
+; X64-NEXT: .short 0
+; X64-NEXT: .short 0
; X64-NEXT: [[FILE_SEGMENT_END]]:
; A new segment for file 'D:\\one.c' begins
; X64-NEXT: [[FILE_SEGMENT_START:[^:]*]]:
@@ -224,6 +247,10 @@
; X64-NEXT: .long 7
; X64-NEXT: .long [[EPILOG_AND_RET]]-f
; X64-NEXT: .long 8
+; X64-NEXT: .short 0
+; X64-NEXT: .short 0
+; X64-NEXT: .short 0
+; X64-NEXT: .short 0
; X64-NEXT: [[FILE_SEGMENT_END]]:
; X64-NEXT: [[F2_END]]:
; File index to string table offset subsection
@@ -269,23 +296,34 @@
; OBJ64-NEXT: ]
; OBJ64: FunctionLineTable [
; OBJ64-NEXT: Name: f
+; OBJ64-NEXT: Flags: 0x1
; OBJ64-NEXT: CodeSize: 0x18
; OBJ64-NEXT: FilenameSegment [
; OBJ64-NEXT: Filename: D:\input.c
; OBJ64-NEXT: +0x0: 3
+; OBJ64-NEXT: ColStart: 0
+; OBJ64-NEXT: ColEnd: 0
; OBJ64-NEXT: ]
; OBJ64-NEXT: FilenameSegment [
; OBJ64-NEXT: Filename: D:\one.c
; OBJ64-NEXT: +0x4: 1
+; OBJ64-NEXT: ColStart: 0
+; OBJ64-NEXT: ColEnd: 0
; OBJ64-NEXT: ]
; OBJ64-NEXT: FilenameSegment [
; OBJ64-NEXT: Filename: D:\two.c
; OBJ64-NEXT: +0x9: 2
+; OBJ64-NEXT: ColStart: 0
+; OBJ64-NEXT: ColEnd: 0
; OBJ64-NEXT: ]
; OBJ64-NEXT: FilenameSegment [
; OBJ64-NEXT: Filename: D:\one.c
; OBJ64-NEXT: +0xE: 7
; OBJ64-NEXT: +0x13: 8
+; OBJ64-NEXT: ColStart: 0
+; OBJ64-NEXT: ColEnd: 0
+; OBJ64-NEXT: ColStart: 0
+; OBJ64-NEXT: ColEnd: 0
; OBJ64-NEXT: ]
; OBJ64-NEXT: ]
; OBJ64: }
diff --git a/test/DebugInfo/COFF/multifunction.ll b/test/DebugInfo/COFF/multifunction.ll
index 53a8115252d0..bbf97dd4afc0 100644
--- a/test/DebugInfo/COFF/multifunction.ll
+++ b/test/DebugInfo/COFF/multifunction.ll
@@ -82,7 +82,7 @@
; X86-NEXT: [[F2_START]]:
; X86-NEXT: .secrel32 _x
; X86-NEXT: .secidx _x
-; X86-NEXT: .short 0
+; X86-NEXT: .short 1
; X86-NEXT: .long [[END_OF_X]]-_x
; X86-NEXT: [[FILE_SEGMENT_START:[^:]*]]:
; X86-NEXT: .long 0
@@ -92,6 +92,10 @@
; X86-NEXT: .long 4
; X86-NEXT: .long [[X_RETURN]]-_x
; X86-NEXT: .long 5
+; X86-NEXT: .short 42
+; X86-NEXT: .short 42
+; X86-NEXT: .short 43
+; X86-NEXT: .short 43
; X86-NEXT: [[FILE_SEGMENT_END]]:
; X86-NEXT: [[F2_END]]:
; Symbol subsection for y
@@ -121,7 +125,7 @@
; X86-NEXT: [[F2_START]]:
; X86-NEXT: .secrel32 _y
; X86-NEXT: .secidx _y
-; X86-NEXT: .short 0
+; X86-NEXT: .short 1
; X86-NEXT: .long [[END_OF_Y]]-_y
; X86-NEXT: [[FILE_SEGMENT_START:[^:]*]]:
; X86-NEXT: .long 0
@@ -131,6 +135,10 @@
; X86-NEXT: .long 8
; X86-NEXT: .long [[Y_RETURN]]-_y
; X86-NEXT: .long 9
+; X86-NEXT: .short 52
+; X86-NEXT: .short 52
+; X86-NEXT: .short 53
+; X86-NEXT: .short 53
; X86-NEXT: [[FILE_SEGMENT_END]]:
; X86-NEXT: [[F2_END]]:
; Symbol subsection for f
@@ -160,7 +168,7 @@
; X86-NEXT: [[F2_START]]:
; X86-NEXT: .secrel32 _f
; X86-NEXT: .secidx _f
-; X86-NEXT: .short 0
+; X86-NEXT: .short 1
; X86-NEXT: .long [[END_OF_F]]-_f
; X86-NEXT: [[FILE_SEGMENT_START:[^:]*]]:
; X86-NEXT: .long 0
@@ -174,6 +182,14 @@
; X86-NEXT: .long 14
; X86-NEXT: .long [[F_RETURN]]-_f
; X86-NEXT: .long 15
+; X86-NEXT: .short 62
+; X86-NEXT: .short 62
+; X86-NEXT: .short 63
+; X86-NEXT: .short 63
+; X86-NEXT: .short 72
+; X86-NEXT: .short 72
+; X86-NEXT: .short 73
+; X86-NEXT: .short 73
; X86-NEXT: [[FILE_SEGMENT_END]]:
; X86-NEXT: [[F2_END]]:
; File index to string table offset subsection
@@ -198,14 +214,14 @@
; OBJ32-NEXT: 0x30 IMAGE_REL_I386_SECTION _x
; OBJ32-NEXT: 0x44 IMAGE_REL_I386_SECREL _x
; OBJ32-NEXT: 0x48 IMAGE_REL_I386_SECTION _x
-; OBJ32-NEXT: 0x94 IMAGE_REL_I386_SECREL _y
-; OBJ32-NEXT: 0x98 IMAGE_REL_I386_SECTION _y
-; OBJ32-NEXT: 0xAC IMAGE_REL_I386_SECREL _y
-; OBJ32-NEXT: 0xB0 IMAGE_REL_I386_SECTION _y
-; OBJ32-NEXT: 0xFC IMAGE_REL_I386_SECREL _f
-; OBJ32-NEXT: 0x100 IMAGE_REL_I386_SECTION _f
-; OBJ32-NEXT: 0x114 IMAGE_REL_I386_SECREL _f
-; OBJ32-NEXT: 0x118 IMAGE_REL_I386_SECTION _f
+; OBJ32-NEXT: 0x9C IMAGE_REL_I386_SECREL _y
+; OBJ32-NEXT: 0xA0 IMAGE_REL_I386_SECTION _y
+; OBJ32-NEXT: 0xB4 IMAGE_REL_I386_SECREL _y
+; OBJ32-NEXT: 0xB8 IMAGE_REL_I386_SECTION _y
+; OBJ32-NEXT: 0x10C IMAGE_REL_I386_SECREL _f
+; OBJ32-NEXT: 0x110 IMAGE_REL_I386_SECTION _f
+; OBJ32-NEXT: 0x124 IMAGE_REL_I386_SECREL _f
+; OBJ32-NEXT: 0x128 IMAGE_REL_I386_SECTION _f
; OBJ32-NEXT: ]
; OBJ32: Subsection [
; OBJ32-NEXT: Type: 0xF1
@@ -248,24 +264,35 @@
; OBJ32: ]
; OBJ32: FunctionLineTable [
; OBJ32-NEXT: Name: _x
+; OBJ32-NEXT: Flags: 0x1
; OBJ32-NEXT: CodeSize: 0x6
; OBJ32-NEXT: FilenameSegment [
; OBJ32-NEXT: Filename: D:\source.c
; OBJ32-NEXT: +0x0: 4
; OBJ32-NEXT: +0x5: 5
+; OBJ32-NEXT: ColStart: 42
+; OBJ32-NEXT: ColEnd: 42
+; OBJ32-NEXT: ColStart: 43
+; OBJ32-NEXT: ColEnd: 43
; OBJ32-NEXT: ]
; OBJ32-NEXT: ]
; OBJ32-NEXT: FunctionLineTable [
; OBJ32-NEXT: Name: _y
+; OBJ32-NEXT: Flags: 0x1
; OBJ32-NEXT: CodeSize: 0x6
; OBJ32-NEXT: FilenameSegment [
; OBJ32-NEXT: Filename: D:\source.c
; OBJ32-NEXT: +0x0: 8
; OBJ32-NEXT: +0x5: 9
+; OBJ32-NEXT: ColStart: 52
+; OBJ32-NEXT: ColEnd: 52
+; OBJ32-NEXT: ColStart: 53
+; OBJ32-NEXT: ColEnd: 53
; OBJ32-NEXT: ]
; OBJ32-NEXT: ]
; OBJ32-NEXT: FunctionLineTable [
; OBJ32-NEXT: Name: _f
+; OBJ32-NEXT: Flags: 0x1
; OBJ32-NEXT: CodeSize: 0x10
; OBJ32-NEXT: FilenameSegment [
; OBJ32-NEXT: Filename: D:\source.c
@@ -273,6 +300,14 @@
; OBJ32-NEXT: +0x5: 13
; OBJ32-NEXT: +0xA: 14
; OBJ32-NEXT: +0xF: 15
+; OBJ32-NEXT: ColStart: 62
+; OBJ32-NEXT: ColEnd: 62
+; OBJ32-NEXT: ColStart: 63
+; OBJ32-NEXT: ColEnd: 63
+; OBJ32-NEXT: ColStart: 72
+; OBJ32-NEXT: ColEnd: 72
+; OBJ32-NEXT: ColStart: 73
+; OBJ32-NEXT: ColEnd: 73
; OBJ32-NEXT: ]
; OBJ32-NEXT: ]
; OBJ32: }
@@ -349,7 +384,7 @@
; X64-NEXT: [[F2_START]]:
; X64-NEXT: .secrel32 x
; X64-NEXT: .secidx x
-; X64-NEXT: .short 0
+; X64-NEXT: .short 1
; X64-NEXT: .long [[END_OF_X]]-x
; X64-NEXT: [[FILE_SEGMENT_START:[^:]*]]:
; X64-NEXT: .long 0
@@ -361,6 +396,12 @@
; X64-NEXT: .long 4
; X64-NEXT: .long [[X_EPILOG_AND_RET]]-x
; X64-NEXT: .long 5
+; X64-NEXT: .short 0
+; X64-NEXT: .short 0
+; X64-NEXT: .short 42
+; X64-NEXT: .short 42
+; X64-NEXT: .short 43
+; X64-NEXT: .short 43
; X64-NEXT: [[FILE_SEGMENT_END]]:
; X64-NEXT: [[F2_END]]:
; Symbol subsection for y
@@ -390,7 +431,7 @@
; X64-NEXT: [[F2_START]]:
; X64-NEXT: .secrel32 y
; X64-NEXT: .secidx y
-; X64-NEXT: .short 0
+; X64-NEXT: .short 1
; X64-NEXT: .long [[END_OF_Y]]-y
; X64-NEXT: [[FILE_SEGMENT_START:[^:]*]]:
; X64-NEXT: .long 0
@@ -402,6 +443,12 @@
; X64-NEXT: .long 8
; X64-NEXT: .long [[Y_EPILOG_AND_RET]]-y
; X64-NEXT: .long 9
+; X64-NEXT: .short 0
+; X64-NEXT: .short 0
+; X64-NEXT: .short 52
+; X64-NEXT: .short 52
+; X64-NEXT: .short 53
+; X64-NEXT: .short 53
; X64-NEXT: [[FILE_SEGMENT_END]]:
; X64-NEXT: [[F2_END]]:
; Symbol subsection for f
@@ -431,7 +478,7 @@
; X64-NEXT: [[F2_START]]:
; X64-NEXT: .secrel32 f
; X64-NEXT: .secidx f
-; X64-NEXT: .short 0
+; X64-NEXT: .short 1
; X64-NEXT: .long [[END_OF_F]]-f
; X64-NEXT: [[FILE_SEGMENT_START:[^:]*]]:
; X64-NEXT: .long 0
@@ -447,6 +494,16 @@
; X64-NEXT: .long 14
; X64-NEXT: .long [[F_EPILOG_AND_RET]]-f
; X64-NEXT: .long 15
+; X64-NEXT: .short 0
+; X64-NEXT: .short 0
+; X64-NEXT: .short 62
+; X64-NEXT: .short 62
+; X64-NEXT: .short 63
+; X64-NEXT: .short 63
+; X64-NEXT: .short 72
+; X64-NEXT: .short 72
+; X64-NEXT: .short 73
+; X64-NEXT: .short 73
; X64-NEXT: [[FILE_SEGMENT_END]]:
; X64-NEXT: [[F2_END]]:
; File index to string table offset subsection
@@ -471,14 +528,14 @@
; OBJ64-NEXT: 0x30 IMAGE_REL_AMD64_SECTION x
; OBJ64-NEXT: 0x44 IMAGE_REL_AMD64_SECREL x
; OBJ64-NEXT: 0x48 IMAGE_REL_AMD64_SECTION x
-; OBJ64-NEXT: 0x9C IMAGE_REL_AMD64_SECREL y
-; OBJ64-NEXT: 0xA0 IMAGE_REL_AMD64_SECTION y
-; OBJ64-NEXT: 0xB4 IMAGE_REL_AMD64_SECREL y
-; OBJ64-NEXT: 0xB8 IMAGE_REL_AMD64_SECTION y
-; OBJ64-NEXT: 0x10C IMAGE_REL_AMD64_SECREL f
-; OBJ64-NEXT: 0x110 IMAGE_REL_AMD64_SECTION f
+; OBJ64-NEXT: 0xA8 IMAGE_REL_AMD64_SECREL y
+; OBJ64-NEXT: 0xAC IMAGE_REL_AMD64_SECTION y
+; OBJ64-NEXT: 0xC0 IMAGE_REL_AMD64_SECREL y
+; OBJ64-NEXT: 0xC4 IMAGE_REL_AMD64_SECTION y
; OBJ64-NEXT: 0x124 IMAGE_REL_AMD64_SECREL f
; OBJ64-NEXT: 0x128 IMAGE_REL_AMD64_SECTION f
+; OBJ64-NEXT: 0x13C IMAGE_REL_AMD64_SECREL f
+; OBJ64-NEXT: 0x140 IMAGE_REL_AMD64_SECTION f
; OBJ64-NEXT: ]
; OBJ64: Subsection [
; OBJ64-NEXT: Type: 0xF1
@@ -521,26 +578,41 @@
; OBJ64: ]
; OBJ64: FunctionLineTable [
; OBJ64-NEXT: Name: x
+; OBJ64-NEXT: Flags: 0x1
; OBJ64-NEXT: CodeSize: 0xE
; OBJ64-NEXT: FilenameSegment [
; OBJ64-NEXT: Filename: D:\source.c
; OBJ64-NEXT: +0x0: 3
; OBJ64-NEXT: +0x4: 4
; OBJ64-NEXT: +0x9: 5
+; OBJ64-NEXT: ColStart: 0
+; OBJ64-NEXT: ColEnd: 0
+; OBJ64-NEXT: ColStart: 42
+; OBJ64-NEXT: ColEnd: 42
+; OBJ64-NEXT: ColStart: 43
+; OBJ64-NEXT: ColEnd: 43
; OBJ64-NEXT: ]
; OBJ64-NEXT: ]
; OBJ64-NEXT: FunctionLineTable [
; OBJ64-NEXT: Name: y
+; OBJ64-NEXT: Flags: 0x1
; OBJ64-NEXT: CodeSize: 0xE
; OBJ64-NEXT: FilenameSegment [
; OBJ64-NEXT: Filename: D:\source.c
; OBJ64-NEXT: +0x0: 7
; OBJ64-NEXT: +0x4: 8
; OBJ64-NEXT: +0x9: 9
+; OBJ64-NEXT: ColStart: 0
+; OBJ64-NEXT: ColEnd: 0
+; OBJ64-NEXT: ColStart: 52
+; OBJ64-NEXT: ColEnd: 52
+; OBJ64-NEXT: ColStart: 53
+; OBJ64-NEXT: ColEnd: 53
; OBJ64-NEXT: ]
; OBJ64-NEXT: ]
; OBJ64-NEXT: FunctionLineTable [
; OBJ64-NEXT: Name: f
+; OBJ64-NEXT: Flags: 0x1
; OBJ64-NEXT: CodeSize: 0x18
; OBJ64-NEXT: FilenameSegment [
; OBJ64-NEXT: Filename: D:\source.c
@@ -549,6 +621,16 @@
; OBJ64-NEXT: +0x9: 13
; OBJ64-NEXT: +0xE: 14
; OBJ64-NEXT: +0x13: 15
+; OBJ64-NEXT: ColStart: 0
+; OBJ64-NEXT: ColEnd: 0
+; OBJ64-NEXT: ColStart: 62
+; OBJ64-NEXT: ColEnd: 62
+; OBJ64-NEXT: ColStart: 63
+; OBJ64-NEXT: ColEnd: 63
+; OBJ64-NEXT: ColStart: 72
+; OBJ64-NEXT: ColEnd: 72
+; OBJ64-NEXT: ColStart: 73
+; OBJ64-NEXT: ColEnd: 73
; OBJ64-NEXT: ]
; OBJ64-NEXT: ]
; OBJ64: }
@@ -599,11 +681,11 @@ attributes #1 = { "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "
!11 = !{i32 2, !"Dwarf Version", i32 4}
!12 = !{i32 1, !"Debug Info Version", i32 3}
!13 = !{!"clang version 3.5 "}
-!14 = !DILocation(line: 4, scope: !4)
-!15 = !DILocation(line: 5, scope: !4)
-!16 = !DILocation(line: 8, scope: !9)
-!17 = !DILocation(line: 9, scope: !9)
-!18 = !DILocation(line: 12, scope: !10)
-!19 = !DILocation(line: 13, scope: !10)
-!20 = !DILocation(line: 14, scope: !10)
-!21 = !DILocation(line: 15, scope: !10)
+!14 = !DILocation(line: 4, column: 42, scope: !4)
+!15 = !DILocation(line: 5, column: 43, scope: !4)
+!16 = !DILocation(line: 8, column: 52, scope: !9)
+!17 = !DILocation(line: 9, column: 53, scope: !9)
+!18 = !DILocation(line: 12, column: 62, scope: !10)
+!19 = !DILocation(line: 13, column: 63, scope: !10)
+!20 = !DILocation(line: 14, column: 72, scope: !10)
+!21 = !DILocation(line: 15, column: 73, scope: !10)
diff --git a/test/DebugInfo/COFF/simple.ll b/test/DebugInfo/COFF/simple.ll
index 9cb1d1e1d904..0d9857c7831d 100644
--- a/test/DebugInfo/COFF/simple.ll
+++ b/test/DebugInfo/COFF/simple.ll
@@ -49,7 +49,7 @@
; X86-NEXT: [[F2_START]]:
; X86-NEXT: .secrel32 _f
; X86-NEXT: .secidx _f
-; X86-NEXT: .short 0
+; X86-NEXT: .short 1
; X86-NEXT: .long [[END_OF_F]]-_f
; X86-NEXT: [[FILE_SEGMENT_START:[^:]*]]:
; X86-NEXT: .long 0
@@ -59,6 +59,10 @@
; X86-NEXT: .long 4
; X86-NEXT: .long [[RETURN_STMT]]-_f
; X86-NEXT: .long 5
+; X86-NEXT: .short 0
+; X86-NEXT: .short 0
+; X86-NEXT: .short 0
+; X86-NEXT: .short 0
; X86-NEXT: [[FILE_SEGMENT_END]]:
; X86-NEXT: [[F2_END]]:
; File index to string table offset subsection
@@ -97,11 +101,16 @@
; OBJ32-NEXT: ]
; OBJ32: FunctionLineTable [
; OBJ32-NEXT: Name: _f
+; OBJ32-NEXT: Flags: 0x1
; OBJ32-NEXT: CodeSize: 0x6
; OBJ32-NEXT: FilenameSegment [
; OBJ32-NEXT: Filename: D:\test.c
; OBJ32-NEXT: +0x0: 4
; OBJ32-NEXT: +0x5: 5
+; OBJ32-NEXT: ColStart: 0
+; OBJ32-NEXT: ColEnd: 0
+; OBJ32-NEXT: ColStart: 0
+; OBJ32-NEXT: ColEnd: 0
; OBJ32-NEXT: ]
; OBJ32-NEXT: ]
; OBJ32: }
@@ -148,7 +157,7 @@
; X64-NEXT: [[F2_START]]:
; X64-NEXT: .secrel32 f
; X64-NEXT: .secidx f
-; X64-NEXT: .short 0
+; X64-NEXT: .short 1
; X64-NEXT: .long [[END_OF_F]]-f
; X64-NEXT: [[FILE_SEGMENT_START:[^:]*]]:
; X64-NEXT: .long 0
@@ -160,6 +169,12 @@
; X64-NEXT: .long 4
; X64-NEXT: .long [[EPILOG_AND_RET]]-f
; X64-NEXT: .long 5
+; X64-NEXT: .short 0
+; X64-NEXT: .short 0
+; X64-NEXT: .short 0
+; X64-NEXT: .short 0
+; X64-NEXT: .short 0
+; X64-NEXT: .short 0
; X64-NEXT: [[FILE_SEGMENT_END]]:
; X64-NEXT: [[F2_END]]:
; File index to string table offset subsection
@@ -198,12 +213,19 @@
; OBJ64-NEXT: ]
; OBJ64: FunctionLineTable [
; OBJ64-NEXT: Name: f
+; OBJ64-NEXT: Flags: 0x1
; OBJ64-NEXT: CodeSize: 0xE
; OBJ64-NEXT: FilenameSegment [
; OBJ64-NEXT: Filename: D:\test.c
; OBJ64-NEXT: +0x0: 3
; OBJ64-NEXT: +0x4: 4
; OBJ64-NEXT: +0x9: 5
+; OBJ64-NEXT: ColStart: 0
+; OBJ64-NEXT: ColEnd: 0
+; OBJ64-NEXT: ColStart: 0
+; OBJ64-NEXT: ColEnd: 0
+; OBJ64-NEXT: ColStart: 0
+; OBJ64-NEXT: ColEnd: 0
; OBJ64-NEXT: ]
; OBJ64-NEXT: ]
; OBJ64: }
diff --git a/test/ExecutionEngine/RuntimeDyld/Mips/ELF_O32R6_relocations.s b/test/ExecutionEngine/RuntimeDyld/Mips/ELF_O32R6_relocations.s
new file mode 100644
index 000000000000..3d20db21964a
--- /dev/null
+++ b/test/ExecutionEngine/RuntimeDyld/Mips/ELF_O32R6_relocations.s
@@ -0,0 +1,49 @@
+# RUN: llvm-mc -triple=mipsel-unknown-linux -mcpu=mips32r6 -relocation-model=pic -code-model=small -filetype=obj -o %T/test_ELF_O32R6.o %s
+# RUN: llc -mtriple=mipsel-unknown-linux -mcpu=mips32r6 -relocation-model=pic -filetype=obj -o %T/test_ELF_ExternalFunction_O32R6.o %S/Inputs/ExternalFunction.ll
+# RUN: llvm-rtdyld -triple=mipsel-unknown-linux -mcpu=mips32r6 -verify -map-section test_ELF_O32R6.o,.text=0x1000 -map-section test_ELF_ExternalFunction_O32R6.o,.text=0x10000 -check=%s %/T/test_ELF_O32R6.o %T/test_ELF_ExternalFunction_O32R6.o
+
+# RUN: llvm-mc -triple=mips-unknown-linux -mcpu=mips32r6 -relocation-model=pic -code-model=small -filetype=obj -o %T/test_ELF_O32R6.o %s
+# RUN: llc -mtriple=mips-unknown-linux -mcpu=mips32r6 -relocation-model=pic -filetype=obj -o %T/test_ELF_ExternalFunction_O32R6.o %S/Inputs/ExternalFunction.ll
+# RUN: llvm-rtdyld -triple=mips-unknown-linux -mcpu=mips32r6 -verify -map-section test_ELF_O32R6.o,.text=0x1000 -map-section test_ELF_ExternalFunction_O32R6.o,.text=0x10000 -check=%s %/T/test_ELF_O32R6.o %T/test_ELF_ExternalFunction_O32R6.o
+
+ .text
+ .abicalls
+ .nan 2008
+ .text
+ .set nomicromips
+ .set nomips16
+ .set noreorder
+ .set nomacro
+ .set noat
+
+ .align 3
+ .globl bar
+ .type bar,@function
+
+bar:
+# Test R_MIPS_PC19_S2 relocation.
+# rtdyld-check: decode_operand(R_MIPS_PC19_S2, 1)[20:0] = (foo - R_MIPS_PC19_S2)[20:0]
+R_MIPS_PC19_S2:
+ lwpc $6,foo
+
+# Test R_MIPS_PC21_S2 relocation.
+# rtdyld-check: decode_operand(R_MIPS_PC21_S2, 1)[22:0] = (foo - next_pc(R_MIPS_PC21_S2))[22:0]
+R_MIPS_PC21_S2:
+ bnezc $5,foo
+
+# Test R_MIPS_PC26_S2 relocation.
+# rtdyld-check: decode_operand(R_MIPS_PC26_S2, 0)[27:0] = (foo - next_pc(R_MIPS_PC26_S2))[27:0]
+R_MIPS_PC26_S2:
+ balc foo
+
+# Test R_MIPS_PCHI16 relocation.
+# rtdyld-check: decode_operand(R_MIPS_PCHI16, 1)[15:0] = (foo - R_MIPS_PCHI16 + 0x8000)[31:16]
+R_MIPS_PCHI16:
+ aluipc $5, %pcrel_hi(foo)
+
+# Test R_MIPS_PCLO16 relocation.
+# rtdyld-check: decode_operand(R_MIPS_PCLO16, 2)[15:0] = (foo - R_MIPS_PCLO16)[15:0]
+R_MIPS_PCLO16:
+ addiu $5, $5, %pcrel_lo(foo)
+
+ .size bar, .-bar
diff --git a/test/ExecutionEngine/RuntimeDyld/Mips/ELF_O32_PIC_relocations.s b/test/ExecutionEngine/RuntimeDyld/Mips/ELF_O32_PIC_relocations.s
index a4b145ab5172..6c47262246ab 100644
--- a/test/ExecutionEngine/RuntimeDyld/Mips/ELF_O32_PIC_relocations.s
+++ b/test/ExecutionEngine/RuntimeDyld/Mips/ELF_O32_PIC_relocations.s
@@ -39,6 +39,11 @@ R_MIPS_26:
j foo
nop
+# rtdyld-check: decode_operand(R_MIPS_PC16, 1)[17:0] = (foo - R_MIPS_PC16)[17:0]
+R_MIPS_PC16:
+ bal foo
+ nop
+
# rtdyld-check: decode_operand(R_MIPS_HI16, 1)[15:0] = foo[31:16]
R_MIPS_HI16:
lui $1, %hi(foo)
diff --git a/test/ExecutionEngine/RuntimeDyld/X86/MachO_i386_DynNoPIC_relocations.s b/test/ExecutionEngine/RuntimeDyld/X86/MachO_i386_DynNoPIC_relocations.s
index 6b2fe9532c2f..17c332a11c33 100644
--- a/test/ExecutionEngine/RuntimeDyld/X86/MachO_i386_DynNoPIC_relocations.s
+++ b/test/ExecutionEngine/RuntimeDyld/X86/MachO_i386_DynNoPIC_relocations.s
@@ -1,7 +1,10 @@
# RUN: llvm-mc -triple=i386-apple-macosx10.4 -relocation-model=dynamic-no-pic -filetype=obj -o %T/test_i386.o %s
# RUN: llvm-rtdyld -triple=i386-apple-macosx10.4 -verify -check=%s %/T/test_i386.o
- .section __TEXT,__text,regular,pure_instructions
+// Put the section used in the test at a non-zero address.
+ .long 4
+
+ .section __TEXT,__text2,regular,pure_instructions
.globl bar
.align 4, 0x90
bar:
diff --git a/test/ExecutionEngine/RuntimeDyld/X86/MachO_x86-64_PIC_relocations.s b/test/ExecutionEngine/RuntimeDyld/X86/MachO_x86-64_PIC_relocations.s
index f28e4d245994..2ef8cc439df1 100644
--- a/test/ExecutionEngine/RuntimeDyld/X86/MachO_x86-64_PIC_relocations.s
+++ b/test/ExecutionEngine/RuntimeDyld/X86/MachO_x86-64_PIC_relocations.s
@@ -1,5 +1,5 @@
# RUN: llvm-mc -triple=x86_64-apple-macosx10.9 -relocation-model=pic -filetype=obj -o %T/test_x86-64.o %s
-# RUN: llvm-rtdyld -triple=x86_64-apple-macosx10.9 -verify -check=%s %/T/test_x86-64.o
+# RUN: llvm-rtdyld -triple=x86_64-apple-macosx10.9 -dummy-extern ds1=0xfffffffffffffffe -dummy-extern ds2=0xffffffffffffffff -verify -check=%s %/T/test_x86-64.o
.section __TEXT,__text,regular,pure_instructions
.globl foo
@@ -46,4 +46,15 @@ eh_frame_test:
x:
.long 5
+# Test dummy-extern relocation.
+# rtdyld-check: *{8}z1 = ds1
+z1:
+ .quad ds1
+
+# Test external-symbol relocation bypass: symbols with addr 0xffffffffffffffff
+# don't have their relocations applied.
+# rtdyld-check: *{8}z2 = 0
+z2:
+ .quad ds2
+
.subsections_via_symbols
diff --git a/test/LibDriver/infer-output-path.test b/test/LibDriver/infer-output-path.test
new file mode 100644
index 000000000000..7a1bbcbbd18d
--- /dev/null
+++ b/test/LibDriver/infer-output-path.test
@@ -0,0 +1,15 @@
+RUN: llvm-mc -triple=x86_64-pc-windows-msvc -filetype=obj -o %T/a.obj %S/Inputs/a.s
+RUN: llvm-mc -triple=x86_64-pc-windows-msvc -filetype=obj -o %T/b.o %S/Inputs/b.s
+RUN: llvm-mc -triple=x86_64-pc-windows-msvc -filetype=obj -o %T/c %S/Inputs/b.s
+
+RUN: rm -f %T/a.lib
+RUN: llvm-lib %T/a.obj
+RUN: test -e %T/a.lib
+
+RUN: rm -f %T/b.lib
+RUN: llvm-lib /libpath:%T b.o
+RUN: test -e %T/b.lib
+
+RUN: rm -f %T/c.lib
+RUN: llvm-lib /libpath:%T c
+RUN: test -e %T/c.lib
diff --git a/test/MC/AArch64/basic-a64-instructions.s b/test/MC/AArch64/basic-a64-instructions.s
index 5d33a4f933b3..f8e49432145b 100644
--- a/test/MC/AArch64/basic-a64-instructions.s
+++ b/test/MC/AArch64/basic-a64-instructions.s
@@ -1489,6 +1489,9 @@ _func:
// CHECK: clz w24, wzr // encoding: [0xf8,0x13,0xc0,0x5a]
// CHECK: rev x22, xzr // encoding: [0xf6,0x0f,0xc0,0xda]
+ rev64 x13, x12
+// CHECK: rev x13, x12 // encoding: [0x8d,0x0d,0xc0,0xda]
+
//------------------------------------------------------------------------------
// Data-processing (2 source)
//------------------------------------------------------------------------------
diff --git a/test/MC/ARM/basic-thumb2-instructions.s b/test/MC/ARM/basic-thumb2-instructions.s
index 05e0b2b574e9..c289325d6d12 100644
--- a/test/MC/ARM/basic-thumb2-instructions.s
+++ b/test/MC/ARM/basic-thumb2-instructions.s
@@ -49,7 +49,6 @@ _func:
adcs r0, r1, r3, lsl #7
adc.w r0, r1, r3, lsr #31
adcs.w r0, r1, r3, asr #32
- add r2, sp, ip
@ CHECK: adc.w r4, r5, r6 @ encoding: [0x45,0xeb,0x06,0x04]
@ CHECK: adcs.w r4, r5, r6 @ encoding: [0x55,0xeb,0x06,0x04]
@@ -59,7 +58,6 @@ _func:
@ CHECK: adcs.w r0, r1, r3, lsl #7 @ encoding: [0x51,0xeb,0xc3,0x10]
@ CHECK: adc.w r0, r1, r3, lsr #31 @ encoding: [0x41,0xeb,0xd3,0x70]
@ CHECK: adcs.w r0, r1, r3, asr #32 @ encoding: [0x51,0xeb,0x23,0x00]
-@ CHECK: add.w r2, sp, r12 @ encoding: [0x0d,0xeb,0x0c,0x02]
@------------------------------------------------------------------------------
@@ -115,23 +113,99 @@ _func:
@------------------------------------------------------------------------------
-@ ADD (register)
+@ ADD (register, not SP) A8.8.6
@------------------------------------------------------------------------------
add r1, r2, r8
add r5, r9, r2, asr #32
adds r7, r3, r1, lsl #31
adds.w r0, r3, r6, lsr #25
add.w r4, r8, r1, ror #12
+ adds r1, r1, r7 // T1
+ it eq
+ addeq r1, r3, r5 // T1
+ it eq
+ addeq r1, r1, r5 // T1
+ it eq
+ addseq r1, r3, r5 // T3
+ it eq
+ addseq r1, r1, r5 // T3
add r10, r8
add r10, r10, r8
+ it eq
+ addeq r1, r10 // T2
+ it eq
+ addseq r1, r10 // T3
@ CHECK: add.w r1, r2, r8 @ encoding: [0x02,0xeb,0x08,0x01]
@ CHECK: add.w r5, r9, r2, asr #32 @ encoding: [0x09,0xeb,0x22,0x05]
@ CHECK: adds.w r7, r3, r1, lsl #31 @ encoding: [0x13,0xeb,0xc1,0x77]
@ CHECK: adds.w r0, r3, r6, lsr #25 @ encoding: [0x13,0xeb,0x56,0x60]
@ CHECK: add.w r4, r8, r1, ror #12 @ encoding: [0x08,0xeb,0x31,0x34]
+@ CHECK: adds r1, r1, r7 @ encoding: [0xc9,0x19]
+@ CHECK: it eq @ encoding: [0x08,0xbf]
+@ CHECK: addeq r1, r3, r5 @ encoding: [0x59,0x19]
+@ CHECK: it eq @ encoding: [0x08,0xbf]
+@ CHECK: addeq r1, r1, r5 @ encoding: [0x49,0x19]
+@ CHECK: it eq @ encoding: [0x08,0xbf]
+@ CHECK: addseq.w r1, r3, r5 @ encoding: [0x13,0xeb,0x05,0x01]
+@ CHECK: it eq @ encoding: [0x08,0xbf]
+@ CHECK: addseq.w r1, r1, r5 @ encoding: [0x11,0xeb,0x05,0x01]
@ CHECK: add r10, r8 @ encoding: [0xc2,0x44]
@ CHECK: add r10, r8 @ encoding: [0xc2,0x44]
+@ CHECK: it eq @ encoding: [0x08,0xbf]
+@ CHECK: addeq r1, r10 @ encoding: [0x51,0x44]
+@ CHECK: it eq @ encoding: [0x08,0xbf]
+@ CHECK: addseq.w r1, r1, r10 @ encoding: [0x11,0xeb,0x0a,0x01]
+
+@------------------------------------------------------------------------------
+@ ADD (SP plus immediate) A8.8.9
+@------------------------------------------------------------------------------
+ it eq
+@ CHECK: it eq @ encoding: [0x08,0xbf]
+ addeq r7, sp, #1020 // T1
+@ CHECK: addeq r7, sp, #1020 @ encoding: [0xff,0xaf]
+
+ it eq
+@ CHECK: it eq @ encoding: [0x08,0xbf]
+ addeq sp, sp, #508 // T2
+@ FIXME: ARMARM says 'addeq sp, sp, #508'
+@ CHECK: addeq sp, #508 @ encoding: [0x7f,0xb0]
+
+ add r7, sp, #15 // T3
+@ CHECK: add.w r7, sp, #15 @ encoding: [0x0d,0xf1,0x0f,0x07]
+ adds r7, sp, #16 // T3
+@ CHECK: adds.w r7, sp, #16 @ encoding: [0x1d,0xf1,0x10,0x07]
+ add r8, sp, #16 // T3
+@ CHECK: add.w r8, sp, #16 @ encoding: [0x0d,0xf1,0x10,0x08]
+
+ addw r6, sp, #1020 // T4
+@ CHECK: addw r6, sp, #1020 @ encoding: [0x0d,0xf2,0xfc,0x36]
+ add r6, sp, #1019 // T4
+@ CHECK: addw r6, sp, #1019 @ encoding: [0x0d,0xf2,0xfb,0x36]
+
+@------------------------------------------------------------------------------
+@ ADD (SP plus register) A8.8.10
+@------------------------------------------------------------------------------
+ it eq
+@ CHECK: it eq @ encoding: [0x08,0xbf]
+ addeq r8, sp, r8 // T1
+@ CHECK: addeq r8, sp, r8 @ encoding: [0xe8,0x44]
+ it eq
+@ CHECK: it eq @ encoding: [0x08,0xbf]
+ addeq r8, sp // T1
+@ CHECK: addeq r8, sp @ encoding: [0xe8,0x44]
+
+ it eq
+@ CHECK: it eq @ encoding: [0x08,0xbf]
+ addeq sp, r9 // T2
+@ CHECK: addeq sp, r9 @ encoding: [0xcd,0x44]
+
+ add r2, sp, ip // T3
+@ CHECK: add.w r2, sp, r12 @ encoding: [0x0d,0xeb,0x0c,0x02]
+ it eq
+@ CHECK: it eq @ encoding: [0x08,0xbf]
+ addeq r2, sp, ip // T3
+@ CHECK: addeq.w r2, sp, r12 @ encoding: [0x0d,0xeb,0x0c,0x02]
@------------------------------------------------------------------------------
diff --git a/test/MC/ARM/thumb2-narrow-dp.ll b/test/MC/ARM/thumb2-narrow-dp.ll
index ae2ba355d41f..050e97133997 100644
--- a/test/MC/ARM/thumb2-narrow-dp.ll
+++ b/test/MC/ARM/thumb2-narrow-dp.ll
@@ -6,11 +6,102 @@
// Assemblers should choose the narrow thumb encoding when possible, i.e.
// - Rd == Rn
// - Rd, Rn and Rm are < r8
-// In addition, some operations are commutative, allowing the transormation
+// In addition, some operations are commutative, allowing the transformation
// when:
// - Rd == Rn || Rd == Rm
// - Rd, Rn and Rm are < r8
+// ADD immediate (not SP) A8.8.4
+ ADDS r0, r0, #5 // T1
+// CHECK: adds r0, r0, #5 @ encoding: [0x40,0x1d]
+ ADDS r1, r1, #8 // T2
+// CHECK: adds r1, #8 @ encoding: [0x08,0x31]
+ ADDS.W r1, r1, #8 // .w => T3
+// CHECK: adds.w r1, r1, #8 @ encoding: [0x11,0xf1,0x08,0x01]
+ ADDS r8, r8, #8 // T3
+// CHECK: adds.w r8, r8, #8 @ encoding: [0x18,0xf1,0x08,0x08]
+
+ IT EQ
+// CHECK: it eq @ encoding: [0x08,0xbf]
+ ADDEQ r0, r0, #5 // T1
+// CHECK: addeq r0, r0, #5 @ encoding: [0x40,0x1d]
+ IT EQ
+// CHECK: it eq @ encoding: [0x08,0xbf]
+ ADDEQ r1, r1, #8 // T2
+// CHECK: addeq r1, #8 @ encoding: [0x08,0x31]
+
+ IT EQ
+// CHECK: it eq @ encoding: [0x08,0xbf]
+ ADDSEQ r0, r0, #5 // T3
+// CHECK: addseq.w r0, r0, #5 @ encoding: [0x10,0xf1,0x05,0x00]
+ IT EQ
+// CHECK: it eq @ encoding: [0x08,0xbf]
+ ADDSEQ r1, r1, #8 // T3
+// CHECK: addseq.w r1, r1, #8 @ encoding: [0x11,0xf1,0x08,0x01]
+
+// ADD register (not SP) A8.8.6 (commutative)
+ ADDS r0, r2, r1 // ADDS has T1 narrow 3 operand
+// CHECK: adds r0, r2, r1 @ encoding: [0x50,0x18]
+ ADDS r2, r2, r1 // ADDS has T1 narrow 3 operand
+// CHECK: adds r2, r2, r1 @ encoding: [0x52,0x18]
+ ADD r3, r1, r3 // T2
+// CHECK: add r3, r1 @ encoding: [0x0b,0x44]
+
+ IT EQ
+// CHECK: it eq @ encoding: [0x08,0xbf]
+ ADDEQ r0, r2, r1 // (In IT) ADD has T1 narrow 3 operand
+// CHECK: addeq r0, r2, r1 @ encoding: [0x50,0x18]
+ IT EQ
+// CHECK: it eq @ encoding: [0x08,0xbf]
+ ADDEQ r2, r2, r1 // (In IT) ADD has T1 narrow 3 operand
+// CHECK: addeq r2, r2, r1 @ encoding: [0x52,0x18]
+
+ IT EQ
+// CHECK: it eq @ encoding: [0x08,0xbf]
+ ADDSEQ r0, r2, r1 // T3
+// CHECK: addseq.w r0, r2, r1 @ encoding: [0x12,0xeb,0x01,0x00]
+ IT EQ
+// CHECK: it eq @ encoding: [0x08,0xbf]
+ ADDSEQ r2, r2, r1 // T3
+// CHECK: addseq.w r2, r2, r1 @ encoding: [0x12,0xeb,0x01,0x02]
+
+ ADD r3, r3, r1 // T2
+// CHECK: add r3, r1 @ encoding: [0x0b,0x44]
+ ADD r4, r4, pc // T2
+// CHECK: add r4, pc @ encoding: [0x7c,0x44]
+ ADD r4, pc, r4 // T2
+// CHECK: add r4, pc @ encoding: [0x7c,0x44]
+ ADD pc, pc, r2 // T2
+// CHECK: add pc, r2 @ encoding: [0x97,0x44]
+ ADD pc, r2, pc // T2
+// CHECK: add pc, r2 @ encoding: [0x97,0x44]
+ ADD pc, pc, sp // T2
+// CHECK: add pc, sp @ encoding: [0xef,0x44]
+ ADD pc, sp, pc // T2
+// CHECK: add pc, sp, pc @ encoding: [0xef,0x44]
+
+// ADD (SP plus immediate) A8.8.9
+ ADD sp, sp, #20 // T2
+// FIXME: ARMARM says 'add sp, sp, #20'
+// CHECK: add sp, #20 @ encoding: [0x05,0xb0]
+ ADD sp, sp, #508 // T2
+// CHECK: add sp, #508 @ encoding: [0x7f,0xb0]
+ ADD sp, sp, #512 // T3
+// CHECK: add.w sp, sp, #512 @ encoding: [0x0d,0xf5,0x00,0x7d]
+
+// ADD (SP plus register) A8.8.10 (commutative)
+ ADD r9, sp, r9 // T1
+// CHECK: add r9, sp, r9 @ encoding: [0xe9,0x44]
+ ADD r9, r9, sp // T1
+// FIXME: ARMARM says 'add r9, sp, r9'
+// CHECK: add r9, sp @ encoding: [0xe9,0x44]
+ ADD sp, sp, r10 // T2
+// CHECK: add sp, r10 @ encoding: [0xd5,0x44]
+ ADD sp, r10, sp // T2
+// CHECK: add sp, r10 @ encoding: [0xd5,0x44]
+ ADD sp, sp, pc // T2
+// CHECK: add sp, pc @ encoding: [0xfd,0x44]
+
// AND (commutative)
ANDS r0, r2, r1 // Must be wide - 3 distinct registers
ANDS r2, r2, r1 // Should choose narrow
diff --git a/test/MC/ARM/thumb_rewrites.s b/test/MC/ARM/thumb_rewrites.s
index c9d625e60de7..06c77e89862f 100644
--- a/test/MC/ARM/thumb_rewrites.s
+++ b/test/MC/ARM/thumb_rewrites.s
@@ -1,4 +1,11 @@
@ RUN: llvm-mc -triple thumbv6m -show-encoding < %s | FileCheck %s
+@ RUN: llvm-mc -triple thumbv7m -show-encoding < %s | FileCheck %s
+
+ adds r1, r1, #3
+@ CHECK: adds r1, r1, #3 @ encoding: [0xc9,0x1c]
+
+ adds r1, #3
+@ CHECK: adds r1, #3 @ encoding: [0x03,0x31]
adds r0, r0, #8
@ CHECK: adds r0, #8 @ encoding: [0x08,0x30]
@@ -9,9 +16,25 @@
add r0, r0, r8
@ CHECK: add r0, r8 @ encoding: [0x40,0x44]
+ add r1, r8, r1
+@ CHECK: add r1, r8 @ encoding: [0x41,0x44]
+
add sp, sp, r0
@ CHECK: add sp, r0 @ encoding: [0x85,0x44]
+ add r4, sp, r4
+@ CHECK: add r4, sp, r4 @ encoding: [0x6c,0x44]
+
+ add r4, r4, sp
+@ CHECK: add r4, sp @ encoding: [0x6c,0x44]
+
+ add sp, sp, #32
+@ FIXME: ARMARM says 'add sp, sp, #32'
+@ CHECK: add sp, #32 @ encoding: [0x08,0xb0]
+
+ add r5, sp, #1016
+@ CHECK: add r5, sp, #1016 @ encoding: [0xfe,0xad]
+
add r0, r0, r1
@ CHECK: add r0, r1 @ encoding: [0x08,0x44]
@@ -21,12 +44,30 @@
subs r0, r0, r0
@ CHECK: subs r0, r0, r0 @ encoding: [0x00,0x1a]
+ subs r3, r3, #5
+@ CHECK: subs r3, r3, #5 @ encoding: [0x5b,0x1f]
+
+ subs r3, #5
+@ CHECK: subs r3, #5 @ encoding: [0x05,0x3b]
+
+ subs r2, r2, #8
+@ CHECK: subs r2, #8 @ encoding: [0x08,0x3a]
+
+ sub sp, sp, #16
+@ CHECK: sub sp, #16 @ encoding: [0x84,0xb0]
+
+ ands r0, r1, r0
+@ CHECK: ands r0, r1 @ encoding: [0x08,0x40]
+
ands r0, r0, r1
@ CHECK: ands r0, r1 @ encoding: [0x08,0x40]
eors r0, r0, r1
@ CHECK: eors r0, r1 @ encoding: [0x48,0x40]
+ eors r0, r1, r0
+@ CHECK: eors r0, r1 @ encoding: [0x48,0x40]
+
lsls r0, r0, r1
@ CHECK: lsls r0, r1 @ encoding: [0x88,0x40]
@@ -39,6 +80,9 @@
adcs r0, r0, r1
@ CHECK: adcs r0, r1 @ encoding: [0x48,0x41]
+ adcs r0, r1, r0
+@ CHECK: adcs r0, r1 @ encoding: [0x48,0x41]
+
sbcs r0, r0, r1
@ CHECK: sbcs r0, r1 @ encoding: [0x88,0x41]
@@ -48,5 +92,8 @@
orrs r0, r0, r1
@ CHECK: orrs r0, r1 @ encoding: [0x08,0x43]
+ orrs r0, r1, r0
+@ CHECK: orrs r0, r1 @ encoding: [0x08,0x43]
+
bics r0, r0, r1
@ CHECK: bics r0, r1 @ encoding: [0x88,0x43]
diff --git a/test/MC/COFF/safeseh.s b/test/MC/COFF/safeseh.s
new file mode 100644
index 000000000000..d21628daff5f
--- /dev/null
+++ b/test/MC/COFF/safeseh.s
@@ -0,0 +1,6 @@
+// RUN: llvm-mc -triple i686-pc-win32 %s | FileCheck %s
+
+// check that we quote the output of .safeseh
+
+.safeseh "\01foo"
+// CHECK: .safeseh "\01foo"
diff --git a/test/MC/ELF/relax-arith.s b/test/MC/ELF/relax-arith.s
index d4f37a9ddf9f..15e44ebff7ef 100644
--- a/test/MC/ELF/relax-arith.s
+++ b/test/MC/ELF/relax-arith.s
@@ -115,3 +115,11 @@ bar:
cmpl $foo, bar
cmp $foo, %rbx
cmpq $foo, bar
+
+// CHECK: Disassembly of section push:
+// CHECK-NEXT: push:
+// CHECK-NEXT: 0: 66 68 00 00 pushw $0
+// CHECK-NEXT: 4: 68 00 00 00 00 pushq $0
+ .section push,"x"
+ pushw $foo
+ push $foo
diff --git a/test/MC/ELF/relax-arith2.s b/test/MC/ELF/relax-arith2.s
index a6c55adf894b..b05418482e32 100644
--- a/test/MC/ELF/relax-arith2.s
+++ b/test/MC/ELF/relax-arith2.s
@@ -116,3 +116,15 @@ bar:
cmpl $1, bar
cmp $-1, %rbx
cmpq $42, bar
+
+// CHECK: Disassembly of section push:
+// CHECK-NEXT: push:
+// CHECK-NEXT: 0: 66 6a 80 pushw $-128
+// CHECK-NEXT: 3: 66 6a 7f pushw $127
+// CHECK-NEXT: 6: 6a 80 pushq $-128
+// CHECK-NEXT: 8: 6a 7f pushq $127
+ .section push,"x"
+ pushw $-128
+ pushw $127
+ push $-128
+ push $127
diff --git a/test/MC/ELF/relax-arith4.s b/test/MC/ELF/relax-arith4.s
new file mode 100644
index 000000000000..3fd3cadb76ab
--- /dev/null
+++ b/test/MC/ELF/relax-arith4.s
@@ -0,0 +1,25 @@
+// RUN: llvm-mc -filetype=obj -triple i686-pc-linux-gnu %s -o - | llvm-objdump -d - | FileCheck %s
+
+// Test for proper instruction relaxation behavior for the push $imm
+// instruction forms. This is the 32-bit version of the push $imm tests from
+// relax-arith.s and relax-arith2.s.
+
+// CHECK: Disassembly of section push8:
+// CHECK-NEXT: push8:
+// CHECK-NEXT: 0: 66 6a 80 pushw $-128
+// CHECK-NEXT: 3: 66 6a 7f pushw $127
+// CHECK-NEXT: 6: 6a 80 pushl $-128
+// CHECK-NEXT: 8: 6a 7f pushl $127
+ .section push8,"x"
+ pushw $-128
+ pushw $127
+ push $-128
+ push $127
+
+// CHECK: Disassembly of section push32:
+// CHECK-NEXT: push32:
+// CHECK-NEXT: 0: 66 68 00 00 pushw $0
+// CHECK-NEXT: 4: 68 00 00 00 00 pushl $0
+ .section push32,"x"
+ pushw $foo
+ push $foo
diff --git a/test/MC/Mips/macro-la-bad.s b/test/MC/Mips/macro-la-bad.s
new file mode 100644
index 000000000000..89d334030ec6
--- /dev/null
+++ b/test/MC/Mips/macro-la-bad.s
@@ -0,0 +1,17 @@
+# RUN: not llvm-mc %s -arch=mips -mcpu=mips32r2 2>%t1
+# RUN: FileCheck %s < %t1 --check-prefix=32-BIT
+# RUN: not llvm-mc %s -arch=mips64 -mcpu=mips64 -target-abi n32 2>&1 | \
+# RUN: FileCheck %s --check-prefix=64-BIT --check-prefix=N32-ONLY
+# RUN: not llvm-mc %s -arch=mips64 -mcpu=mips64 -target-abi n64 2>&1 | \
+# RUN: FileCheck %s --check-prefix=64-BIT --check-prefix=N64-ONLY
+
+ .text
+ la $5, 0x100000000
+ # 32-BIT: :[[@LINE-1]]:3: error: instruction requires a 32-bit immediate
+ # 64-BIT: :[[@LINE-2]]:3: error: instruction requires a 32-bit immediate
+ la $5, 0x100000000($6)
+ # 32-BIT: :[[@LINE-1]]:3: error: instruction requires a 32-bit immediate
+ # 64-BIT: :[[@LINE-2]]:3: error: instruction requires a 32-bit immediate
+ la $5, symbol
+ # N64-ONLY: :[[@LINE-1]]:3: warning: instruction loads the 32-bit address of a 64-bit symbol
+ # N32-ONLY-NOT: :[[@LINE-2]]:3: warning: instruction loads the 32-bit address of a 64-bit symbol
diff --git a/test/MC/Mips/macro-la.s b/test/MC/Mips/macro-la.s
new file mode 100644
index 000000000000..8c183a7b23e4
--- /dev/null
+++ b/test/MC/Mips/macro-la.s
@@ -0,0 +1,263 @@
+# RUN: llvm-mc %s -triple=mips-unknown-linux -show-encoding -mcpu=mips32r2 | \
+# RUN: FileCheck %s
+# RUN: llvm-mc %s -triple=mips-unknown-linux -show-encoding -mcpu=mips32r6 | \
+# RUN: FileCheck %s
+# RUN: llvm-mc %s -triple=mips64-unknown-linux -show-encoding -mcpu=mips64r2 | \
+# RUN: FileCheck %s
+# RUN: llvm-mc %s -triple=mips64-unknown-linux -show-encoding -mcpu=mips64r6 | \
+# RUN: FileCheck %s
+
+la $5, 0x00000001 # CHECK: addiu $5, $zero, 1 # encoding: [0x24,0x05,0x00,0x01]
+la $5, 0x00000002 # CHECK: addiu $5, $zero, 2 # encoding: [0x24,0x05,0x00,0x02]
+la $5, 0x00004000 # CHECK: addiu $5, $zero, 16384 # encoding: [0x24,0x05,0x40,0x00]
+la $5, 0x00008000 # CHECK: ori $5, $zero, 32768 # encoding: [0x34,0x05,0x80,0x00]
+la $5, 0xffffffff # CHECK: addiu $5, $zero, -1 # encoding: [0x24,0x05,0xff,0xff]
+la $5, 0xfffffffe # CHECK: addiu $5, $zero, -2 # encoding: [0x24,0x05,0xff,0xfe]
+la $5, 0xffffc000 # CHECK: addiu $5, $zero, -16384 # encoding: [0x24,0x05,0xc0,0x00]
+la $5, 0xffff8000 # CHECK: addiu $5, $zero, -32768 # encoding: [0x24,0x05,0x80,0x00]
+
+la $5, 0x00010000 # CHECK: lui $5, 1 # encoding: [0x3c,0x05,0x00,0x01]
+la $5, 0x00020000 # CHECK: lui $5, 2 # encoding: [0x3c,0x05,0x00,0x02]
+la $5, 0x40000000 # CHECK: lui $5, 16384 # encoding: [0x3c,0x05,0x40,0x00]
+la $5, 0x80000000 # CHECK: lui $5, 32768 # encoding: [0x3c,0x05,0x80,0x00]
+la $5, 0xffff0000 # CHECK: lui $5, 65535 # encoding: [0x3c,0x05,0xff,0xff]
+la $5, 0xfffe0000 # CHECK: lui $5, 65534 # encoding: [0x3c,0x05,0xff,0xfe]
+la $5, 0xc0000000 # CHECK: lui $5, 49152 # encoding: [0x3c,0x05,0xc0,0x00]
+la $5, 0x80000000 # CHECK: lui $5, 32768 # encoding: [0x3c,0x05,0x80,0x00]
+
+la $5, 0x00010001 # CHECK: lui $5, 1 # encoding: [0x3c,0x05,0x00,0x01]
+ # CHECK: ori $5, $5, 1 # encoding: [0x34,0xa5,0x00,0x01]
+la $5, 0x00020001 # CHECK: lui $5, 2 # encoding: [0x3c,0x05,0x00,0x02]
+ # CHECK: ori $5, $5, 1 # encoding: [0x34,0xa5,0x00,0x01]
+la $5, 0x40000001 # CHECK: lui $5, 16384 # encoding: [0x3c,0x05,0x40,0x00]
+ # CHECK: ori $5, $5, 1 # encoding: [0x34,0xa5,0x00,0x01]
+la $5, 0x80000001 # CHECK: lui $5, 32768 # encoding: [0x3c,0x05,0x80,0x00]
+ # CHECK: ori $5, $5, 1 # encoding: [0x34,0xa5,0x00,0x01]
+la $5, 0x00010002 # CHECK: lui $5, 1 # encoding: [0x3c,0x05,0x00,0x01]
+ # CHECK: ori $5, $5, 2 # encoding: [0x34,0xa5,0x00,0x02]
+la $5, 0x00020002 # CHECK: lui $5, 2 # encoding: [0x3c,0x05,0x00,0x02]
+ # CHECK: ori $5, $5, 2 # encoding: [0x34,0xa5,0x00,0x02]
+la $5, 0x40000002 # CHECK: lui $5, 16384 # encoding: [0x3c,0x05,0x40,0x00]
+ # CHECK: ori $5, $5, 2 # encoding: [0x34,0xa5,0x00,0x02]
+la $5, 0x80000002 # CHECK: lui $5, 32768 # encoding: [0x3c,0x05,0x80,0x00]
+ # CHECK: ori $5, $5, 2 # encoding: [0x34,0xa5,0x00,0x02]
+la $5, 0x00014000 # CHECK: lui $5, 1 # encoding: [0x3c,0x05,0x00,0x01]
+ # CHECK: ori $5, $5, 16384 # encoding: [0x34,0xa5,0x40,0x00]
+la $5, 0x00024000 # CHECK: lui $5, 2 # encoding: [0x3c,0x05,0x00,0x02]
+ # CHECK: ori $5, $5, 16384 # encoding: [0x34,0xa5,0x40,0x00]
+la $5, 0x40004000 # CHECK: lui $5, 16384 # encoding: [0x3c,0x05,0x40,0x00]
+ # CHECK: ori $5, $5, 16384 # encoding: [0x34,0xa5,0x40,0x00]
+la $5, 0x80004000 # CHECK: lui $5, 32768 # encoding: [0x3c,0x05,0x80,0x00]
+ # CHECK: ori $5, $5, 16384 # encoding: [0x34,0xa5,0x40,0x00]
+la $5, 0x00018000 # CHECK: lui $5, 1 # encoding: [0x3c,0x05,0x00,0x01]
+ # CHECK: ori $5, $5, 32768 # encoding: [0x34,0xa5,0x80,0x00]
+la $5, 0x00028000 # CHECK: lui $5, 2 # encoding: [0x3c,0x05,0x00,0x02]
+ # CHECK: ori $5, $5, 32768 # encoding: [0x34,0xa5,0x80,0x00]
+la $5, 0x40008000 # CHECK: lui $5, 16384 # encoding: [0x3c,0x05,0x40,0x00]
+ # CHECK: ori $5, $5, 32768 # encoding: [0x34,0xa5,0x80,0x00]
+la $5, 0x80008000 # CHECK: lui $5, 32768 # encoding: [0x3c,0x05,0x80,0x00]
+ # CHECK: ori $5, $5, 32768 # encoding: [0x34,0xa5,0x80,0x00]
+la $5, 0xffff4000 # CHECK: lui $5, 65535 # encoding: [0x3c,0x05,0xff,0xff]
+ # CHECK: ori $5, $5, 16384 # encoding: [0x34,0xa5,0x40,0x00]
+la $5, 0xfffe8000 # CHECK: lui $5, 65534 # encoding: [0x3c,0x05,0xff,0xfe]
+ # CHECK: ori $5, $5, 32768 # encoding: [0x34,0xa5,0x80,0x00]
+la $5, 0xc0008000 # CHECK: lui $5, 49152 # encoding: [0x3c,0x05,0xc0,0x00]
+ # CHECK: ori $5, $5, 32768 # encoding: [0x34,0xa5,0x80,0x00]
+la $5, 0x80008000 # CHECK: lui $5, 32768 # encoding: [0x3c,0x05,0x80,0x00]
+ # CHECK: ori $5, $5, 32768 # encoding: [0x34,0xa5,0x80,0x00]
+
+la $5, 0x00000001($6) # CHECK: addiu $5, $6, 1 # encoding: [0x24,0xc5,0x00,0x01]
+la $5, 0x00000002($6) # CHECK: addiu $5, $6, 2 # encoding: [0x24,0xc5,0x00,0x02]
+la $5, 0x00004000($6) # CHECK: addiu $5, $6, 16384 # encoding: [0x24,0xc5,0x40,0x00]
+la $5, 0x00008000($6) # CHECK: ori $5, $zero, 32768 # encoding: [0x34,0x05,0x80,0x00]
+ # CHECK: addu $5, $5, $6 # encoding: [0x00,0xa6,0x28,0x21]
+la $5, 0xffffffff($6) # CHECK: addiu $5, $6, -1 # encoding: [0x24,0xc5,0xff,0xff]
+la $5, 0xfffffffe($6) # CHECK: addiu $5, $6, -2 # encoding: [0x24,0xc5,0xff,0xfe]
+la $5, 0xffffc000($6) # CHECK: addiu $5, $6, -16384 # encoding: [0x24,0xc5,0xc0,0x00]
+la $5, 0xffff8000($6) # CHECK: addiu $5, $6, -32768 # encoding: [0x24,0xc5,0x80,0x00]
+
+la $5, 0x00010000($6) # CHECK: lui $5, 1 # encoding: [0x3c,0x05,0x00,0x01]
+ # CHECK: addu $5, $5, $6 # encoding: [0x00,0xa6,0x28,0x21]
+la $5, 0x00020000($6) # CHECK: lui $5, 2 # encoding: [0x3c,0x05,0x00,0x02]
+ # CHECK: addu $5, $5, $6 # encoding: [0x00,0xa6,0x28,0x21]
+la $5, 0x40000000($6) # CHECK: lui $5, 16384 # encoding: [0x3c,0x05,0x40,0x00]
+ # CHECK: addu $5, $5, $6 # encoding: [0x00,0xa6,0x28,0x21]
+la $5, 0x80000000($6) # CHECK: lui $5, 32768 # encoding: [0x3c,0x05,0x80,0x00]
+ # CHECK: addu $5, $5, $6 # encoding: [0x00,0xa6,0x28,0x21]
+la $5, 0xffff0000($6) # CHECK: lui $5, 65535 # encoding: [0x3c,0x05,0xff,0xff]
+ # CHECK: addu $5, $5, $6 # encoding: [0x00,0xa6,0x28,0x21]
+la $5, 0xfffe0000($6) # CHECK: lui $5, 65534 # encoding: [0x3c,0x05,0xff,0xfe]
+ # CHECK: addu $5, $5, $6 # encoding: [0x00,0xa6,0x28,0x21]
+la $5, 0xc0000000($6) # CHECK: lui $5, 49152 # encoding: [0x3c,0x05,0xc0,0x00]
+ # CHECK: addu $5, $5, $6 # encoding: [0x00,0xa6,0x28,0x21]
+la $5, 0x80000000($6) # CHECK: lui $5, 32768 # encoding: [0x3c,0x05,0x80,0x00]
+ # CHECK: addu $5, $5, $6 # encoding: [0x00,0xa6,0x28,0x21]
+
+la $5, 0x00010001($6) # CHECK: lui $5, 1 # encoding: [0x3c,0x05,0x00,0x01]
+ # CHECK: ori $5, $5, 1 # encoding: [0x34,0xa5,0x00,0x01]
+ # CHECK: addu $5, $5, $6 # encoding: [0x00,0xa6,0x28,0x21]
+la $5, 0x00020001($6) # CHECK: lui $5, 2 # encoding: [0x3c,0x05,0x00,0x02]
+ # CHECK: ori $5, $5, 1 # encoding: [0x34,0xa5,0x00,0x01]
+ # CHECK: addu $5, $5, $6 # encoding: [0x00,0xa6,0x28,0x21]
+la $5, 0x40000001($6) # CHECK: lui $5, 16384 # encoding: [0x3c,0x05,0x40,0x00]
+ # CHECK: ori $5, $5, 1 # encoding: [0x34,0xa5,0x00,0x01]
+ # CHECK: addu $5, $5, $6 # encoding: [0x00,0xa6,0x28,0x21]
+la $5, 0x80000001($6) # CHECK: lui $5, 32768 # encoding: [0x3c,0x05,0x80,0x00]
+ # CHECK: ori $5, $5, 1 # encoding: [0x34,0xa5,0x00,0x01]
+ # CHECK: addu $5, $5, $6 # encoding: [0x00,0xa6,0x28,0x21]
+la $5, 0x00010002($6) # CHECK: lui $5, 1 # encoding: [0x3c,0x05,0x00,0x01]
+ # CHECK: ori $5, $5, 2 # encoding: [0x34,0xa5,0x00,0x02]
+ # CHECK: addu $5, $5, $6 # encoding: [0x00,0xa6,0x28,0x21]
+la $5, 0x00020002($6) # CHECK: lui $5, 2 # encoding: [0x3c,0x05,0x00,0x02]
+ # CHECK: ori $5, $5, 2 # encoding: [0x34,0xa5,0x00,0x02]
+ # CHECK: addu $5, $5, $6 # encoding: [0x00,0xa6,0x28,0x21]
+la $5, 0x40000002($6) # CHECK: lui $5, 16384 # encoding: [0x3c,0x05,0x40,0x00]
+ # CHECK: ori $5, $5, 2 # encoding: [0x34,0xa5,0x00,0x02]
+ # CHECK: addu $5, $5, $6 # encoding: [0x00,0xa6,0x28,0x21]
+la $5, 0x80000002($6) # CHECK: lui $5, 32768 # encoding: [0x3c,0x05,0x80,0x00]
+ # CHECK: ori $5, $5, 2 # encoding: [0x34,0xa5,0x00,0x02]
+ # CHECK: addu $5, $5, $6 # encoding: [0x00,0xa6,0x28,0x21]
+la $5, 0x00014000($6) # CHECK: lui $5, 1 # encoding: [0x3c,0x05,0x00,0x01]
+ # CHECK: ori $5, $5, 16384 # encoding: [0x34,0xa5,0x40,0x00]
+ # CHECK: addu $5, $5, $6 # encoding: [0x00,0xa6,0x28,0x21]
+la $5, 0x00024000($6) # CHECK: lui $5, 2 # encoding: [0x3c,0x05,0x00,0x02]
+ # CHECK: ori $5, $5, 16384 # encoding: [0x34,0xa5,0x40,0x00]
+ # CHECK: addu $5, $5, $6 # encoding: [0x00,0xa6,0x28,0x21]
+la $5, 0x40004000($6) # CHECK: lui $5, 16384 # encoding: [0x3c,0x05,0x40,0x00]
+ # CHECK: ori $5, $5, 16384 # encoding: [0x34,0xa5,0x40,0x00]
+ # CHECK: addu $5, $5, $6 # encoding: [0x00,0xa6,0x28,0x21]
+la $5, 0x80004000($6) # CHECK: lui $5, 32768 # encoding: [0x3c,0x05,0x80,0x00]
+ # CHECK: ori $5, $5, 16384 # encoding: [0x34,0xa5,0x40,0x00]
+ # CHECK: addu $5, $5, $6 # encoding: [0x00,0xa6,0x28,0x21]
+la $5, 0x00018000($6) # CHECK: lui $5, 1 # encoding: [0x3c,0x05,0x00,0x01]
+ # CHECK: ori $5, $5, 32768 # encoding: [0x34,0xa5,0x80,0x00]
+ # CHECK: addu $5, $5, $6 # encoding: [0x00,0xa6,0x28,0x21]
+la $5, 0x00028000($6) # CHECK: lui $5, 2 # encoding: [0x3c,0x05,0x00,0x02]
+ # CHECK: ori $5, $5, 32768 # encoding: [0x34,0xa5,0x80,0x00]
+ # CHECK: addu $5, $5, $6 # encoding: [0x00,0xa6,0x28,0x21]
+la $5, 0x40008000($6) # CHECK: lui $5, 16384 # encoding: [0x3c,0x05,0x40,0x00]
+ # CHECK: ori $5, $5, 32768 # encoding: [0x34,0xa5,0x80,0x00]
+ # CHECK: addu $5, $5, $6 # encoding: [0x00,0xa6,0x28,0x21]
+la $5, 0x80008000($6) # CHECK: lui $5, 32768 # encoding: [0x3c,0x05,0x80,0x00]
+ # CHECK: ori $5, $5, 32768 # encoding: [0x34,0xa5,0x80,0x00]
+ # CHECK: addu $5, $5, $6 # encoding: [0x00,0xa6,0x28,0x21]
+la $5, 0xffff4000($6) # CHECK: lui $5, 65535 # encoding: [0x3c,0x05,0xff,0xff]
+ # CHECK: ori $5, $5, 16384 # encoding: [0x34,0xa5,0x40,0x00]
+ # CHECK: addu $5, $5, $6 # encoding: [0x00,0xa6,0x28,0x21]
+la $5, 0xfffe8000($6) # CHECK: lui $5, 65534 # encoding: [0x3c,0x05,0xff,0xfe]
+ # CHECK: ori $5, $5, 32768 # encoding: [0x34,0xa5,0x80,0x00]
+ # CHECK: addu $5, $5, $6 # encoding: [0x00,0xa6,0x28,0x21]
+la $5, 0xc0008000($6) # CHECK: lui $5, 49152 # encoding: [0x3c,0x05,0xc0,0x00]
+ # CHECK: ori $5, $5, 32768 # encoding: [0x34,0xa5,0x80,0x00]
+ # CHECK: addu $5, $5, $6 # encoding: [0x00,0xa6,0x28,0x21]
+la $5, 0x80008000($6) # CHECK: lui $5, 32768 # encoding: [0x3c,0x05,0x80,0x00]
+ # CHECK: ori $5, $5, 32768 # encoding: [0x34,0xa5,0x80,0x00]
+ # CHECK: addu $5, $5, $6 # encoding: [0x00,0xa6,0x28,0x21]
+
+la $6, 0x00000001($6) # CHECK: addiu $6, $6, 1 # encoding: [0x24,0xc6,0x00,0x01]
+la $6, 0x00000002($6) # CHECK: addiu $6, $6, 2 # encoding: [0x24,0xc6,0x00,0x02]
+la $6, 0x00004000($6) # CHECK: addiu $6, $6, 16384 # encoding: [0x24,0xc6,0x40,0x00]
+la $6, 0x00008000($6) # CHECK: ori $1, $zero, 32768 # encoding: [0x34,0x01,0x80,0x00]
+ # CHECK: addu $6, $1, $6 # encoding: [0x00,0x26,0x30,0x21]
+la $6, 0xffffffff($6) # CHECK: addiu $6, $6, -1 # encoding: [0x24,0xc6,0xff,0xff]
+la $6, 0xfffffffe($6) # CHECK: addiu $6, $6, -2 # encoding: [0x24,0xc6,0xff,0xfe]
+la $6, 0xffffc000($6) # CHECK: addiu $6, $6, -16384 # encoding: [0x24,0xc6,0xc0,0x00]
+la $6, 0xffff8000($6) # CHECK: addiu $6, $6, -32768 # encoding: [0x24,0xc6,0x80,0x00]
+
+la $6, 0x00010000($6) # CHECK: lui $1, 1 # encoding: [0x3c,0x01,0x00,0x01]
+ # CHECK: addu $6, $1, $6 # encoding: [0x00,0x26,0x30,0x21]
+la $6, 0x00020000($6) # CHECK: lui $1, 2 # encoding: [0x3c,0x01,0x00,0x02]
+ # CHECK: addu $6, $1, $6 # encoding: [0x00,0x26,0x30,0x21]
+la $6, 0x40000000($6) # CHECK: lui $1, 16384 # encoding: [0x3c,0x01,0x40,0x00]
+ # CHECK: addu $6, $1, $6 # encoding: [0x00,0x26,0x30,0x21]
+la $6, 0x80000000($6) # CHECK: lui $1, 32768 # encoding: [0x3c,0x01,0x80,0x00]
+ # CHECK: addu $6, $1, $6 # encoding: [0x00,0x26,0x30,0x21]
+la $6, 0xffff0000($6) # CHECK: lui $1, 65535 # encoding: [0x3c,0x01,0xff,0xff]
+ # CHECK: addu $6, $1, $6 # encoding: [0x00,0x26,0x30,0x21]
+la $6, 0xfffe0000($6) # CHECK: lui $1, 65534 # encoding: [0x3c,0x01,0xff,0xfe]
+ # CHECK: addu $6, $1, $6 # encoding: [0x00,0x26,0x30,0x21]
+la $6, 0xc0000000($6) # CHECK: lui $1, 49152 # encoding: [0x3c,0x01,0xc0,0x00]
+ # CHECK: addu $6, $1, $6 # encoding: [0x00,0x26,0x30,0x21]
+la $6, 0x80000000($6) # CHECK: lui $1, 32768 # encoding: [0x3c,0x01,0x80,0x00]
+ # CHECK: addu $6, $1, $6 # encoding: [0x00,0x26,0x30,0x21]
+
+la $6, 0x00010001($6) # CHECK: lui $1, 1 # encoding: [0x3c,0x01,0x00,0x01]
+ # CHECK: ori $1, $1, 1 # encoding: [0x34,0x21,0x00,0x01]
+ # CHECK: addu $6, $1, $6 # encoding: [0x00,0x26,0x30,0x21]
+la $6, 0x00020001($6) # CHECK: lui $1, 2 # encoding: [0x3c,0x01,0x00,0x02]
+ # CHECK: ori $1, $1, 1 # encoding: [0x34,0x21,0x00,0x01]
+ # CHECK: addu $6, $1, $6 # encoding: [0x00,0x26,0x30,0x21]
+la $6, 0x40000001($6) # CHECK: lui $1, 16384 # encoding: [0x3c,0x01,0x40,0x00]
+ # CHECK: ori $1, $1, 1 # encoding: [0x34,0x21,0x00,0x01]
+ # CHECK: addu $6, $1, $6 # encoding: [0x00,0x26,0x30,0x21]
+la $6, 0x80000001($6) # CHECK: lui $1, 32768 # encoding: [0x3c,0x01,0x80,0x00]
+ # CHECK: ori $1, $1, 1 # encoding: [0x34,0x21,0x00,0x01]
+ # CHECK: addu $6, $1, $6 # encoding: [0x00,0x26,0x30,0x21]
+la $6, 0x00010002($6) # CHECK: lui $1, 1 # encoding: [0x3c,0x01,0x00,0x01]
+ # CHECK: ori $1, $1, 2 # encoding: [0x34,0x21,0x00,0x02]
+ # CHECK: addu $6, $1, $6 # encoding: [0x00,0x26,0x30,0x21]
+la $6, 0x00020002($6) # CHECK: lui $1, 2 # encoding: [0x3c,0x01,0x00,0x02]
+ # CHECK: ori $1, $1, 2 # encoding: [0x34,0x21,0x00,0x02]
+ # CHECK: addu $6, $1, $6 # encoding: [0x00,0x26,0x30,0x21]
+la $6, 0x40000002($6) # CHECK: lui $1, 16384 # encoding: [0x3c,0x01,0x40,0x00]
+ # CHECK: ori $1, $1, 2 # encoding: [0x34,0x21,0x00,0x02]
+ # CHECK: addu $6, $1, $6 # encoding: [0x00,0x26,0x30,0x21]
+la $6, 0x80000002($6) # CHECK: lui $1, 32768 # encoding: [0x3c,0x01,0x80,0x00]
+ # CHECK: ori $1, $1, 2 # encoding: [0x34,0x21,0x00,0x02]
+ # CHECK: addu $6, $1, $6 # encoding: [0x00,0x26,0x30,0x21]
+la $6, 0x00014000($6) # CHECK: lui $1, 1 # encoding: [0x3c,0x01,0x00,0x01]
+ # CHECK: ori $1, $1, 16384 # encoding: [0x34,0x21,0x40,0x00]
+ # CHECK: addu $6, $1, $6 # encoding: [0x00,0x26,0x30,0x21]
+la $6, 0x00024000($6) # CHECK: lui $1, 2 # encoding: [0x3c,0x01,0x00,0x02]
+ # CHECK: ori $1, $1, 16384 # encoding: [0x34,0x21,0x40,0x00]
+ # CHECK: addu $6, $1, $6 # encoding: [0x00,0x26,0x30,0x21]
+la $6, 0x40004000($6) # CHECK: lui $1, 16384 # encoding: [0x3c,0x01,0x40,0x00]
+ # CHECK: ori $1, $1, 16384 # encoding: [0x34,0x21,0x40,0x00]
+ # CHECK: addu $6, $1, $6 # encoding: [0x00,0x26,0x30,0x21]
+la $6, 0x80004000($6) # CHECK: lui $1, 32768 # encoding: [0x3c,0x01,0x80,0x00]
+ # CHECK: ori $1, $1, 16384 # encoding: [0x34,0x21,0x40,0x00]
+ # CHECK: addu $6, $1, $6 # encoding: [0x00,0x26,0x30,0x21]
+la $6, 0x00018000($6) # CHECK: lui $1, 1 # encoding: [0x3c,0x01,0x00,0x01]
+ # CHECK: ori $1, $1, 32768 # encoding: [0x34,0x21,0x80,0x00]
+ # CHECK: addu $6, $1, $6 # encoding: [0x00,0x26,0x30,0x21]
+la $6, 0x00028000($6) # CHECK: lui $1, 2 # encoding: [0x3c,0x01,0x00,0x02]
+ # CHECK: ori $1, $1, 32768 # encoding: [0x34,0x21,0x80,0x00]
+ # CHECK: addu $6, $1, $6 # encoding: [0x00,0x26,0x30,0x21]
+la $6, 0x40008000($6) # CHECK: lui $1, 16384 # encoding: [0x3c,0x01,0x40,0x00]
+ # CHECK: ori $1, $1, 32768 # encoding: [0x34,0x21,0x80,0x00]
+ # CHECK: addu $6, $1, $6 # encoding: [0x00,0x26,0x30,0x21]
+la $6, 0x80008000($6) # CHECK: lui $1, 32768 # encoding: [0x3c,0x01,0x80,0x00]
+ # CHECK: ori $1, $1, 32768 # encoding: [0x34,0x21,0x80,0x00]
+ # CHECK: addu $6, $1, $6 # encoding: [0x00,0x26,0x30,0x21]
+la $6, 0xffff4000($6) # CHECK: lui $1, 65535 # encoding: [0x3c,0x01,0xff,0xff]
+ # CHECK: ori $1, $1, 16384 # encoding: [0x34,0x21,0x40,0x00]
+ # CHECK: addu $6, $1, $6 # encoding: [0x00,0x26,0x30,0x21]
+la $6, 0xfffe8000($6) # CHECK: lui $1, 65534 # encoding: [0x3c,0x01,0xff,0xfe]
+ # CHECK: ori $1, $1, 32768 # encoding: [0x34,0x21,0x80,0x00]
+ # CHECK: addu $6, $1, $6 # encoding: [0x00,0x26,0x30,0x21]
+la $6, 0xc0008000($6) # CHECK: lui $1, 49152 # encoding: [0x3c,0x01,0xc0,0x00]
+ # CHECK: ori $1, $1, 32768 # encoding: [0x34,0x21,0x80,0x00]
+ # CHECK: addu $6, $1, $6 # encoding: [0x00,0x26,0x30,0x21]
+la $6, 0x80008000($6) # CHECK: lui $1, 32768 # encoding: [0x3c,0x01,0x80,0x00]
+ # CHECK: ori $1, $1, 32768 # encoding: [0x34,0x21,0x80,0x00]
+ # CHECK: addu $6, $1, $6 # encoding: [0x00,0x26,0x30,0x21]
+
+la $5, symbol # CHECK: lui $5, %hi(symbol) # encoding: [0x3c,0x05,A,A]
+ # CHECK: # fixup A - offset: 0, value: symbol@ABS_HI, kind: fixup_Mips_HI16
+ # CHECK: addiu $5, $5, %lo(symbol) # encoding: [0x24,0xa5,A,A]
+ # CHECK: # fixup A - offset: 0, value: symbol@ABS_LO, kind: fixup_Mips_LO16
+la $5, symbol($6) # CHECK: lui $5, %hi(symbol) # encoding: [0x3c,0x05,A,A]
+ # CHECK: # fixup A - offset: 0, value: symbol@ABS_HI, kind: fixup_Mips_HI16
+ # CHECK: addiu $5, $5, %lo(symbol) # encoding: [0x24,0xa5,A,A]
+ # CHECK: # fixup A - offset: 0, value: symbol@ABS_LO, kind: fixup_Mips_LO16
+ # CHECK: addu $5, $5, $6 # encoding: [0x00,0xa6,0x28,0x21]
+la $6, symbol($6) # CHECK: lui $1, %hi(symbol) # encoding: [0x3c,0x01,A,A]
+ # CHECK: # fixup A - offset: 0, value: symbol@ABS_HI, kind: fixup_Mips_HI16
+ # CHECK: addiu $1, $1, %lo(symbol) # encoding: [0x24,0x21,A,A]
+ # CHECK: # fixup A - offset: 0, value: symbol@ABS_LO, kind: fixup_Mips_LO16
+ # CHECK: addu $6, $1, $6 # encoding: [0x00,0x26,0x30,0x21]
+la $5, 1f # CHECK: lui $5, %hi($tmp0) # encoding: [0x3c,0x05,A,A]
+ # CHECK: # fixup A - offset: 0, value: ($tmp0)@ABS_HI, kind: fixup_Mips_HI16
+ # CHECK: addiu $5, $5, %lo($tmp0) # encoding: [0x24,0xa5,A,A]
+ # CHECK: # fixup A - offset: 0, value: ($tmp0)@ABS_LO, kind: fixup_Mips_LO16
+1:
diff --git a/test/MC/Mips/macro-li-bad.s b/test/MC/Mips/macro-li-bad.s
new file mode 100644
index 000000000000..8fe622066e9c
--- /dev/null
+++ b/test/MC/Mips/macro-li-bad.s
@@ -0,0 +1,11 @@
+# RUN: not llvm-mc %s -arch=mips -mcpu=mips32r2 2>%t1
+# RUN: FileCheck %s < %t1 --check-prefix=32-BIT
+# RUN: not llvm-mc %s -arch=mips64 -mcpu=mips64 -target-abi n32 2>&1 | \
+# RUN: FileCheck %s --check-prefix=64-BIT
+# RUN: not llvm-mc %s -arch=mips64 -mcpu=mips64 -target-abi n64 2>&1 | \
+# RUN: FileCheck %s --check-prefix=64-BIT
+
+ .text
+ li $5, 0x100000000
+ # 32-BIT: :[[@LINE-1]]:3: error: instruction requires a 32-bit immediate
+ # 64-BIT: :[[@LINE-2]]:3: error: instruction requires a 32-bit immediate
diff --git a/test/MC/Mips/macro-li.s b/test/MC/Mips/macro-li.s
new file mode 100644
index 000000000000..88e013a854e2
--- /dev/null
+++ b/test/MC/Mips/macro-li.s
@@ -0,0 +1,67 @@
+# RUN: llvm-mc %s -triple=mips-unknown-linux -show-encoding -mcpu=mips32r2 | \
+# RUN: FileCheck %s
+# RUN: llvm-mc %s -triple=mips-unknown-linux -show-encoding -mcpu=mips32r6 | \
+# RUN: FileCheck %s
+# RUN: llvm-mc %s -triple=mips64-unknown-linux -show-encoding -mcpu=mips64r2 | \
+# RUN: FileCheck %s
+# RUN: llvm-mc %s -triple=mips64-unknown-linux -show-encoding -mcpu=mips64r6 | \
+# RUN: FileCheck %s
+
+li $5, 0x00000001 # CHECK: addiu $5, $zero, 1 # encoding: [0x24,0x05,0x00,0x01]
+li $5, 0x00000002 # CHECK: addiu $5, $zero, 2 # encoding: [0x24,0x05,0x00,0x02]
+li $5, 0x00004000 # CHECK: addiu $5, $zero, 16384 # encoding: [0x24,0x05,0x40,0x00]
+li $5, 0x00008000 # CHECK: ori $5, $zero, 32768 # encoding: [0x34,0x05,0x80,0x00]
+li $5, 0xffffffff # CHECK: addiu $5, $zero, -1 # encoding: [0x24,0x05,0xff,0xff]
+li $5, 0xfffffffe # CHECK: addiu $5, $zero, -2 # encoding: [0x24,0x05,0xff,0xfe]
+li $5, 0xffffc000 # CHECK: addiu $5, $zero, -16384 # encoding: [0x24,0x05,0xc0,0x00]
+li $5, 0xffff8000 # CHECK: addiu $5, $zero, -32768 # encoding: [0x24,0x05,0x80,0x00]
+
+li $5, 0x00010000 # CHECK: lui $5, 1 # encoding: [0x3c,0x05,0x00,0x01]
+li $5, 0x00020000 # CHECK: lui $5, 2 # encoding: [0x3c,0x05,0x00,0x02]
+li $5, 0x40000000 # CHECK: lui $5, 16384 # encoding: [0x3c,0x05,0x40,0x00]
+li $5, 0x80000000 # CHECK: lui $5, 32768 # encoding: [0x3c,0x05,0x80,0x00]
+li $5, 0xffff0000 # CHECK: lui $5, 65535 # encoding: [0x3c,0x05,0xff,0xff]
+li $5, 0xfffe0000 # CHECK: lui $5, 65534 # encoding: [0x3c,0x05,0xff,0xfe]
+li $5, 0xc0000000 # CHECK: lui $5, 49152 # encoding: [0x3c,0x05,0xc0,0x00]
+li $5, 0x80000000 # CHECK: lui $5, 32768 # encoding: [0x3c,0x05,0x80,0x00]
+
+li $5, 0x00010001 # CHECK: lui $5, 1 # encoding: [0x3c,0x05,0x00,0x01]
+ # CHECK: ori $5, $5, 1 # encoding: [0x34,0xa5,0x00,0x01]
+li $5, 0x00020001 # CHECK: lui $5, 2 # encoding: [0x3c,0x05,0x00,0x02]
+ # CHECK: ori $5, $5, 1 # encoding: [0x34,0xa5,0x00,0x01]
+li $5, 0x40000001 # CHECK: lui $5, 16384 # encoding: [0x3c,0x05,0x40,0x00]
+ # CHECK: ori $5, $5, 1 # encoding: [0x34,0xa5,0x00,0x01]
+li $5, 0x80000001 # CHECK: lui $5, 32768 # encoding: [0x3c,0x05,0x80,0x00]
+ # CHECK: ori $5, $5, 1 # encoding: [0x34,0xa5,0x00,0x01]
+li $5, 0x00010002 # CHECK: lui $5, 1 # encoding: [0x3c,0x05,0x00,0x01]
+ # CHECK: ori $5, $5, 2 # encoding: [0x34,0xa5,0x00,0x02]
+li $5, 0x00020002 # CHECK: lui $5, 2 # encoding: [0x3c,0x05,0x00,0x02]
+ # CHECK: ori $5, $5, 2 # encoding: [0x34,0xa5,0x00,0x02]
+li $5, 0x40000002 # CHECK: lui $5, 16384 # encoding: [0x3c,0x05,0x40,0x00]
+ # CHECK: ori $5, $5, 2 # encoding: [0x34,0xa5,0x00,0x02]
+li $5, 0x80000002 # CHECK: lui $5, 32768 # encoding: [0x3c,0x05,0x80,0x00]
+ # CHECK: ori $5, $5, 2 # encoding: [0x34,0xa5,0x00,0x02]
+li $5, 0x00014000 # CHECK: lui $5, 1 # encoding: [0x3c,0x05,0x00,0x01]
+ # CHECK: ori $5, $5, 16384 # encoding: [0x34,0xa5,0x40,0x00]
+li $5, 0x00024000 # CHECK: lui $5, 2 # encoding: [0x3c,0x05,0x00,0x02]
+ # CHECK: ori $5, $5, 16384 # encoding: [0x34,0xa5,0x40,0x00]
+li $5, 0x40004000 # CHECK: lui $5, 16384 # encoding: [0x3c,0x05,0x40,0x00]
+ # CHECK: ori $5, $5, 16384 # encoding: [0x34,0xa5,0x40,0x00]
+li $5, 0x80004000 # CHECK: lui $5, 32768 # encoding: [0x3c,0x05,0x80,0x00]
+ # CHECK: ori $5, $5, 16384 # encoding: [0x34,0xa5,0x40,0x00]
+li $5, 0x00018000 # CHECK: lui $5, 1 # encoding: [0x3c,0x05,0x00,0x01]
+ # CHECK: ori $5, $5, 32768 # encoding: [0x34,0xa5,0x80,0x00]
+li $5, 0x00028000 # CHECK: lui $5, 2 # encoding: [0x3c,0x05,0x00,0x02]
+ # CHECK: ori $5, $5, 32768 # encoding: [0x34,0xa5,0x80,0x00]
+li $5, 0x40008000 # CHECK: lui $5, 16384 # encoding: [0x3c,0x05,0x40,0x00]
+ # CHECK: ori $5, $5, 32768 # encoding: [0x34,0xa5,0x80,0x00]
+li $5, 0x80008000 # CHECK: lui $5, 32768 # encoding: [0x3c,0x05,0x80,0x00]
+ # CHECK: ori $5, $5, 32768 # encoding: [0x34,0xa5,0x80,0x00]
+li $5, 0xffff4000 # CHECK: lui $5, 65535 # encoding: [0x3c,0x05,0xff,0xff]
+ # CHECK: ori $5, $5, 16384 # encoding: [0x34,0xa5,0x40,0x00]
+li $5, 0xfffe8000 # CHECK: lui $5, 65534 # encoding: [0x3c,0x05,0xff,0xfe]
+ # CHECK: ori $5, $5, 32768 # encoding: [0x34,0xa5,0x80,0x00]
+li $5, 0xc0008000 # CHECK: lui $5, 49152 # encoding: [0x3c,0x05,0xc0,0x00]
+ # CHECK: ori $5, $5, 32768 # encoding: [0x34,0xa5,0x80,0x00]
+li $5, 0x80008000 # CHECK: lui $5, 32768 # encoding: [0x3c,0x05,0x80,0x00]
+ # CHECK: ori $5, $5, 32768 # encoding: [0x34,0xa5,0x80,0x00]
diff --git a/test/MC/Mips/micromips-expansions.s b/test/MC/Mips/micromips-expansions.s
index 5024850abc4d..48c73f34e7e8 100644
--- a/test/MC/Mips/micromips-expansions.s
+++ b/test/MC/Mips/micromips-expansions.s
@@ -5,14 +5,14 @@
#------------------------------------------------------------------------------
# Load immediate instructions
#------------------------------------------------------------------------------
-# CHECK: ori $5, $zero, 123 # encoding: [0xa0,0x50,0x7b,0x00]
+# CHECK: addiu $5, $zero, 123 # encoding: [0xa0,0x30,0x7b,0x00]
# CHECK: addiu $6, $zero, -2345 # encoding: [0xc0,0x30,0xd7,0xf6]
# CHECK: lui $7, 1 # encoding: [0xa7,0x41,0x01,0x00]
# CHECK: ori $7, $7, 2 # encoding: [0xe7,0x50,0x02,0x00]
-# CHECK: ori $4, $zero, 20 # encoding: [0x80,0x50,0x14,0x00]
+# CHECK: addiu $4, $zero, 20 # encoding: [0x80,0x30,0x14,0x00]
# CHECK: lui $7, 1 # encoding: [0xa7,0x41,0x01,0x00]
# CHECK: ori $7, $7, 2 # encoding: [0xe7,0x50,0x02,0x00]
-# CHECK: ori $4, $5, 20 # encoding: [0x85,0x50,0x14,0x00]
+# CHECK: addiu $4, $5, 20 # encoding: [0x85,0x30,0x14,0x00]
# CHECK: lui $7, 1 # encoding: [0xa7,0x41,0x01,0x00]
# CHECK: ori $7, $7, 2 # encoding: [0xe7,0x50,0x02,0x00]
# CHECK: addu $7, $7, $8 # encoding: [0x07,0x01,0x50,0x39]
diff --git a/test/MC/Mips/mips-expansions-bad.s b/test/MC/Mips/mips-expansions-bad.s
index 416cb5f3ba69..cd74f7d4aa88 100644
--- a/test/MC/Mips/mips-expansions-bad.s
+++ b/test/MC/Mips/mips-expansions-bad.s
@@ -6,18 +6,6 @@
# RUN: FileCheck %s --check-prefix=64-BIT --check-prefix=N64-ONLY
.text
- li $5, 0x100000000
- # 32-BIT: :[[@LINE-1]]:3: error: instruction requires a 32-bit immediate
- # 64-BIT: :[[@LINE-2]]:3: error: instruction requires a 32-bit immediate
- la $5, 0x100000000
- # 32-BIT: :[[@LINE-1]]:3: error: instruction requires a 32-bit immediate
- # 64-BIT: :[[@LINE-2]]:3: error: instruction requires a 32-bit immediate
- la $5, 0x100000000($6)
- # 32-BIT: :[[@LINE-1]]:3: error: instruction requires a 32-bit immediate
- # 64-BIT: :[[@LINE-2]]:3: error: instruction requires a 32-bit immediate
- la $5, symbol
- # N64-ONLY: :[[@LINE-1]]:3: warning: instruction loads the 32-bit address of a 64-bit symbol
- # N32-ONLY-NOT: :[[@LINE-2]]:3: warning: instruction loads the 32-bit address of a 64-bit symbol
dli $5, 1
# 32-BIT: :[[@LINE-1]]:3: error: instruction requires a 64-bit architecture
bne $2, 0x100010001, 1332
diff --git a/test/MC/Mips/mips-expansions.s b/test/MC/Mips/mips-expansions.s
index 55de6d046349..93c6b7cd75a8 100644
--- a/test/MC/Mips/mips-expansions.s
+++ b/test/MC/Mips/mips-expansions.s
@@ -5,64 +5,13 @@
# Check that the IAS expands macro instructions in the same way as GAS.
-# Load immediate, done by MipsAsmParser::expandLoadImm():
- li $5, 123
-# CHECK-LE: ori $5, $zero, 123 # encoding: [0x7b,0x00,0x05,0x34]
- li $6, -2345
-# CHECK-LE: addiu $6, $zero, -2345 # encoding: [0xd7,0xf6,0x06,0x24]
- li $7, 65538
-# CHECK-LE: lui $7, 1 # encoding: [0x01,0x00,0x07,0x3c]
-# CHECK-LE: ori $7, $7, 2 # encoding: [0x02,0x00,0xe7,0x34]
- li $8, ~7
-# CHECK-LE: addiu $8, $zero, -8 # encoding: [0xf8,0xff,0x08,0x24]
- li $9, 0x10000
-# CHECK-LE: lui $9, 1 # encoding: [0x01,0x00,0x09,0x3c]
-# CHECK-LE-NOT: ori $9, $9, 0 # encoding: [0x00,0x00,0x29,0x35]
- li $10, ~(0x101010)
-# CHECK-LE: lui $10, 65519 # encoding: [0xef,0xff,0x0a,0x3c]
-# CHECK-LE: ori $10, $10, 61423 # encoding: [0xef,0xef,0x4a,0x35]
-
# Load address, done by MipsAsmParser::expandLoadAddressReg()
# and MipsAsmParser::expandLoadAddressImm():
- la $4, 20
-# CHECK-LE: ori $4, $zero, 20 # encoding: [0x14,0x00,0x04,0x34]
- la $7, 65538
-# CHECK-LE: lui $7, 1 # encoding: [0x01,0x00,0x07,0x3c]
-# CHECK-LE: ori $7, $7, 2 # encoding: [0x02,0x00,0xe7,0x34]
- la $4, 20($5)
-# CHECK-LE: ori $4, $5, 20 # encoding: [0x14,0x00,0xa4,0x34]
- la $7, 65538($8)
-# CHECK-LE: lui $7, 1 # encoding: [0x01,0x00,0x07,0x3c]
-# CHECK-LE: ori $7, $7, 2 # encoding: [0x02,0x00,0xe7,0x34]
-# CHECK-LE: addu $7, $7, $8 # encoding: [0x21,0x38,0xe8,0x00]
la $8, 1f
# CHECK-LE: lui $8, %hi($tmp0) # encoding: [A,A,0x08,0x3c]
# CHECK-LE: # fixup A - offset: 0, value: ($tmp0)@ABS_HI, kind: fixup_Mips_HI16
-# CHECK-LE: ori $8, $8, %lo($tmp0) # encoding: [A,A,0x08,0x35]
+# CHECK-LE: addiu $8, $8, %lo($tmp0) # encoding: [A,A,0x08,0x25]
# CHECK-LE: # fixup A - offset: 0, value: ($tmp0)@ABS_LO, kind: fixup_Mips_LO16
- la $8, symbol
-# CHECK-LE: lui $8, %hi(symbol) # encoding: [A,A,0x08,0x3c]
-# CHECK-LE: # fixup A - offset: 0, value: symbol@ABS_HI, kind: fixup_Mips_HI16
-# CHECK-LE: ori $8, $8, %lo(symbol) # encoding: [A,A,0x08,0x35]
-# CHECK-LE: # fixup A - offset: 0, value: symbol@ABS_LO, kind: fixup_Mips_LO16
- la $8, symbol($9)
-# CHECK-LE: lui $8, %hi(symbol) # encoding: [A,A,0x08,0x3c]
-# CHECK-LE: # fixup A - offset: 0, value: symbol@ABS_HI, kind: fixup_Mips_HI16
-# CHECK-LE: ori $8, $8, %lo(symbol) # encoding: [A,A,0x08,0x35]
-# CHECK-LE: # fixup A - offset: 0, value: symbol@ABS_LO, kind: fixup_Mips_LO16
-# CHECK-LE: addu $8, $8, $9 # encoding: [0x21,0x40,0x09,0x01]
- la $8, symbol($8)
-# CHECK-LE: lui $1, %hi(symbol) # encoding: [A,A,0x01,0x3c]
-# CHECK-LE: # fixup A - offset: 0, value: symbol@ABS_HI, kind: fixup_Mips_HI16
-# CHECK-LE: ori $1, $1, %lo(symbol) # encoding: [A,A,0x21,0x34]
-# CHECK-LE: # fixup A - offset: 0, value: symbol@ABS_LO, kind: fixup_Mips_LO16
-# CHECK-LE: addu $8, $1, $8 # encoding: [0x21,0x40,0x28,0x00]
- la $8, 20($8)
-# CHECK-LE: ori $8, $8, 20 # encoding: [0x14,0x00,0x08,0x35]
- la $8, 65538($8)
-# CHECK-LE: lui $1, 1 # encoding: [0x01,0x00,0x01,0x3c]
-# CHECK-LE: ori $1, $1, 2 # encoding: [0x02,0x00,0x21,0x34]
-# CHECK-LE: addu $8, $1, $8 # encoding: [0x21,0x40,0x28,0x00]
# LW/SW and LDC1/SDC1 of symbol address, done by MipsAsmParser::expandMemInst():
.set noat
@@ -126,7 +75,7 @@
# CHECK-LE: nop # encoding: [0x00,0x00,0x00,0x00]
bne $2, 123, 1332
-# CHECK-LE: ori $1, $zero, 123 # encoding: [0x7b,0x00,0x01,0x34]
+# CHECK-LE: addiu $1, $zero, 123 # encoding: [0x7b,0x00,0x01,0x24]
# CHECK-LE: bne $2, $1, 1332 # encoding: [0x4d,0x01,0x41,0x14]
# CHECK-LE: nop # encoding: [0x00,0x00,0x00,0x00]
@@ -157,7 +106,7 @@
# CHECK-LE: nop # encoding: [0x00,0x00,0x00,0x00]
beq $2, 123, 1332
-# CHECK-LE: ori $1, $zero, 123 # encoding: [0x7b,0x00,0x01,0x34]
+# CHECK-LE: addiu $1, $zero, 123 # encoding: [0x7b,0x00,0x01,0x24]
# CHECK-LE: beq $2, $1, 1332 # encoding: [0x4d,0x01,0x41,0x10]
# CHECK-LE: nop # encoding: [0x00,0x00,0x00,0x00]
@@ -266,16 +215,16 @@
# CHECK-LE: or $8, $8, $1 # encoding: [0x25,0x40,0x01,0x01]
ulhu $8, 32767
-# CHECK-BE: ori $1, $zero, 32767 # encoding: [0x34,0x01,0x7f,0xff]
-# CHECK-BE: lbu $8, 0($1) # encoding: [0x90,0x28,0x00,0x00]
-# CHECK-BE: lbu $1, 1($1) # encoding: [0x90,0x21,0x00,0x01]
-# CHECK-BE: sll $8, $8, 8 # encoding: [0x00,0x08,0x42,0x00]
-# CHECK-BE: or $8, $8, $1 # encoding: [0x01,0x01,0x40,0x25]
-# CHECK-LE: ori $1, $zero, 32767 # encoding: [0xff,0x7f,0x01,0x34]
-# CHECK-LE: lbu $8, 1($1) # encoding: [0x01,0x00,0x28,0x90]
-# CHECK-LE: lbu $1, 0($1) # encoding: [0x00,0x00,0x21,0x90]
-# CHECK-LE: sll $8, $8, 8 # encoding: [0x00,0x42,0x08,0x00]
-# CHECK-LE: or $8, $8, $1 # encoding: [0x25,0x40,0x01,0x01]
+# CHECK-BE: addiu $1, $zero, 32767 # encoding: [0x24,0x01,0x7f,0xff]
+# CHECK-BE: lbu $8, 0($1) # encoding: [0x90,0x28,0x00,0x00]
+# CHECK-BE: lbu $1, 1($1) # encoding: [0x90,0x21,0x00,0x01]
+# CHECK-BE: sll $8, $8, 8 # encoding: [0x00,0x08,0x42,0x00]
+# CHECK-BE: or $8, $8, $1 # encoding: [0x01,0x01,0x40,0x25]
+# CHECK-LE: addiu $1, $zero, 32767 # encoding: [0xff,0x7f,0x01,0x24]
+# CHECK-LE: lbu $8, 1($1) # encoding: [0x01,0x00,0x28,0x90]
+# CHECK-LE: lbu $1, 0($1) # encoding: [0x00,0x00,0x21,0x90]
+# CHECK-LE: sll $8, $8, 8 # encoding: [0x00,0x42,0x08,0x00]
+# CHECK-LE: or $8, $8, $1 # encoding: [0x25,0x40,0x01,0x01]
# Test ULHU with immediate offset and a source register operand.
ulhu $8, 0($9)
@@ -369,13 +318,13 @@
# CHECK-LE: or $8, $8, $1 # encoding: [0x25,0x40,0x01,0x01]
ulhu $8, 32767($9)
-# CHECK-BE: ori $1, $zero, 32767 # encoding: [0x34,0x01,0x7f,0xff]
+# CHECK-BE: addiu $1, $zero, 32767 # encoding: [0x24,0x01,0x7f,0xff]
# CHECK-BE: addu $1, $1, $9 # encoding: [0x00,0x29,0x08,0x21]
# CHECK-BE: lbu $8, 0($1) # encoding: [0x90,0x28,0x00,0x00]
# CHECK-BE: lbu $1, 1($1) # encoding: [0x90,0x21,0x00,0x01]
# CHECK-BE: sll $8, $8, 8 # encoding: [0x00,0x08,0x42,0x00]
# CHECK-BE: or $8, $8, $1 # encoding: [0x01,0x01,0x40,0x25]
-# CHECK-LE: ori $1, $zero, 32767 # encoding: [0xff,0x7f,0x01,0x34]
+# CHECK-LE: addiu $1, $zero, 32767 # encoding: [0xff,0x7f,0x01,0x24]
# CHECK-LE: addu $1, $1, $9 # encoding: [0x21,0x08,0x29,0x00]
# CHECK-LE: lbu $8, 1($1) # encoding: [0x01,0x00,0x28,0x90]
# CHECK-LE: lbu $1, 0($1) # encoding: [0x00,0x00,0x21,0x90]
@@ -438,10 +387,10 @@
# CHECK-LE: lwr $8, 0($1) # encoding: [0x00,0x00,0x28,0x98]
ulw $8, 32765
-# CHECK-BE: ori $1, $zero, 32765 # encoding: [0x34,0x01,0x7f,0xfd]
+# CHECK-BE: addiu $1, $zero, 32765 # encoding: [0x24,0x01,0x7f,0xfd]
# CHECK-BE: lwl $8, 0($1) # encoding: [0x88,0x28,0x00,0x00]
# CHECK-BE: lwr $8, 3($1) # encoding: [0x98,0x28,0x00,0x03]
-# CHECK-LE: ori $1, $zero, 32765 # encoding: [0xfd,0x7f,0x01,0x34]
+# CHECK-LE: addiu $1, $zero, 32765 # encoding: [0xfd,0x7f,0x01,0x24]
# CHECK-LE: lwl $8, 3($1) # encoding: [0x03,0x00,0x28,0x88]
# CHECK-LE: lwr $8, 0($1) # encoding: [0x00,0x00,0x28,0x98]
@@ -509,11 +458,11 @@
# CHECK-LE: lwr $8, 0($1) # encoding: [0x00,0x00,0x28,0x98]
ulw $8, 32765($9)
-# CHECK-BE: ori $1, $zero, 32765 # encoding: [0x34,0x01,0x7f,0xfd]
+# CHECK-BE: addiu $1, $zero, 32765 # encoding: [0x24,0x01,0x7f,0xfd]
# CHECK-BE: addu $1, $1, $9 # encoding: [0x00,0x29,0x08,0x21]
# CHECK-BE: lwl $8, 0($1) # encoding: [0x88,0x28,0x00,0x00]
# CHECK-BE: lwr $8, 3($1) # encoding: [0x98,0x28,0x00,0x03]
-# CHECK-LE: ori $1, $zero, 32765 # encoding: [0xfd,0x7f,0x01,0x34]
+# CHECK-LE: addiu $1, $zero, 32765 # encoding: [0xfd,0x7f,0x01,0x24]
# CHECK-LE: addu $1, $1, $9 # encoding: [0x21,0x08,0x29,0x00]
# CHECK-LE: lwl $8, 3($1) # encoding: [0x03,0x00,0x28,0x88]
# CHECK-LE: lwr $8, 0($1) # encoding: [0x00,0x00,0x28,0x98]
diff --git a/test/MC/Mips/mips64-expansions.s b/test/MC/Mips/mips64-expansions.s
index a66a520a2117..b8f1e7a3e87e 100644
--- a/test/MC/Mips/mips64-expansions.s
+++ b/test/MC/Mips/mips64-expansions.s
@@ -4,7 +4,7 @@
# Immediate is <= 32 bits.
dli $5, 123
-# CHECK: ori $5, $zero, 123 # encoding: [0x7b,0x00,0x05,0x34]
+# CHECK: addiu $5, $zero, 123 # encoding: [0x7b,0x00,0x05,0x24]
dli $6, -2345
# CHECK: addiu $6, $zero, -2345 # encoding: [0xd7,0xf6,0x06,0x24]
diff --git a/test/MC/Sparc/sparc-alu-instructions.s b/test/MC/Sparc/sparc-alu-instructions.s
index e2e5ef867252..98caf1d6d673 100644
--- a/test/MC/Sparc/sparc-alu-instructions.s
+++ b/test/MC/Sparc/sparc-alu-instructions.s
@@ -76,8 +76,8 @@
! CHECK: mov 255, %g3 ! encoding: [0x86,0x10,0x20,0xff]
mov 0xff, %g3
- ! CHECK: restore ! encoding: [0x81,0xe8,0x00,0x00]
- restore %g0, %g0, %g0
+ ! CHECK: restore %g0, %g0, %g1 ! encoding: [0x83,0xe8,0x00,0x00]
+ restore %g0, %g0, %g1
! CHECK: addx %g2, %g1, %g3 ! encoding: [0x86,0x40,0x80,0x01]
addx %g2, %g1, %g3
diff --git a/test/MC/Sparc/sparc-mem-instructions.s b/test/MC/Sparc/sparc-mem-instructions.s
index ba4c0f2d1048..c10c8781fd88 100644
--- a/test/MC/Sparc/sparc-mem-instructions.s
+++ b/test/MC/Sparc/sparc-mem-instructions.s
@@ -72,3 +72,13 @@
st %o2, [%g1]
! CHECK: sta %o2, [%i0+%l6] 131 ! encoding: [0xd4,0xa6,0x10,0x76]
sta %o2, [%i0 + %l6] 131
+
+ ! CHECK: flush %g1+%g2 ! encoding: [0x81,0xd8,0x40,0x02]
+ flush %g1 + %g2
+ ! CHECK: flush %g1+8 ! encoding: [0x81,0xd8,0x60,0x08]
+ flush %g1 + 8
+ ! CHECK: flush %g1 ! encoding: [0x81,0xd8,0x40,0x00]
+ flush %g1
+ ! Not specified in manual, but accepted by gas.
+ ! CHECK: flush %g0 ! encoding: [0x81,0xd8,0x00,0x00]
+ flush
diff --git a/test/MC/Sparc/sparc-synthetic-instructions.s b/test/MC/Sparc/sparc-synthetic-instructions.s
index 5b5a1a77db98..09fd30c09e28 100644
--- a/test/MC/Sparc/sparc-synthetic-instructions.s
+++ b/test/MC/Sparc/sparc-synthetic-instructions.s
@@ -2,6 +2,26 @@
! RUN: llvm-mc %s -arch=sparcv9 -show-encoding | FileCheck %s
! Section A.3 Synthetic Instructions
+ ! CHECK: cmp %g1, %g2 ! encoding: [0x80,0xa0,0x40,0x02]
+ cmp %g1, %g2
+ ! CHECK: cmp %g1, 5 ! encoding: [0x80,0xa0,0x60,0x05]
+ cmp %g1, 5
+
+ ! jmp and call are tested in sparc-ctrl-instructions.
+
+ ! CHECK: tst %g1 ! encoding: [0x80,0x90,0x40,0x00]
+ tst %g1
+
+ ! CHECK: ret ! encoding: [0x81,0xc7,0xe0,0x08]
+ ret
+ ! CHECK: retl ! encoding: [0x81,0xc3,0xe0,0x08]
+ retl
+
+ ! CHECK: restore ! encoding: [0x81,0xe8,0x00,0x00]
+ restore
+ ! CHECK: save ! encoding: [0x81,0xe0,0x00,0x00]
+ save
+
! CHECK: sethi %hi(40000), %g1 ! encoding: [0x03,0b00AAAAAA,A,A]
! CHECK: ! fixup A - offset: 0, value: %hi(40000), kind: fixup_sparc_hi22
! CHECK: or %g1, %lo(40000), %g1 ! encoding: [0x82,0x10,0b011000AA,A]
@@ -10,8 +30,116 @@
! CHECK: mov %lo(1), %g1 ! encoding: [0x82,0x10,0b001000AA,A]
! CHECK: ! fixup A - offset: 0, value: %lo(1), kind: fixup_sparc_lo10
set 1, %g1
-
! CHECK: sethi %hi(32768), %g1 ! encoding: [0x03,0b00AAAAAA,A,A]
! CHECK: ! fixup A - offset: 0, value: %hi(32768), kind: fixup_sparc_hi22
set 32768, %g1
+ ! CHECK: xnor %g1, %g0, %g2 ! encoding: [0x84,0x38,0x40,0x00]
+ not %g1, %g2
+ ! CHECK: xnor %g1, %g0, %g1 ! encoding: [0x82,0x38,0x40,0x00]
+ not %g1
+
+ ! CHECK: sub %g0, %g1, %g2 ! encoding: [0x84,0x20,0x00,0x01]
+ neg %g1, %g2
+ ! CHECK: sub %g0, %g1, %g1 ! encoding: [0x82,0x20,0x00,0x01]
+ neg %g1
+
+ ! CHECK: add %g1, 1, %g1 ! encoding: [0x82,0x00,0x60,0x01]
+ inc %g1
+ ! CHECK: add %g1, 55, %g1 ! encoding: [0x82,0x00,0x60,0x37]
+ inc 55, %g1
+ ! CHECK: addcc %g1, 1, %g1 ! encoding: [0x82,0x80,0x60,0x01]
+ inccc %g1
+ ! CHECK: addcc %g1, 55, %g1 ! encoding: [0x82,0x80,0x60,0x37]
+ inccc 55, %g1
+
+ ! CHECK: sub %g1, 1, %g1 ! encoding: [0x82,0x20,0x60,0x01]
+ dec %g1
+ ! CHECK: sub %g1, 55, %g1 ! encoding: [0x82,0x20,0x60,0x37]
+ dec 55, %g1
+ ! CHECK: subcc %g1, 1, %g1 ! encoding: [0x82,0xa0,0x60,0x01]
+ deccc %g1
+ ! CHECK: subcc %g1, 55, %g1 ! encoding: [0x82,0xa0,0x60,0x37]
+ deccc 55, %g1
+
+ ! CHECK: andcc %g2, %g1, %g0 ! encoding: [0x80,0x88,0x80,0x01]
+ btst %g1, %g2
+ ! CHECK: andcc %g2, 4, %g0 ! encoding: [0x80,0x88,0xa0,0x04]
+ btst 4, %g2
+ ! CHECK: or %g2, %g1, %g2 ! encoding: [0x84,0x10,0x80,0x01]
+ bset %g1, %g2
+ ! CHECK: or %g2, 4, %g2 ! encoding: [0x84,0x10,0xa0,0x04]
+ bset 4, %g2
+ ! CHECK: andn %g2, %g1, %g2 ! encoding: [0x84,0x28,0x80,0x01]
+ bclr %g1, %g2
+ ! CHECK: andn %g2, 4, %g2 ! encoding: [0x84,0x28,0xa0,0x04]
+ bclr 4, %g2
+ ! CHECK: xor %g2, %g1, %g2 ! encoding: [0x84,0x18,0x80,0x01]
+ btog %g1, %g2
+ ! CHECK: xor %g2, 4, %g2 ! encoding: [0x84,0x18,0xa0,0x04]
+ btog 4, %g2
+
+ ! CHECK: mov %g0, %g1 ! encoding: [0x82,0x10,0x00,0x00]
+ clr %g1
+ ! CHECK: stb %g0, [%g1+%g2] ! encoding: [0xc0,0x28,0x40,0x02]
+ clrb [%g1+%g2]
+ ! CHECK: sth %g0, [%g1+%g2] ! encoding: [0xc0,0x30,0x40,0x02]
+ clrh [%g1+%g2]
+ ! CHECK: st %g0, [%g1+%g2] ! encoding: [0xc0,0x20,0x40,0x02]
+ clr [%g1+%g2]
+
+ ! mov reg_or_imm,reg tested in sparc-alu-instructions.s
+
+ ! CHECK: rd %y, %i0 ! encoding: [0xb1,0x40,0x00,0x00]
+ mov %y, %i0
+ ! CHECK: rd %asr1, %i0 ! encoding: [0xb1,0x40,0x40,0x00]
+ mov %asr1, %i0
+ ! CHECK: rd %psr, %i0 ! encoding: [0xb1,0x48,0x00,0x00]
+ mov %psr, %i0
+ ! CHECK: rd %wim, %i0 ! encoding: [0xb1,0x50,0x00,0x00]
+ mov %wim, %i0
+ ! CHECK: rd %tbr, %i0 ! encoding: [0xb1,0x58,0x00,0x00]
+ mov %tbr, %i0
+
+ ! CHECK: wr %g0, %i0, %y ! encoding: [0x81,0x80,0x00,0x18]
+ mov %i0, %y
+ ! CHECK: wr %g0, 5, %y ! encoding: [0x81,0x80,0x20,0x05]
+ mov 5, %y
+ ! CHECK: wr %g0, %i0, %asr15 ! encoding: [0x9f,0x80,0x00,0x18]
+ mov %i0, %asr15
+ ! CHECK: wr %g0, 5, %asr15 ! encoding: [0x9f,0x80,0x20,0x05]
+ mov 5, %asr15
+ ! CHECK: wr %g0, %i0, %psr ! encoding: [0x81,0x88,0x00,0x18]
+ mov %i0, %psr
+ ! CHECK: wr %g0, 5, %psr ! encoding: [0x81,0x88,0x20,0x05]
+ mov 5, %psr
+ ! CHECK: wr %g0, %i0, %wim ! encoding: [0x81,0x90,0x00,0x18]
+ mov %i0, %wim
+ ! CHECK: wr %g0, 5, %wim ! encoding: [0x81,0x90,0x20,0x05]
+ mov 5, %wim
+ ! CHECK: wr %g0, %i0, %tbr ! encoding: [0x81,0x98,0x00,0x18]
+ mov %i0, %tbr
+ ! CHECK: wr %g0, 5, %tbr ! encoding: [0x81,0x98,0x20,0x05]
+ mov 5, %tbr
+
+! Other aliases
+ ! CHECK: wr %g0, %i0, %y ! encoding: [0x81,0x80,0x00,0x18]
+ wr %i0, %y
+ ! CHECK: wr %g0, 5, %y ! encoding: [0x81,0x80,0x20,0x05]
+ wr 5, %y
+ ! CHECK: wr %g0, %i0, %asr15 ! encoding: [0x9f,0x80,0x00,0x18]
+ wr %i0, %asr15
+ ! CHECK: wr %g0, 5, %asr15 ! encoding: [0x9f,0x80,0x20,0x05]
+ wr 5, %asr15
+ ! CHECK: wr %g0, %i0, %psr ! encoding: [0x81,0x88,0x00,0x18]
+ wr %i0, %psr
+ ! CHECK: wr %g0, 5, %psr ! encoding: [0x81,0x88,0x20,0x05]
+ wr 5, %psr
+ ! CHECK: wr %g0, %i0, %wim ! encoding: [0x81,0x90,0x00,0x18]
+ wr %i0, %wim
+ ! CHECK: wr %g0, 5, %wim ! encoding: [0x81,0x90,0x20,0x05]
+ wr 5, %wim
+ ! CHECK: wr %g0, %i0, %tbr ! encoding: [0x81,0x98,0x00,0x18]
+ wr %i0, %tbr
+ ! CHECK: wr %g0, 5, %tbr ! encoding: [0x81,0x98,0x20,0x05]
+ wr 5, %tbr
diff --git a/test/MC/X86/AlignedBundling/nesting.s b/test/MC/X86/AlignedBundling/nesting.s
index 74b8fe9ff49b..16ed5a44da56 100644
--- a/test/MC/X86/AlignedBundling/nesting.s
+++ b/test/MC/X86/AlignedBundling/nesting.s
@@ -6,7 +6,8 @@
# Will be bundle-aligning to 16 byte boundaries
.bundle_align_mode 4
.text
-# CHECK-LABEL: foo
+# CHECK-LABEL: foo:
+.type foo,@function
foo:
# Test that bundle alignment mode can be set more than once.
.bundle_align_mode 4
@@ -19,11 +20,12 @@ foo:
callq bar
.bundle_unlock
.bundle_unlock
-# CHECK: 10: callq
-# CHECK-NEXT: 15: callq
+# CHECK: 10: callq {{.*}} <bar>
+# CHECK-NEXT: 15: callq {{.*}} <bar>
.p2align 4
-# CHECK-LABEL: bar
+# CHECK-LABEL: bar:
+.type bar,@function
bar:
callq foo
callq foo
@@ -35,10 +37,11 @@ bar:
callq bar
.bundle_unlock
.bundle_unlock
-# CHECK: 36: callq
-# CHECK-NEXT: 3b: callq
+# CHECK: 36: callq {{.*}} <bar>
+# CHECK-NEXT: 3b: callq {{.*}} <bar>
-# CHECK-LABEL: baz
+# CHECK-LABEL: baz:
+.type baz,@function
baz:
callq foo
callq foo
@@ -50,10 +53,11 @@ baz:
callq bar
.bundle_unlock
.bundle_unlock
-# CHECK: 56: callq
-# CHECK-NEXT: 5b: callq
+# CHECK: 56: callq {{.*}} <bar>
+# CHECK-NEXT: 5b: callq {{.*}} <bar>
# CHECK-LABEL: quux
+.type quux,@function
quux:
callq bar
callq bar
@@ -65,5 +69,5 @@ quux:
.bundle_unlock
# Check that the calls are bundled together when the second one is after the
# inner nest is closed.
-# CHECK: 70: callq
-# CHECK-NEXT: 75: callq
+# CHECK: 70: callq {{.*}} <bar>
+# CHECK-NEXT: 75: callq {{.*}} <bar>
diff --git a/test/MC/X86/avx512-encodings.s b/test/MC/X86/avx512-encodings.s
index 079cb8850e6c..3bb7a5bcd2c3 100644
--- a/test/MC/X86/avx512-encodings.s
+++ b/test/MC/X86/avx512-encodings.s
@@ -12714,6 +12714,138 @@ vpermilpd $0x23, 0x400(%rbx), %zmm2
// CHECK: encoding: [0x62,0xe2,0x4d,0x58,0x2c,0x9a,0xfc,0xfd,0xff,0xff]
vscalefps -516(%rdx){1to16}, %zmm6, %zmm19
+// CHECK: vcvtps2pd %ymm6, %zmm13
+// CHECK: encoding: [0x62,0x71,0x7c,0x48,0x5a,0xee]
+ vcvtps2pd %ymm6, %zmm13
+
+// CHECK: vcvtps2pd %ymm6, %zmm13 {%k3}
+// CHECK: encoding: [0x62,0x71,0x7c,0x4b,0x5a,0xee]
+ vcvtps2pd %ymm6, %zmm13 {%k3}
+
+// CHECK: vcvtps2pd %ymm6, %zmm13 {%k3} {z}
+// CHECK: encoding: [0x62,0x71,0x7c,0xcb,0x5a,0xee]
+ vcvtps2pd %ymm6, %zmm13 {%k3} {z}
+
+// CHECK: vcvtps2pd {sae}, %ymm6, %zmm13
+// CHECK: encoding: [0x62,0x71,0x7c,0x18,0x5a,0xee]
+ vcvtps2pd {sae}, %ymm6, %zmm13
+
+// CHECK: vcvtps2pd (%rcx), %zmm13
+// CHECK: encoding: [0x62,0x71,0x7c,0x48,0x5a,0x29]
+ vcvtps2pd (%rcx), %zmm13
+
+// CHECK: vcvtps2pd 291(%rax,%r14,8), %zmm13
+// CHECK: encoding: [0x62,0x31,0x7c,0x48,0x5a,0xac,0xf0,0x23,0x01,0x00,0x00]
+ vcvtps2pd 291(%rax,%r14,8), %zmm13
+
+// CHECK: vcvtps2pd (%rcx){1to8}, %zmm13
+// CHECK: encoding: [0x62,0x71,0x7c,0x58,0x5a,0x29]
+ vcvtps2pd (%rcx){1to8}, %zmm13
+
+// CHECK: vcvtps2pd 4064(%rdx), %zmm13
+// CHECK: encoding: [0x62,0x71,0x7c,0x48,0x5a,0x6a,0x7f]
+ vcvtps2pd 4064(%rdx), %zmm13
+
+// CHECK: vcvtps2pd 4096(%rdx), %zmm13
+// CHECK: encoding: [0x62,0x71,0x7c,0x48,0x5a,0xaa,0x00,0x10,0x00,0x00]
+ vcvtps2pd 4096(%rdx), %zmm13
+
+// CHECK: vcvtps2pd -4096(%rdx), %zmm13
+// CHECK: encoding: [0x62,0x71,0x7c,0x48,0x5a,0x6a,0x80]
+ vcvtps2pd -4096(%rdx), %zmm13
+
+// CHECK: vcvtps2pd -4128(%rdx), %zmm13
+// CHECK: encoding: [0x62,0x71,0x7c,0x48,0x5a,0xaa,0xe0,0xef,0xff,0xff]
+ vcvtps2pd -4128(%rdx), %zmm13
+
+// CHECK: vcvtps2pd 508(%rdx){1to8}, %zmm13
+// CHECK: encoding: [0x62,0x71,0x7c,0x58,0x5a,0x6a,0x7f]
+ vcvtps2pd 508(%rdx){1to8}, %zmm13
+
+// CHECK: vcvtps2pd 512(%rdx){1to8}, %zmm13
+// CHECK: encoding: [0x62,0x71,0x7c,0x58,0x5a,0xaa,0x00,0x02,0x00,0x00]
+ vcvtps2pd 512(%rdx){1to8}, %zmm13
+
+// CHECK: vcvtps2pd -512(%rdx){1to8}, %zmm13
+// CHECK: encoding: [0x62,0x71,0x7c,0x58,0x5a,0x6a,0x80]
+ vcvtps2pd -512(%rdx){1to8}, %zmm13
+
+// CHECK: vcvtps2pd -516(%rdx){1to8}, %zmm13
+// CHECK: encoding: [0x62,0x71,0x7c,0x58,0x5a,0xaa,0xfc,0xfd,0xff,0xff]
+ vcvtps2pd -516(%rdx){1to8}, %zmm13
+
+// CHECK: vcvtpd2ps %zmm23, %ymm5
+// CHECK: encoding: [0x62,0xb1,0xfd,0x48,0x5a,0xef]
+ vcvtpd2ps %zmm23, %ymm5
+
+// CHECK: vcvtpd2ps %zmm23, %ymm5 {%k5}
+// CHECK: encoding: [0x62,0xb1,0xfd,0x4d,0x5a,0xef]
+ vcvtpd2ps %zmm23, %ymm5 {%k5}
+
+// CHECK: vcvtpd2ps %zmm23, %ymm5 {%k5} {z}
+// CHECK: encoding: [0x62,0xb1,0xfd,0xcd,0x5a,0xef]
+ vcvtpd2ps %zmm23, %ymm5 {%k5} {z}
+
+// CHECK: vcvtpd2ps {rn-sae}, %zmm23, %ymm5
+// CHECK: encoding: [0x62,0xb1,0xfd,0x18,0x5a,0xef]
+ vcvtpd2ps {rn-sae}, %zmm23, %ymm5
+
+// CHECK: vcvtpd2ps {ru-sae}, %zmm23, %ymm5
+// CHECK: encoding: [0x62,0xb1,0xfd,0x58,0x5a,0xef]
+ vcvtpd2ps {ru-sae}, %zmm23, %ymm5
+
+// CHECK: vcvtpd2ps {rd-sae}, %zmm23, %ymm5
+// CHECK: encoding: [0x62,0xb1,0xfd,0x38,0x5a,0xef]
+ vcvtpd2ps {rd-sae}, %zmm23, %ymm5
+
+// CHECK: vcvtpd2ps {rz-sae}, %zmm23, %ymm5
+// CHECK: encoding: [0x62,0xb1,0xfd,0x78,0x5a,0xef]
+ vcvtpd2ps {rz-sae}, %zmm23, %ymm5
+
+// CHECK: vcvtpd2ps (%rcx), %ymm5
+// CHECK: encoding: [0x62,0xf1,0xfd,0x48,0x5a,0x29]
+ vcvtpd2ps (%rcx), %ymm5
+
+// CHECK: vcvtpd2ps 291(%rax,%r14,8), %ymm5
+// CHECK: encoding: [0x62,0xb1,0xfd,0x48,0x5a,0xac,0xf0,0x23,0x01,0x00,0x00]
+ vcvtpd2ps 291(%rax,%r14,8), %ymm5
+
+// CHECK: vcvtpd2ps (%rcx){1to8}, %ymm5
+// CHECK: encoding: [0x62,0xf1,0xfd,0x58,0x5a,0x29]
+ vcvtpd2ps (%rcx){1to8}, %ymm5
+
+// CHECK: vcvtpd2ps 8128(%rdx), %ymm5
+// CHECK: encoding: [0x62,0xf1,0xfd,0x48,0x5a,0x6a,0x7f]
+ vcvtpd2ps 8128(%rdx), %ymm5
+
+// CHECK: vcvtpd2ps 8192(%rdx), %ymm5
+// CHECK: encoding: [0x62,0xf1,0xfd,0x48,0x5a,0xaa,0x00,0x20,0x00,0x00]
+ vcvtpd2ps 8192(%rdx), %ymm5
+
+// CHECK: vcvtpd2ps -8192(%rdx), %ymm5
+// CHECK: encoding: [0x62,0xf1,0xfd,0x48,0x5a,0x6a,0x80]
+ vcvtpd2ps -8192(%rdx), %ymm5
+
+// CHECK: vcvtpd2ps -8256(%rdx), %ymm5
+// CHECK: encoding: [0x62,0xf1,0xfd,0x48,0x5a,0xaa,0xc0,0xdf,0xff,0xff]
+ vcvtpd2ps -8256(%rdx), %ymm5
+
+// CHECK: vcvtpd2ps 1016(%rdx){1to8}, %ymm5
+// CHECK: encoding: [0x62,0xf1,0xfd,0x58,0x5a,0x6a,0x7f]
+ vcvtpd2ps 1016(%rdx){1to8}, %ymm5
+
+// CHECK: vcvtpd2ps 1024(%rdx){1to8}, %ymm5
+// CHECK: encoding: [0x62,0xf1,0xfd,0x58,0x5a,0xaa,0x00,0x04,0x00,0x00]
+ vcvtpd2ps 1024(%rdx){1to8}, %ymm5
+
+// CHECK: vcvtpd2ps -1024(%rdx){1to8}, %ymm5
+// CHECK: encoding: [0x62,0xf1,0xfd,0x58,0x5a,0x6a,0x80]
+ vcvtpd2ps -1024(%rdx){1to8}, %ymm5
+
+// CHECK: vcvtpd2ps -1032(%rdx){1to8}, %ymm5
+// CHECK: encoding: [0x62,0xf1,0xfd,0x58,0x5a,0xaa,0xf8,0xfb,0xff,0xff]
+ vcvtpd2ps -1032(%rdx){1to8}, %ymm5
+
// CHECK: vfmadd132ss %xmm22, %xmm17, %xmm30
// CHECK: encoding: [0x62,0x22,0x75,0x00,0x99,0xf6]
vfmadd132ss %xmm22, %xmm17, %xmm30
diff --git a/test/MC/X86/x86-64-avx512bw.s b/test/MC/X86/x86-64-avx512bw.s
index fc6df8c2d40b..95eabfdb3411 100644
--- a/test/MC/X86/x86-64-avx512bw.s
+++ b/test/MC/X86/x86-64-avx512bw.s
@@ -3667,3 +3667,112 @@
// CHECK: vpabsw -8256(%rdx), %zmm30
// CHECK: encoding: [0x62,0x62,0x7d,0x48,0x1d,0xb2,0xc0,0xdf,0xff,0xff]
vpabsw -8256(%rdx), %zmm30
+
+// CHECK: vpmulhuw %zmm21, %zmm24, %zmm21
+// CHECK: encoding: [0x62,0xa1,0x3d,0x40,0xe4,0xed]
+ vpmulhuw %zmm21, %zmm24, %zmm21
+
+// CHECK: vpmulhuw %zmm21, %zmm24, %zmm21 {%k3}
+// CHECK: encoding: [0x62,0xa1,0x3d,0x43,0xe4,0xed]
+ vpmulhuw %zmm21, %zmm24, %zmm21 {%k3}
+
+// CHECK: vpmulhuw %zmm21, %zmm24, %zmm21 {%k3} {z}
+// CHECK: encoding: [0x62,0xa1,0x3d,0xc3,0xe4,0xed]
+ vpmulhuw %zmm21, %zmm24, %zmm21 {%k3} {z}
+
+// CHECK: vpmulhuw (%rcx), %zmm24, %zmm21
+// CHECK: encoding: [0x62,0xe1,0x3d,0x40,0xe4,0x29]
+ vpmulhuw (%rcx), %zmm24, %zmm21
+
+// CHECK: vpmulhuw 291(%rax,%r14,8), %zmm24, %zmm21
+// CHECK: encoding: [0x62,0xa1,0x3d,0x40,0xe4,0xac,0xf0,0x23,0x01,0x00,0x00]
+ vpmulhuw 291(%rax,%r14,8), %zmm24, %zmm21
+
+// CHECK: vpmulhuw 8128(%rdx), %zmm24, %zmm21
+// CHECK: encoding: [0x62,0xe1,0x3d,0x40,0xe4,0x6a,0x7f]
+ vpmulhuw 8128(%rdx), %zmm24, %zmm21
+
+// CHECK: vpmulhuw 8192(%rdx), %zmm24, %zmm21
+// CHECK: encoding: [0x62,0xe1,0x3d,0x40,0xe4,0xaa,0x00,0x20,0x00,0x00]
+ vpmulhuw 8192(%rdx), %zmm24, %zmm21
+
+// CHECK: vpmulhuw -8192(%rdx), %zmm24, %zmm21
+// CHECK: encoding: [0x62,0xe1,0x3d,0x40,0xe4,0x6a,0x80]
+ vpmulhuw -8192(%rdx), %zmm24, %zmm21
+
+// CHECK: vpmulhuw -8256(%rdx), %zmm24, %zmm21
+// CHECK: encoding: [0x62,0xe1,0x3d,0x40,0xe4,0xaa,0xc0,0xdf,0xff,0xff]
+ vpmulhuw -8256(%rdx), %zmm24, %zmm21
+
+// CHECK: vpmulhw %zmm27, %zmm26, %zmm30
+// CHECK: encoding: [0x62,0x01,0x2d,0x40,0xe5,0xf3]
+ vpmulhw %zmm27, %zmm26, %zmm30
+
+// CHECK: vpmulhw %zmm27, %zmm26, %zmm30 {%k6}
+// CHECK: encoding: [0x62,0x01,0x2d,0x46,0xe5,0xf3]
+ vpmulhw %zmm27, %zmm26, %zmm30 {%k6}
+
+// CHECK: vpmulhw %zmm27, %zmm26, %zmm30 {%k6} {z}
+// CHECK: encoding: [0x62,0x01,0x2d,0xc6,0xe5,0xf3]
+ vpmulhw %zmm27, %zmm26, %zmm30 {%k6} {z}
+
+// CHECK: vpmulhw (%rcx), %zmm26, %zmm30
+// CHECK: encoding: [0x62,0x61,0x2d,0x40,0xe5,0x31]
+ vpmulhw (%rcx), %zmm26, %zmm30
+
+// CHECK: vpmulhw 291(%rax,%r14,8), %zmm26, %zmm30
+// CHECK: encoding: [0x62,0x21,0x2d,0x40,0xe5,0xb4,0xf0,0x23,0x01,0x00,0x00]
+ vpmulhw 291(%rax,%r14,8), %zmm26, %zmm30
+
+// CHECK: vpmulhw 8128(%rdx), %zmm26, %zmm30
+// CHECK: encoding: [0x62,0x61,0x2d,0x40,0xe5,0x72,0x7f]
+ vpmulhw 8128(%rdx), %zmm26, %zmm30
+
+// CHECK: vpmulhw 8192(%rdx), %zmm26, %zmm30
+// CHECK: encoding: [0x62,0x61,0x2d,0x40,0xe5,0xb2,0x00,0x20,0x00,0x00]
+ vpmulhw 8192(%rdx), %zmm26, %zmm30
+
+// CHECK: vpmulhw -8192(%rdx), %zmm26, %zmm30
+// CHECK: encoding: [0x62,0x61,0x2d,0x40,0xe5,0x72,0x80]
+ vpmulhw -8192(%rdx), %zmm26, %zmm30
+
+// CHECK: vpmulhw -8256(%rdx), %zmm26, %zmm30
+// CHECK: encoding: [0x62,0x61,0x2d,0x40,0xe5,0xb2,0xc0,0xdf,0xff,0xff]
+ vpmulhw -8256(%rdx), %zmm26, %zmm30
+
+// CHECK: vpmulhrsw %zmm25, %zmm27, %zmm21
+// CHECK: encoding: [0x62,0x82,0x25,0x40,0x0b,0xe9]
+ vpmulhrsw %zmm25, %zmm27, %zmm21
+
+// CHECK: vpmulhrsw %zmm25, %zmm27, %zmm21 {%k7}
+// CHECK: encoding: [0x62,0x82,0x25,0x47,0x0b,0xe9]
+ vpmulhrsw %zmm25, %zmm27, %zmm21 {%k7}
+
+// CHECK: vpmulhrsw %zmm25, %zmm27, %zmm21 {%k7} {z}
+// CHECK: encoding: [0x62,0x82,0x25,0xc7,0x0b,0xe9]
+ vpmulhrsw %zmm25, %zmm27, %zmm21 {%k7} {z}
+
+// CHECK: vpmulhrsw (%rcx), %zmm27, %zmm21
+// CHECK: encoding: [0x62,0xe2,0x25,0x40,0x0b,0x29]
+ vpmulhrsw (%rcx), %zmm27, %zmm21
+
+// CHECK: vpmulhrsw 291(%rax,%r14,8), %zmm27, %zmm21
+// CHECK: encoding: [0x62,0xa2,0x25,0x40,0x0b,0xac,0xf0,0x23,0x01,0x00,0x00]
+ vpmulhrsw 291(%rax,%r14,8), %zmm27, %zmm21
+
+// CHECK: vpmulhrsw 8128(%rdx), %zmm27, %zmm21
+// CHECK: encoding: [0x62,0xe2,0x25,0x40,0x0b,0x6a,0x7f]
+ vpmulhrsw 8128(%rdx), %zmm27, %zmm21
+
+// CHECK: vpmulhrsw 8192(%rdx), %zmm27, %zmm21
+// CHECK: encoding: [0x62,0xe2,0x25,0x40,0x0b,0xaa,0x00,0x20,0x00,0x00]
+ vpmulhrsw 8192(%rdx), %zmm27, %zmm21
+
+// CHECK: vpmulhrsw -8192(%rdx), %zmm27, %zmm21
+// CHECK: encoding: [0x62,0xe2,0x25,0x40,0x0b,0x6a,0x80]
+ vpmulhrsw -8192(%rdx), %zmm27, %zmm21
+
+// CHECK: vpmulhrsw -8256(%rdx), %zmm27, %zmm21
+// CHECK: encoding: [0x62,0xe2,0x25,0x40,0x0b,0xaa,0xc0,0xdf,0xff,0xff]
+ vpmulhrsw -8256(%rdx), %zmm27, %zmm21
+
diff --git a/test/MC/X86/x86-64-avx512bw_vl.s b/test/MC/X86/x86-64-avx512bw_vl.s
index 14a87df1ea83..014be27564bf 100644
--- a/test/MC/X86/x86-64-avx512bw_vl.s
+++ b/test/MC/X86/x86-64-avx512bw_vl.s
@@ -6583,3 +6583,219 @@
// CHECK: encoding: [0x62,0xe2,0x6d,0x20,0x00,0x9a,0xe0,0xef,0xff,0xff]
vpshufb -4128(%rdx), %ymm18, %ymm19
+// CHECK: vpmulhuw %xmm18, %xmm21, %xmm24
+// CHECK: encoding: [0x62,0x21,0x55,0x00,0xe4,0xc2]
+ vpmulhuw %xmm18, %xmm21, %xmm24
+
+// CHECK: vpmulhuw %xmm18, %xmm21, %xmm24 {%k3}
+// CHECK: encoding: [0x62,0x21,0x55,0x03,0xe4,0xc2]
+ vpmulhuw %xmm18, %xmm21, %xmm24 {%k3}
+
+// CHECK: vpmulhuw %xmm18, %xmm21, %xmm24 {%k3} {z}
+// CHECK: encoding: [0x62,0x21,0x55,0x83,0xe4,0xc2]
+ vpmulhuw %xmm18, %xmm21, %xmm24 {%k3} {z}
+
+// CHECK: vpmulhuw (%rcx), %xmm21, %xmm24
+// CHECK: encoding: [0x62,0x61,0x55,0x00,0xe4,0x01]
+ vpmulhuw (%rcx), %xmm21, %xmm24
+
+// CHECK: vpmulhuw 291(%rax,%r14,8), %xmm21, %xmm24
+// CHECK: encoding: [0x62,0x21,0x55,0x00,0xe4,0x84,0xf0,0x23,0x01,0x00,0x00]
+ vpmulhuw 291(%rax,%r14,8), %xmm21, %xmm24
+
+// CHECK: vpmulhuw 2032(%rdx), %xmm21, %xmm24
+// CHECK: encoding: [0x62,0x61,0x55,0x00,0xe4,0x42,0x7f]
+ vpmulhuw 2032(%rdx), %xmm21, %xmm24
+
+// CHECK: vpmulhuw 2048(%rdx), %xmm21, %xmm24
+// CHECK: encoding: [0x62,0x61,0x55,0x00,0xe4,0x82,0x00,0x08,0x00,0x00]
+ vpmulhuw 2048(%rdx), %xmm21, %xmm24
+
+// CHECK: vpmulhuw -2048(%rdx), %xmm21, %xmm24
+// CHECK: encoding: [0x62,0x61,0x55,0x00,0xe4,0x42,0x80]
+ vpmulhuw -2048(%rdx), %xmm21, %xmm24
+
+// CHECK: vpmulhuw -2064(%rdx), %xmm21, %xmm24
+// CHECK: encoding: [0x62,0x61,0x55,0x00,0xe4,0x82,0xf0,0xf7,0xff,0xff]
+ vpmulhuw -2064(%rdx), %xmm21, %xmm24
+
+// CHECK: vpmulhuw %ymm19, %ymm28, %ymm19
+// CHECK: encoding: [0x62,0xa1,0x1d,0x20,0xe4,0xdb]
+ vpmulhuw %ymm19, %ymm28, %ymm19
+
+// CHECK: vpmulhuw %ymm19, %ymm28, %ymm19 {%k2}
+// CHECK: encoding: [0x62,0xa1,0x1d,0x22,0xe4,0xdb]
+ vpmulhuw %ymm19, %ymm28, %ymm19 {%k2}
+
+// CHECK: vpmulhuw %ymm19, %ymm28, %ymm19 {%k2} {z}
+// CHECK: encoding: [0x62,0xa1,0x1d,0xa2,0xe4,0xdb]
+ vpmulhuw %ymm19, %ymm28, %ymm19 {%k2} {z}
+
+// CHECK: vpmulhuw (%rcx), %ymm28, %ymm19
+// CHECK: encoding: [0x62,0xe1,0x1d,0x20,0xe4,0x19]
+ vpmulhuw (%rcx), %ymm28, %ymm19
+
+// CHECK: vpmulhuw 291(%rax,%r14,8), %ymm28, %ymm19
+// CHECK: encoding: [0x62,0xa1,0x1d,0x20,0xe4,0x9c,0xf0,0x23,0x01,0x00,0x00]
+ vpmulhuw 291(%rax,%r14,8), %ymm28, %ymm19
+
+// CHECK: vpmulhuw 4064(%rdx), %ymm28, %ymm19
+// CHECK: encoding: [0x62,0xe1,0x1d,0x20,0xe4,0x5a,0x7f]
+ vpmulhuw 4064(%rdx), %ymm28, %ymm19
+
+// CHECK: vpmulhuw 4096(%rdx), %ymm28, %ymm19
+// CHECK: encoding: [0x62,0xe1,0x1d,0x20,0xe4,0x9a,0x00,0x10,0x00,0x00]
+ vpmulhuw 4096(%rdx), %ymm28, %ymm19
+
+// CHECK: vpmulhuw -4096(%rdx), %ymm28, %ymm19
+// CHECK: encoding: [0x62,0xe1,0x1d,0x20,0xe4,0x5a,0x80]
+ vpmulhuw -4096(%rdx), %ymm28, %ymm19
+
+// CHECK: vpmulhuw -4128(%rdx), %ymm28, %ymm19
+// CHECK: encoding: [0x62,0xe1,0x1d,0x20,0xe4,0x9a,0xe0,0xef,0xff,0xff]
+ vpmulhuw -4128(%rdx), %ymm28, %ymm19
+
+// CHECK: vpmulhw %xmm25, %xmm20, %xmm22
+// CHECK: encoding: [0x62,0x81,0x5d,0x00,0xe5,0xf1]
+ vpmulhw %xmm25, %xmm20, %xmm22
+
+// CHECK: vpmulhw %xmm25, %xmm20, %xmm22 {%k2}
+// CHECK: encoding: [0x62,0x81,0x5d,0x02,0xe5,0xf1]
+ vpmulhw %xmm25, %xmm20, %xmm22 {%k2}
+
+// CHECK: vpmulhw %xmm25, %xmm20, %xmm22 {%k2} {z}
+// CHECK: encoding: [0x62,0x81,0x5d,0x82,0xe5,0xf1]
+ vpmulhw %xmm25, %xmm20, %xmm22 {%k2} {z}
+
+// CHECK: vpmulhw (%rcx), %xmm20, %xmm22
+// CHECK: encoding: [0x62,0xe1,0x5d,0x00,0xe5,0x31]
+ vpmulhw (%rcx), %xmm20, %xmm22
+
+// CHECK: vpmulhw 291(%rax,%r14,8), %xmm20, %xmm22
+// CHECK: encoding: [0x62,0xa1,0x5d,0x00,0xe5,0xb4,0xf0,0x23,0x01,0x00,0x00]
+ vpmulhw 291(%rax,%r14,8), %xmm20, %xmm22
+
+// CHECK: vpmulhw 2032(%rdx), %xmm20, %xmm22
+// CHECK: encoding: [0x62,0xe1,0x5d,0x00,0xe5,0x72,0x7f]
+ vpmulhw 2032(%rdx), %xmm20, %xmm22
+
+// CHECK: vpmulhw 2048(%rdx), %xmm20, %xmm22
+// CHECK: encoding: [0x62,0xe1,0x5d,0x00,0xe5,0xb2,0x00,0x08,0x00,0x00]
+ vpmulhw 2048(%rdx), %xmm20, %xmm22
+
+// CHECK: vpmulhw -2048(%rdx), %xmm20, %xmm22
+// CHECK: encoding: [0x62,0xe1,0x5d,0x00,0xe5,0x72,0x80]
+ vpmulhw -2048(%rdx), %xmm20, %xmm22
+
+// CHECK: vpmulhw -2064(%rdx), %xmm20, %xmm22
+// CHECK: encoding: [0x62,0xe1,0x5d,0x00,0xe5,0xb2,0xf0,0xf7,0xff,0xff]
+ vpmulhw -2064(%rdx), %xmm20, %xmm22
+
+// CHECK: vpmulhw %ymm24, %ymm27, %ymm22
+// CHECK: encoding: [0x62,0x81,0x25,0x20,0xe5,0xf0]
+ vpmulhw %ymm24, %ymm27, %ymm22
+
+// CHECK: vpmulhw %ymm24, %ymm27, %ymm22 {%k1}
+// CHECK: encoding: [0x62,0x81,0x25,0x21,0xe5,0xf0]
+ vpmulhw %ymm24, %ymm27, %ymm22 {%k1}
+
+// CHECK: vpmulhw %ymm24, %ymm27, %ymm22 {%k1} {z}
+// CHECK: encoding: [0x62,0x81,0x25,0xa1,0xe5,0xf0]
+ vpmulhw %ymm24, %ymm27, %ymm22 {%k1} {z}
+
+// CHECK: vpmulhw (%rcx), %ymm27, %ymm22
+// CHECK: encoding: [0x62,0xe1,0x25,0x20,0xe5,0x31]
+ vpmulhw (%rcx), %ymm27, %ymm22
+
+// CHECK: vpmulhw 291(%rax,%r14,8), %ymm27, %ymm22
+// CHECK: encoding: [0x62,0xa1,0x25,0x20,0xe5,0xb4,0xf0,0x23,0x01,0x00,0x00]
+ vpmulhw 291(%rax,%r14,8), %ymm27, %ymm22
+
+// CHECK: vpmulhw 4064(%rdx), %ymm27, %ymm22
+// CHECK: encoding: [0x62,0xe1,0x25,0x20,0xe5,0x72,0x7f]
+ vpmulhw 4064(%rdx), %ymm27, %ymm22
+
+// CHECK: vpmulhw 4096(%rdx), %ymm27, %ymm22
+// CHECK: encoding: [0x62,0xe1,0x25,0x20,0xe5,0xb2,0x00,0x10,0x00,0x00]
+ vpmulhw 4096(%rdx), %ymm27, %ymm22
+
+// CHECK: vpmulhw -4096(%rdx), %ymm27, %ymm22
+// CHECK: encoding: [0x62,0xe1,0x25,0x20,0xe5,0x72,0x80]
+ vpmulhw -4096(%rdx), %ymm27, %ymm22
+
+// CHECK: vpmulhw -4128(%rdx), %ymm27, %ymm22
+// CHECK: encoding: [0x62,0xe1,0x25,0x20,0xe5,0xb2,0xe0,0xef,0xff,0xff]
+ vpmulhw -4128(%rdx), %ymm27, %ymm22
+
+// CHECK: vpmulhrsw %xmm26, %xmm19, %xmm28
+// CHECK: encoding: [0x62,0x02,0x65,0x00,0x0b,0xe2]
+ vpmulhrsw %xmm26, %xmm19, %xmm28
+
+// CHECK: vpmulhrsw %xmm26, %xmm19, %xmm28 {%k6}
+// CHECK: encoding: [0x62,0x02,0x65,0x06,0x0b,0xe2]
+ vpmulhrsw %xmm26, %xmm19, %xmm28 {%k6}
+
+// CHECK: vpmulhrsw %xmm26, %xmm19, %xmm28 {%k6} {z}
+// CHECK: encoding: [0x62,0x02,0x65,0x86,0x0b,0xe2]
+ vpmulhrsw %xmm26, %xmm19, %xmm28 {%k6} {z}
+
+// CHECK: vpmulhrsw (%rcx), %xmm19, %xmm28
+// CHECK: encoding: [0x62,0x62,0x65,0x00,0x0b,0x21]
+ vpmulhrsw (%rcx), %xmm19, %xmm28
+
+// CHECK: vpmulhrsw 291(%rax,%r14,8), %xmm19, %xmm28
+// CHECK: encoding: [0x62,0x22,0x65,0x00,0x0b,0xa4,0xf0,0x23,0x01,0x00,0x00]
+ vpmulhrsw 291(%rax,%r14,8), %xmm19, %xmm28
+
+// CHECK: vpmulhrsw 2032(%rdx), %xmm19, %xmm28
+// CHECK: encoding: [0x62,0x62,0x65,0x00,0x0b,0x62,0x7f]
+ vpmulhrsw 2032(%rdx), %xmm19, %xmm28
+
+// CHECK: vpmulhrsw 2048(%rdx), %xmm19, %xmm28
+// CHECK: encoding: [0x62,0x62,0x65,0x00,0x0b,0xa2,0x00,0x08,0x00,0x00]
+ vpmulhrsw 2048(%rdx), %xmm19, %xmm28
+
+// CHECK: vpmulhrsw -2048(%rdx), %xmm19, %xmm28
+// CHECK: encoding: [0x62,0x62,0x65,0x00,0x0b,0x62,0x80]
+ vpmulhrsw -2048(%rdx), %xmm19, %xmm28
+
+// CHECK: vpmulhrsw -2064(%rdx), %xmm19, %xmm28
+// CHECK: encoding: [0x62,0x62,0x65,0x00,0x0b,0xa2,0xf0,0xf7,0xff,0xff]
+ vpmulhrsw -2064(%rdx), %xmm19, %xmm28
+
+// CHECK: vpmulhrsw %ymm26, %ymm20, %ymm28
+// CHECK: encoding: [0x62,0x02,0x5d,0x20,0x0b,0xe2]
+ vpmulhrsw %ymm26, %ymm20, %ymm28
+
+// CHECK: vpmulhrsw %ymm26, %ymm20, %ymm28 {%k3}
+// CHECK: encoding: [0x62,0x02,0x5d,0x23,0x0b,0xe2]
+ vpmulhrsw %ymm26, %ymm20, %ymm28 {%k3}
+
+// CHECK: vpmulhrsw %ymm26, %ymm20, %ymm28 {%k3} {z}
+// CHECK: encoding: [0x62,0x02,0x5d,0xa3,0x0b,0xe2]
+ vpmulhrsw %ymm26, %ymm20, %ymm28 {%k3} {z}
+
+// CHECK: vpmulhrsw (%rcx), %ymm20, %ymm28
+// CHECK: encoding: [0x62,0x62,0x5d,0x20,0x0b,0x21]
+ vpmulhrsw (%rcx), %ymm20, %ymm28
+
+// CHECK: vpmulhrsw 291(%rax,%r14,8), %ymm20, %ymm28
+// CHECK: encoding: [0x62,0x22,0x5d,0x20,0x0b,0xa4,0xf0,0x23,0x01,0x00,0x00]
+ vpmulhrsw 291(%rax,%r14,8), %ymm20, %ymm28
+
+// CHECK: vpmulhrsw 4064(%rdx), %ymm20, %ymm28
+// CHECK: encoding: [0x62,0x62,0x5d,0x20,0x0b,0x62,0x7f]
+ vpmulhrsw 4064(%rdx), %ymm20, %ymm28
+
+// CHECK: vpmulhrsw 4096(%rdx), %ymm20, %ymm28
+// CHECK: encoding: [0x62,0x62,0x5d,0x20,0x0b,0xa2,0x00,0x10,0x00,0x00]
+ vpmulhrsw 4096(%rdx), %ymm20, %ymm28
+
+// CHECK: vpmulhrsw -4096(%rdx), %ymm20, %ymm28
+// CHECK: encoding: [0x62,0x62,0x5d,0x20,0x0b,0x62,0x80]
+ vpmulhrsw -4096(%rdx), %ymm20, %ymm28
+
+// CHECK: vpmulhrsw -4128(%rdx), %ymm20, %ymm28
+// CHECK: encoding: [0x62,0x62,0x5d,0x20,0x0b,0xa2,0xe0,0xef,0xff,0xff]
+ vpmulhrsw -4128(%rdx), %ymm20, %ymm28
+
diff --git a/test/MC/X86/x86-64-avx512dq.s b/test/MC/X86/x86-64-avx512dq.s
index 92656dd3405e..4b26f7a0b80e 100644
--- a/test/MC/X86/x86-64-avx512dq.s
+++ b/test/MC/X86/x86-64-avx512dq.s
@@ -1390,3 +1390,520 @@
// CHECK: vrangess $123, -516(%rdx), %xmm24, %xmm25
// CHECK: encoding: [0x62,0x63,0x3d,0x00,0x51,0x8a,0xfc,0xfd,0xff,0xff,0x7b]
vrangess $0x7b,-516(%rdx), %xmm24, %xmm25
+
+// CHECK: vcvtpd2qq %zmm29, %zmm18
+// CHECK: encoding: [0x62,0x81,0xfd,0x48,0x7b,0xd5]
+ vcvtpd2qq %zmm29, %zmm18
+
+// CHECK: vcvtpd2qq %zmm29, %zmm18 {%k6}
+// CHECK: encoding: [0x62,0x81,0xfd,0x4e,0x7b,0xd5]
+ vcvtpd2qq %zmm29, %zmm18 {%k6}
+
+// CHECK: vcvtpd2qq %zmm29, %zmm18 {%k6} {z}
+// CHECK: encoding: [0x62,0x81,0xfd,0xce,0x7b,0xd5]
+ vcvtpd2qq %zmm29, %zmm18 {%k6} {z}
+
+// CHECK: vcvtpd2qq {rn-sae}, %zmm29, %zmm18
+// CHECK: encoding: [0x62,0x81,0xfd,0x18,0x7b,0xd5]
+ vcvtpd2qq {rn-sae}, %zmm29, %zmm18
+
+// CHECK: vcvtpd2qq {ru-sae}, %zmm29, %zmm18
+// CHECK: encoding: [0x62,0x81,0xfd,0x58,0x7b,0xd5]
+ vcvtpd2qq {ru-sae}, %zmm29, %zmm18
+
+// CHECK: vcvtpd2qq {rd-sae}, %zmm29, %zmm18
+// CHECK: encoding: [0x62,0x81,0xfd,0x38,0x7b,0xd5]
+ vcvtpd2qq {rd-sae}, %zmm29, %zmm18
+
+// CHECK: vcvtpd2qq {rz-sae}, %zmm29, %zmm18
+// CHECK: encoding: [0x62,0x81,0xfd,0x78,0x7b,0xd5]
+ vcvtpd2qq {rz-sae}, %zmm29, %zmm18
+
+// CHECK: vcvtpd2qq (%rcx), %zmm18
+// CHECK: encoding: [0x62,0xe1,0xfd,0x48,0x7b,0x11]
+ vcvtpd2qq (%rcx), %zmm18
+
+// CHECK: vcvtpd2qq 291(%rax,%r14,8), %zmm18
+// CHECK: encoding: [0x62,0xa1,0xfd,0x48,0x7b,0x94,0xf0,0x23,0x01,0x00,0x00]
+ vcvtpd2qq 291(%rax,%r14,8), %zmm18
+
+// CHECK: vcvtpd2qq (%rcx){1to8}, %zmm18
+// CHECK: encoding: [0x62,0xe1,0xfd,0x58,0x7b,0x11]
+ vcvtpd2qq (%rcx){1to8}, %zmm18
+
+// CHECK: vcvtpd2qq 8128(%rdx), %zmm18
+// CHECK: encoding: [0x62,0xe1,0xfd,0x48,0x7b,0x52,0x7f]
+ vcvtpd2qq 8128(%rdx), %zmm18
+
+// CHECK: vcvtpd2qq 8192(%rdx), %zmm18
+// CHECK: encoding: [0x62,0xe1,0xfd,0x48,0x7b,0x92,0x00,0x20,0x00,0x00]
+ vcvtpd2qq 8192(%rdx), %zmm18
+
+// CHECK: vcvtpd2qq -8192(%rdx), %zmm18
+// CHECK: encoding: [0x62,0xe1,0xfd,0x48,0x7b,0x52,0x80]
+ vcvtpd2qq -8192(%rdx), %zmm18
+
+// CHECK: vcvtpd2qq -8256(%rdx), %zmm18
+// CHECK: encoding: [0x62,0xe1,0xfd,0x48,0x7b,0x92,0xc0,0xdf,0xff,0xff]
+ vcvtpd2qq -8256(%rdx), %zmm18
+
+// CHECK: vcvtpd2qq 1016(%rdx){1to8}, %zmm18
+// CHECK: encoding: [0x62,0xe1,0xfd,0x58,0x7b,0x52,0x7f]
+ vcvtpd2qq 1016(%rdx){1to8}, %zmm18
+
+// CHECK: vcvtpd2qq 1024(%rdx){1to8}, %zmm18
+// CHECK: encoding: [0x62,0xe1,0xfd,0x58,0x7b,0x92,0x00,0x04,0x00,0x00]
+ vcvtpd2qq 1024(%rdx){1to8}, %zmm18
+
+// CHECK: vcvtpd2qq -1024(%rdx){1to8}, %zmm18
+// CHECK: encoding: [0x62,0xe1,0xfd,0x58,0x7b,0x52,0x80]
+ vcvtpd2qq -1024(%rdx){1to8}, %zmm18
+
+// CHECK: vcvtpd2qq -1032(%rdx){1to8}, %zmm18
+// CHECK: encoding: [0x62,0xe1,0xfd,0x58,0x7b,0x92,0xf8,0xfb,0xff,0xff]
+ vcvtpd2qq -1032(%rdx){1to8}, %zmm18
+
+// CHECK: vcvtpd2uqq %zmm21, %zmm22
+// CHECK: encoding: [0x62,0xa1,0xfd,0x48,0x79,0xf5]
+ vcvtpd2uqq %zmm21, %zmm22
+
+// CHECK: vcvtpd2uqq %zmm21, %zmm22 {%k5}
+// CHECK: encoding: [0x62,0xa1,0xfd,0x4d,0x79,0xf5]
+ vcvtpd2uqq %zmm21, %zmm22 {%k5}
+
+// CHECK: vcvtpd2uqq %zmm21, %zmm22 {%k5} {z}
+// CHECK: encoding: [0x62,0xa1,0xfd,0xcd,0x79,0xf5]
+ vcvtpd2uqq %zmm21, %zmm22 {%k5} {z}
+
+// CHECK: vcvtpd2uqq {rn-sae}, %zmm21, %zmm22
+// CHECK: encoding: [0x62,0xa1,0xfd,0x18,0x79,0xf5]
+ vcvtpd2uqq {rn-sae}, %zmm21, %zmm22
+
+// CHECK: vcvtpd2uqq {ru-sae}, %zmm21, %zmm22
+// CHECK: encoding: [0x62,0xa1,0xfd,0x58,0x79,0xf5]
+ vcvtpd2uqq {ru-sae}, %zmm21, %zmm22
+
+// CHECK: vcvtpd2uqq {rd-sae}, %zmm21, %zmm22
+// CHECK: encoding: [0x62,0xa1,0xfd,0x38,0x79,0xf5]
+ vcvtpd2uqq {rd-sae}, %zmm21, %zmm22
+
+// CHECK: vcvtpd2uqq {rz-sae}, %zmm21, %zmm22
+// CHECK: encoding: [0x62,0xa1,0xfd,0x78,0x79,0xf5]
+ vcvtpd2uqq {rz-sae}, %zmm21, %zmm22
+
+// CHECK: vcvtpd2uqq (%rcx), %zmm22
+// CHECK: encoding: [0x62,0xe1,0xfd,0x48,0x79,0x31]
+ vcvtpd2uqq (%rcx), %zmm22
+
+// CHECK: vcvtpd2uqq 291(%rax,%r14,8), %zmm22
+// CHECK: encoding: [0x62,0xa1,0xfd,0x48,0x79,0xb4,0xf0,0x23,0x01,0x00,0x00]
+ vcvtpd2uqq 291(%rax,%r14,8), %zmm22
+
+// CHECK: vcvtpd2uqq (%rcx){1to8}, %zmm22
+// CHECK: encoding: [0x62,0xe1,0xfd,0x58,0x79,0x31]
+ vcvtpd2uqq (%rcx){1to8}, %zmm22
+
+// CHECK: vcvtpd2uqq 8128(%rdx), %zmm22
+// CHECK: encoding: [0x62,0xe1,0xfd,0x48,0x79,0x72,0x7f]
+ vcvtpd2uqq 8128(%rdx), %zmm22
+
+// CHECK: vcvtpd2uqq 8192(%rdx), %zmm22
+// CHECK: encoding: [0x62,0xe1,0xfd,0x48,0x79,0xb2,0x00,0x20,0x00,0x00]
+ vcvtpd2uqq 8192(%rdx), %zmm22
+
+// CHECK: vcvtpd2uqq -8192(%rdx), %zmm22
+// CHECK: encoding: [0x62,0xe1,0xfd,0x48,0x79,0x72,0x80]
+ vcvtpd2uqq -8192(%rdx), %zmm22
+
+// CHECK: vcvtpd2uqq -8256(%rdx), %zmm22
+// CHECK: encoding: [0x62,0xe1,0xfd,0x48,0x79,0xb2,0xc0,0xdf,0xff,0xff]
+ vcvtpd2uqq -8256(%rdx), %zmm22
+
+// CHECK: vcvtpd2uqq 1016(%rdx){1to8}, %zmm22
+// CHECK: encoding: [0x62,0xe1,0xfd,0x58,0x79,0x72,0x7f]
+ vcvtpd2uqq 1016(%rdx){1to8}, %zmm22
+
+// CHECK: vcvtpd2uqq 1024(%rdx){1to8}, %zmm22
+// CHECK: encoding: [0x62,0xe1,0xfd,0x58,0x79,0xb2,0x00,0x04,0x00,0x00]
+ vcvtpd2uqq 1024(%rdx){1to8}, %zmm22
+
+// CHECK: vcvtpd2uqq -1024(%rdx){1to8}, %zmm22
+// CHECK: encoding: [0x62,0xe1,0xfd,0x58,0x79,0x72,0x80]
+ vcvtpd2uqq -1024(%rdx){1to8}, %zmm22
+
+// CHECK: vcvtpd2uqq -1032(%rdx){1to8}, %zmm22
+// CHECK: encoding: [0x62,0xe1,0xfd,0x58,0x79,0xb2,0xf8,0xfb,0xff,0xff]
+ vcvtpd2uqq -1032(%rdx){1to8}, %zmm22
+
+// CHECK: vcvtps2qq %ymm18, %zmm20
+// CHECK: encoding: [0x62,0xa1,0x7d,0x48,0x7b,0xe2]
+ vcvtps2qq %ymm18, %zmm20
+
+// CHECK: vcvtps2qq %ymm18, %zmm20 {%k2}
+// CHECK: encoding: [0x62,0xa1,0x7d,0x4a,0x7b,0xe2]
+ vcvtps2qq %ymm18, %zmm20 {%k2}
+
+// CHECK: vcvtps2qq %ymm18, %zmm20 {%k2} {z}
+// CHECK: encoding: [0x62,0xa1,0x7d,0xca,0x7b,0xe2]
+ vcvtps2qq %ymm18, %zmm20 {%k2} {z}
+
+// CHECK: vcvtps2qq {rn-sae}, %ymm18, %zmm20
+// CHECK: encoding: [0x62,0xa1,0x7d,0x18,0x7b,0xe2]
+ vcvtps2qq {rn-sae}, %ymm18, %zmm20
+
+// CHECK: vcvtps2qq {ru-sae}, %ymm18, %zmm20
+// CHECK: encoding: [0x62,0xa1,0x7d,0x58,0x7b,0xe2]
+ vcvtps2qq {ru-sae}, %ymm18, %zmm20
+
+// CHECK: vcvtps2qq {rd-sae}, %ymm18, %zmm20
+// CHECK: encoding: [0x62,0xa1,0x7d,0x38,0x7b,0xe2]
+ vcvtps2qq {rd-sae}, %ymm18, %zmm20
+
+// CHECK: vcvtps2qq {rz-sae}, %ymm18, %zmm20
+// CHECK: encoding: [0x62,0xa1,0x7d,0x78,0x7b,0xe2]
+ vcvtps2qq {rz-sae}, %ymm18, %zmm20
+
+// CHECK: vcvtps2qq (%rcx), %zmm20
+// CHECK: encoding: [0x62,0xe1,0x7d,0x48,0x7b,0x21]
+ vcvtps2qq (%rcx), %zmm20
+
+// CHECK: vcvtps2qq 291(%rax,%r14,8), %zmm20
+// CHECK: encoding: [0x62,0xa1,0x7d,0x48,0x7b,0xa4,0xf0,0x23,0x01,0x00,0x00]
+ vcvtps2qq 291(%rax,%r14,8), %zmm20
+
+// CHECK: vcvtps2qq (%rcx){1to8}, %zmm20
+// CHECK: encoding: [0x62,0xe1,0x7d,0x58,0x7b,0x21]
+ vcvtps2qq (%rcx){1to8}, %zmm20
+
+// CHECK: vcvtps2qq 4064(%rdx), %zmm20
+// CHECK: encoding: [0x62,0xe1,0x7d,0x48,0x7b,0x62,0x7f]
+ vcvtps2qq 4064(%rdx), %zmm20
+
+// CHECK: vcvtps2qq 4096(%rdx), %zmm20
+// CHECK: encoding: [0x62,0xe1,0x7d,0x48,0x7b,0xa2,0x00,0x10,0x00,0x00]
+ vcvtps2qq 4096(%rdx), %zmm20
+
+// CHECK: vcvtps2qq -4096(%rdx), %zmm20
+// CHECK: encoding: [0x62,0xe1,0x7d,0x48,0x7b,0x62,0x80]
+ vcvtps2qq -4096(%rdx), %zmm20
+
+// CHECK: vcvtps2qq -4128(%rdx), %zmm20
+// CHECK: encoding: [0x62,0xe1,0x7d,0x48,0x7b,0xa2,0xe0,0xef,0xff,0xff]
+ vcvtps2qq -4128(%rdx), %zmm20
+
+// CHECK: vcvtps2qq 508(%rdx){1to8}, %zmm20
+// CHECK: encoding: [0x62,0xe1,0x7d,0x58,0x7b,0x62,0x7f]
+ vcvtps2qq 508(%rdx){1to8}, %zmm20
+
+// CHECK: vcvtps2qq 512(%rdx){1to8}, %zmm20
+// CHECK: encoding: [0x62,0xe1,0x7d,0x58,0x7b,0xa2,0x00,0x02,0x00,0x00]
+ vcvtps2qq 512(%rdx){1to8}, %zmm20
+
+// CHECK: vcvtps2qq -512(%rdx){1to8}, %zmm20
+// CHECK: encoding: [0x62,0xe1,0x7d,0x58,0x7b,0x62,0x80]
+ vcvtps2qq -512(%rdx){1to8}, %zmm20
+
+// CHECK: vcvtps2qq -516(%rdx){1to8}, %zmm20
+// CHECK: encoding: [0x62,0xe1,0x7d,0x58,0x7b,0xa2,0xfc,0xfd,0xff,0xff]
+ vcvtps2qq -516(%rdx){1to8}, %zmm20
+
+// CHECK: vcvtps2uqq %ymm27, %zmm25
+// CHECK: encoding: [0x62,0x01,0x7d,0x48,0x79,0xcb]
+ vcvtps2uqq %ymm27, %zmm25
+
+// CHECK: vcvtps2uqq %ymm27, %zmm25 {%k5}
+// CHECK: encoding: [0x62,0x01,0x7d,0x4d,0x79,0xcb]
+ vcvtps2uqq %ymm27, %zmm25 {%k5}
+
+// CHECK: vcvtps2uqq %ymm27, %zmm25 {%k5} {z}
+// CHECK: encoding: [0x62,0x01,0x7d,0xcd,0x79,0xcb]
+ vcvtps2uqq %ymm27, %zmm25 {%k5} {z}
+
+// CHECK: vcvtps2uqq {rn-sae}, %ymm27, %zmm25
+// CHECK: encoding: [0x62,0x01,0x7d,0x18,0x79,0xcb]
+ vcvtps2uqq {rn-sae}, %ymm27, %zmm25
+
+// CHECK: vcvtps2uqq {ru-sae}, %ymm27, %zmm25
+// CHECK: encoding: [0x62,0x01,0x7d,0x58,0x79,0xcb]
+ vcvtps2uqq {ru-sae}, %ymm27, %zmm25
+
+// CHECK: vcvtps2uqq {rd-sae}, %ymm27, %zmm25
+// CHECK: encoding: [0x62,0x01,0x7d,0x38,0x79,0xcb]
+ vcvtps2uqq {rd-sae}, %ymm27, %zmm25
+
+// CHECK: vcvtps2uqq {rz-sae}, %ymm27, %zmm25
+// CHECK: encoding: [0x62,0x01,0x7d,0x78,0x79,0xcb]
+ vcvtps2uqq {rz-sae}, %ymm27, %zmm25
+
+// CHECK: vcvtps2uqq (%rcx), %zmm25
+// CHECK: encoding: [0x62,0x61,0x7d,0x48,0x79,0x09]
+ vcvtps2uqq (%rcx), %zmm25
+
+// CHECK: vcvtps2uqq 291(%rax,%r14,8), %zmm25
+// CHECK: encoding: [0x62,0x21,0x7d,0x48,0x79,0x8c,0xf0,0x23,0x01,0x00,0x00]
+ vcvtps2uqq 291(%rax,%r14,8), %zmm25
+
+// CHECK: vcvtps2uqq (%rcx){1to8}, %zmm25
+// CHECK: encoding: [0x62,0x61,0x7d,0x58,0x79,0x09]
+ vcvtps2uqq (%rcx){1to8}, %zmm25
+
+// CHECK: vcvtps2uqq 4064(%rdx), %zmm25
+// CHECK: encoding: [0x62,0x61,0x7d,0x48,0x79,0x4a,0x7f]
+ vcvtps2uqq 4064(%rdx), %zmm25
+
+// CHECK: vcvtps2uqq 4096(%rdx), %zmm25
+// CHECK: encoding: [0x62,0x61,0x7d,0x48,0x79,0x8a,0x00,0x10,0x00,0x00]
+ vcvtps2uqq 4096(%rdx), %zmm25
+
+// CHECK: vcvtps2uqq -4096(%rdx), %zmm25
+// CHECK: encoding: [0x62,0x61,0x7d,0x48,0x79,0x4a,0x80]
+ vcvtps2uqq -4096(%rdx), %zmm25
+
+// CHECK: vcvtps2uqq -4128(%rdx), %zmm25
+// CHECK: encoding: [0x62,0x61,0x7d,0x48,0x79,0x8a,0xe0,0xef,0xff,0xff]
+ vcvtps2uqq -4128(%rdx), %zmm25
+
+// CHECK: vcvtps2uqq 508(%rdx){1to8}, %zmm25
+// CHECK: encoding: [0x62,0x61,0x7d,0x58,0x79,0x4a,0x7f]
+ vcvtps2uqq 508(%rdx){1to8}, %zmm25
+
+// CHECK: vcvtps2uqq 512(%rdx){1to8}, %zmm25
+// CHECK: encoding: [0x62,0x61,0x7d,0x58,0x79,0x8a,0x00,0x02,0x00,0x00]
+ vcvtps2uqq 512(%rdx){1to8}, %zmm25
+
+// CHECK: vcvtps2uqq -512(%rdx){1to8}, %zmm25
+// CHECK: encoding: [0x62,0x61,0x7d,0x58,0x79,0x4a,0x80]
+ vcvtps2uqq -512(%rdx){1to8}, %zmm25
+
+// CHECK: vcvtps2uqq -516(%rdx){1to8}, %zmm25
+// CHECK: encoding: [0x62,0x61,0x7d,0x58,0x79,0x8a,0xfc,0xfd,0xff,0xff]
+ vcvtps2uqq -516(%rdx){1to8}, %zmm25
+
+// CHECK: vcvtqq2pd %zmm25, %zmm17
+// CHECK: encoding: [0x62,0x81,0xfe,0x48,0xe6,0xc9]
+ vcvtqq2pd %zmm25, %zmm17
+
+// CHECK: vcvtqq2pd %zmm25, %zmm17 {%k4}
+// CHECK: encoding: [0x62,0x81,0xfe,0x4c,0xe6,0xc9]
+ vcvtqq2pd %zmm25, %zmm17 {%k4}
+
+// CHECK: vcvtqq2pd %zmm25, %zmm17 {%k4} {z}
+// CHECK: encoding: [0x62,0x81,0xfe,0xcc,0xe6,0xc9]
+ vcvtqq2pd %zmm25, %zmm17 {%k4} {z}
+
+// CHECK: vcvtqq2pd {rn-sae}, %zmm25, %zmm17
+// CHECK: encoding: [0x62,0x81,0xfe,0x18,0xe6,0xc9]
+ vcvtqq2pd {rn-sae}, %zmm25, %zmm17
+
+// CHECK: vcvtqq2pd {ru-sae}, %zmm25, %zmm17
+// CHECK: encoding: [0x62,0x81,0xfe,0x58,0xe6,0xc9]
+ vcvtqq2pd {ru-sae}, %zmm25, %zmm17
+
+// CHECK: vcvtqq2pd {rd-sae}, %zmm25, %zmm17
+// CHECK: encoding: [0x62,0x81,0xfe,0x38,0xe6,0xc9]
+ vcvtqq2pd {rd-sae}, %zmm25, %zmm17
+
+// CHECK: vcvtqq2pd {rz-sae}, %zmm25, %zmm17
+// CHECK: encoding: [0x62,0x81,0xfe,0x78,0xe6,0xc9]
+ vcvtqq2pd {rz-sae}, %zmm25, %zmm17
+
+// CHECK: vcvtqq2pd (%rcx), %zmm17
+// CHECK: encoding: [0x62,0xe1,0xfe,0x48,0xe6,0x09]
+ vcvtqq2pd (%rcx), %zmm17
+
+// CHECK: vcvtqq2pd 291(%rax,%r14,8), %zmm17
+// CHECK: encoding: [0x62,0xa1,0xfe,0x48,0xe6,0x8c,0xf0,0x23,0x01,0x00,0x00]
+ vcvtqq2pd 291(%rax,%r14,8), %zmm17
+
+// CHECK: vcvtqq2pd (%rcx){1to8}, %zmm17
+// CHECK: encoding: [0x62,0xe1,0xfe,0x58,0xe6,0x09]
+ vcvtqq2pd (%rcx){1to8}, %zmm17
+
+// CHECK: vcvtqq2pd 8128(%rdx), %zmm17
+// CHECK: encoding: [0x62,0xe1,0xfe,0x48,0xe6,0x4a,0x7f]
+ vcvtqq2pd 8128(%rdx), %zmm17
+
+// CHECK: vcvtqq2pd 8192(%rdx), %zmm17
+// CHECK: encoding: [0x62,0xe1,0xfe,0x48,0xe6,0x8a,0x00,0x20,0x00,0x00]
+ vcvtqq2pd 8192(%rdx), %zmm17
+
+// CHECK: vcvtqq2pd -8192(%rdx), %zmm17
+// CHECK: encoding: [0x62,0xe1,0xfe,0x48,0xe6,0x4a,0x80]
+ vcvtqq2pd -8192(%rdx), %zmm17
+
+// CHECK: vcvtqq2pd -8256(%rdx), %zmm17
+// CHECK: encoding: [0x62,0xe1,0xfe,0x48,0xe6,0x8a,0xc0,0xdf,0xff,0xff]
+ vcvtqq2pd -8256(%rdx), %zmm17
+
+// CHECK: vcvtqq2pd 1016(%rdx){1to8}, %zmm17
+// CHECK: encoding: [0x62,0xe1,0xfe,0x58,0xe6,0x4a,0x7f]
+ vcvtqq2pd 1016(%rdx){1to8}, %zmm17
+
+// CHECK: vcvtqq2pd 1024(%rdx){1to8}, %zmm17
+// CHECK: encoding: [0x62,0xe1,0xfe,0x58,0xe6,0x8a,0x00,0x04,0x00,0x00]
+ vcvtqq2pd 1024(%rdx){1to8}, %zmm17
+
+// CHECK: vcvtqq2pd -1024(%rdx){1to8}, %zmm17
+// CHECK: encoding: [0x62,0xe1,0xfe,0x58,0xe6,0x4a,0x80]
+ vcvtqq2pd -1024(%rdx){1to8}, %zmm17
+
+// CHECK: vcvtqq2pd -1032(%rdx){1to8}, %zmm17
+// CHECK: encoding: [0x62,0xe1,0xfe,0x58,0xe6,0x8a,0xf8,0xfb,0xff,0xff]
+ vcvtqq2pd -1032(%rdx){1to8}, %zmm17
+
+// CHECK: vcvtqq2ps %zmm27, %ymm20
+// CHECK: encoding: [0x62,0x81,0xfc,0x48,0x5b,0xe3]
+ vcvtqq2ps %zmm27, %ymm20
+
+// CHECK: vcvtqq2ps %zmm27, %ymm20 {%k5}
+// CHECK: encoding: [0x62,0x81,0xfc,0x4d,0x5b,0xe3]
+ vcvtqq2ps %zmm27, %ymm20 {%k5}
+
+// CHECK: vcvtqq2ps %zmm27, %ymm20 {%k5} {z}
+// CHECK: encoding: [0x62,0x81,0xfc,0xcd,0x5b,0xe3]
+ vcvtqq2ps %zmm27, %ymm20 {%k5} {z}
+
+// CHECK: vcvtqq2ps {rn-sae}, %zmm27, %ymm20
+// CHECK: encoding: [0x62,0x81,0xfc,0x18,0x5b,0xe3]
+ vcvtqq2ps {rn-sae}, %zmm27, %ymm20
+
+// CHECK: vcvtqq2ps {ru-sae}, %zmm27, %ymm20
+// CHECK: encoding: [0x62,0x81,0xfc,0x58,0x5b,0xe3]
+ vcvtqq2ps {ru-sae}, %zmm27, %ymm20
+
+// CHECK: vcvtqq2ps {rd-sae}, %zmm27, %ymm20
+// CHECK: encoding: [0x62,0x81,0xfc,0x38,0x5b,0xe3]
+ vcvtqq2ps {rd-sae}, %zmm27, %ymm20
+
+// CHECK: vcvtqq2ps {rz-sae}, %zmm27, %ymm20
+// CHECK: encoding: [0x62,0x81,0xfc,0x78,0x5b,0xe3]
+ vcvtqq2ps {rz-sae}, %zmm27, %ymm20
+
+// CHECK: vcvtqq2ps (%rcx), %ymm20
+// CHECK: encoding: [0x62,0xe1,0xfc,0x48,0x5b,0x21]
+ vcvtqq2ps (%rcx), %ymm20
+
+// CHECK: vcvtqq2ps 291(%rax,%r14,8), %ymm20
+// CHECK: encoding: [0x62,0xa1,0xfc,0x48,0x5b,0xa4,0xf0,0x23,0x01,0x00,0x00]
+ vcvtqq2ps 291(%rax,%r14,8), %ymm20
+
+// CHECK: vcvtqq2ps (%rcx){1to8}, %ymm20
+// CHECK: encoding: [0x62,0xe1,0xfc,0x58,0x5b,0x21]
+ vcvtqq2ps (%rcx){1to8}, %ymm20
+
+// CHECK: vcvtqq2ps 8128(%rdx), %ymm20
+// CHECK: encoding: [0x62,0xe1,0xfc,0x48,0x5b,0x62,0x7f]
+ vcvtqq2ps 8128(%rdx), %ymm20
+
+// CHECK: vcvtqq2ps 8192(%rdx), %ymm20
+// CHECK: encoding: [0x62,0xe1,0xfc,0x48,0x5b,0xa2,0x00,0x20,0x00,0x00]
+ vcvtqq2ps 8192(%rdx), %ymm20
+
+// CHECK: vcvtqq2ps -8192(%rdx), %ymm20
+// CHECK: encoding: [0x62,0xe1,0xfc,0x48,0x5b,0x62,0x80]
+ vcvtqq2ps -8192(%rdx), %ymm20
+
+// CHECK: vcvtqq2ps -8256(%rdx), %ymm20
+// CHECK: encoding: [0x62,0xe1,0xfc,0x48,0x5b,0xa2,0xc0,0xdf,0xff,0xff]
+ vcvtqq2ps -8256(%rdx), %ymm20
+
+// CHECK: vcvtqq2ps 1016(%rdx){1to8}, %ymm20
+// CHECK: encoding: [0x62,0xe1,0xfc,0x58,0x5b,0x62,0x7f]
+ vcvtqq2ps 1016(%rdx){1to8}, %ymm20
+
+// CHECK: vcvtqq2ps 1024(%rdx){1to8}, %ymm20
+// CHECK: encoding: [0x62,0xe1,0xfc,0x58,0x5b,0xa2,0x00,0x04,0x00,0x00]
+ vcvtqq2ps 1024(%rdx){1to8}, %ymm20
+
+// CHECK: vcvtqq2ps -1024(%rdx){1to8}, %ymm20
+// CHECK: encoding: [0x62,0xe1,0xfc,0x58,0x5b,0x62,0x80]
+ vcvtqq2ps -1024(%rdx){1to8}, %ymm20
+
+// CHECK: vcvtqq2ps -1032(%rdx){1to8}, %ymm20
+// CHECK: encoding: [0x62,0xe1,0xfc,0x58,0x5b,0xa2,0xf8,0xfb,0xff,0xff]
+ vcvtqq2ps -1032(%rdx){1to8}, %ymm20
+
+// CHECK: vcvtuqq2pd %zmm29, %zmm21
+// CHECK: encoding: [0x62,0x81,0xfe,0x48,0x7a,0xed]
+ vcvtuqq2pd %zmm29, %zmm21
+
+// CHECK: vcvtuqq2pd %zmm29, %zmm21 {%k6}
+// CHECK: encoding: [0x62,0x81,0xfe,0x4e,0x7a,0xed]
+ vcvtuqq2pd %zmm29, %zmm21 {%k6}
+
+// CHECK: vcvtuqq2pd %zmm29, %zmm21 {%k6} {z}
+// CHECK: encoding: [0x62,0x81,0xfe,0xce,0x7a,0xed]
+ vcvtuqq2pd %zmm29, %zmm21 {%k6} {z}
+
+// CHECK: vcvtuqq2pd {rn-sae}, %zmm29, %zmm21
+// CHECK: encoding: [0x62,0x81,0xfe,0x18,0x7a,0xed]
+ vcvtuqq2pd {rn-sae}, %zmm29, %zmm21
+
+// CHECK: vcvtuqq2pd {ru-sae}, %zmm29, %zmm21
+// CHECK: encoding: [0x62,0x81,0xfe,0x58,0x7a,0xed]
+ vcvtuqq2pd {ru-sae}, %zmm29, %zmm21
+
+// CHECK: vcvtuqq2pd {rd-sae}, %zmm29, %zmm21
+// CHECK: encoding: [0x62,0x81,0xfe,0x38,0x7a,0xed]
+ vcvtuqq2pd {rd-sae}, %zmm29, %zmm21
+
+// CHECK: vcvtuqq2pd {rz-sae}, %zmm29, %zmm21
+// CHECK: encoding: [0x62,0x81,0xfe,0x78,0x7a,0xed]
+ vcvtuqq2pd {rz-sae}, %zmm29, %zmm21
+
+// CHECK: vcvtuqq2pd (%rcx), %zmm21
+// CHECK: encoding: [0x62,0xe1,0xfe,0x48,0x7a,0x29]
+ vcvtuqq2pd (%rcx), %zmm21
+
+// CHECK: vcvtuqq2pd 291(%rax,%r14,8), %zmm21
+// CHECK: encoding: [0x62,0xa1,0xfe,0x48,0x7a,0xac,0xf0,0x23,0x01,0x00,0x00]
+ vcvtuqq2pd 291(%rax,%r14,8), %zmm21
+
+// CHECK: vcvtuqq2pd (%rcx){1to8}, %zmm21
+// CHECK: encoding: [0x62,0xe1,0xfe,0x58,0x7a,0x29]
+ vcvtuqq2pd (%rcx){1to8}, %zmm21
+
+// CHECK: vcvtuqq2pd 8128(%rdx), %zmm21
+// CHECK: encoding: [0x62,0xe1,0xfe,0x48,0x7a,0x6a,0x7f]
+ vcvtuqq2pd 8128(%rdx), %zmm21
+
+// CHECK: vcvtuqq2pd 8192(%rdx), %zmm21
+// CHECK: encoding: [0x62,0xe1,0xfe,0x48,0x7a,0xaa,0x00,0x20,0x00,0x00]
+ vcvtuqq2pd 8192(%rdx), %zmm21
+
+// CHECK: vcvtuqq2pd -8192(%rdx), %zmm21
+// CHECK: encoding: [0x62,0xe1,0xfe,0x48,0x7a,0x6a,0x80]
+ vcvtuqq2pd -8192(%rdx), %zmm21
+
+// CHECK: vcvtuqq2pd -8256(%rdx), %zmm21
+// CHECK: encoding: [0x62,0xe1,0xfe,0x48,0x7a,0xaa,0xc0,0xdf,0xff,0xff]
+ vcvtuqq2pd -8256(%rdx), %zmm21
+
+// CHECK: vcvtuqq2pd 1016(%rdx){1to8}, %zmm21
+// CHECK: encoding: [0x62,0xe1,0xfe,0x58,0x7a,0x6a,0x7f]
+ vcvtuqq2pd 1016(%rdx){1to8}, %zmm21
+
+// CHECK: vcvtuqq2pd 1024(%rdx){1to8}, %zmm21
+// CHECK: encoding: [0x62,0xe1,0xfe,0x58,0x7a,0xaa,0x00,0x04,0x00,0x00]
+ vcvtuqq2pd 1024(%rdx){1to8}, %zmm21
+
+// CHECK: vcvtuqq2pd -1024(%rdx){1to8}, %zmm21
+// CHECK: encoding: [0x62,0xe1,0xfe,0x58,0x7a,0x6a,0x80]
+ vcvtuqq2pd -1024(%rdx){1to8}, %zmm21
+
+// CHECK: vcvtuqq2pd -1032(%rdx){1to8}, %zmm21
+// CHECK: encoding: [0x62,0xe1,0xfe,0x58,0x7a,0xaa,0xf8,0xfb,0xff,0xff]
+ vcvtuqq2pd -1032(%rdx){1to8}, %zmm21
+
+// CHECK: vcvtuqq2ps %zmm21, %ymm18
+// CHECK: encoding: [0x62,0xa1,0xff,0x48,0x7a,0xd5]
+ vcvtuqq2ps %zmm21, %ymm18
+
+// CHECK: vcvtuqq2ps %zmm21, %ymm18 {%k2}
+// CHECK: encoding: [0x62,0xa1,0xff,0x4a,0x7a,0xd5]
+ vcvtuqq2ps %zmm21, %ymm18 {%k2}
+
+// CHECK: vcvtuqq2ps %zmm21, %ymm18 {%k2} {z}
+// CHECK: encoding: [0x62,0xa1,0xff,0xca,0x7a,0xd5]
+ vcvtuqq2ps %zmm21, %ymm18 {%k2} {z}
+
diff --git a/test/MC/X86/x86-64-avx512dq_vl.s b/test/MC/X86/x86-64-avx512dq_vl.s
index d14ae6ec3741..17c37c08335c 100644
--- a/test/MC/X86/x86-64-avx512dq_vl.s
+++ b/test/MC/X86/x86-64-avx512dq_vl.s
@@ -2207,3 +2207,900 @@
// CHECK: vrangeps $123, -516(%rdx){1to8}, %ymm23, %ymm24
// CHECK: encoding: [0x62,0x63,0x45,0x30,0x50,0x82,0xfc,0xfd,0xff,0xff,0x7b]
vrangeps $0x7b,-516(%rdx){1to8}, %ymm23, %ymm24
+
+// CHECK: vcvtpd2qq %xmm22, %xmm24
+// CHECK: encoding: [0x62,0x21,0xfd,0x08,0x7b,0xc6]
+ vcvtpd2qq %xmm22, %xmm24
+
+// CHECK: vcvtpd2qq %xmm22, %xmm24 {%k6}
+// CHECK: encoding: [0x62,0x21,0xfd,0x0e,0x7b,0xc6]
+ vcvtpd2qq %xmm22, %xmm24 {%k6}
+
+// CHECK: vcvtpd2qq %xmm22, %xmm24 {%k6} {z}
+// CHECK: encoding: [0x62,0x21,0xfd,0x8e,0x7b,0xc6]
+ vcvtpd2qq %xmm22, %xmm24 {%k6} {z}
+
+// CHECK: vcvtpd2qq (%rcx), %xmm24
+// CHECK: encoding: [0x62,0x61,0xfd,0x08,0x7b,0x01]
+ vcvtpd2qq (%rcx), %xmm24
+
+// CHECK: vcvtpd2qq 291(%rax,%r14,8), %xmm24
+// CHECK: encoding: [0x62,0x21,0xfd,0x08,0x7b,0x84,0xf0,0x23,0x01,0x00,0x00]
+ vcvtpd2qq 291(%rax,%r14,8), %xmm24
+
+// CHECK: vcvtpd2qq (%rcx){1to2}, %xmm24
+// CHECK: encoding: [0x62,0x61,0xfd,0x18,0x7b,0x01]
+ vcvtpd2qq (%rcx){1to2}, %xmm24
+
+// CHECK: vcvtpd2qq 2032(%rdx), %xmm24
+// CHECK: encoding: [0x62,0x61,0xfd,0x08,0x7b,0x42,0x7f]
+ vcvtpd2qq 2032(%rdx), %xmm24
+
+// CHECK: vcvtpd2qq 2048(%rdx), %xmm24
+// CHECK: encoding: [0x62,0x61,0xfd,0x08,0x7b,0x82,0x00,0x08,0x00,0x00]
+ vcvtpd2qq 2048(%rdx), %xmm24
+
+// CHECK: vcvtpd2qq -2048(%rdx), %xmm24
+// CHECK: encoding: [0x62,0x61,0xfd,0x08,0x7b,0x42,0x80]
+ vcvtpd2qq -2048(%rdx), %xmm24
+
+// CHECK: vcvtpd2qq -2064(%rdx), %xmm24
+// CHECK: encoding: [0x62,0x61,0xfd,0x08,0x7b,0x82,0xf0,0xf7,0xff,0xff]
+ vcvtpd2qq -2064(%rdx), %xmm24
+
+// CHECK: vcvtpd2qq 1016(%rdx){1to2}, %xmm24
+// CHECK: encoding: [0x62,0x61,0xfd,0x18,0x7b,0x42,0x7f]
+ vcvtpd2qq 1016(%rdx){1to2}, %xmm24
+
+// CHECK: vcvtpd2qq 1024(%rdx){1to2}, %xmm24
+// CHECK: encoding: [0x62,0x61,0xfd,0x18,0x7b,0x82,0x00,0x04,0x00,0x00]
+ vcvtpd2qq 1024(%rdx){1to2}, %xmm24
+
+// CHECK: vcvtpd2qq -1024(%rdx){1to2}, %xmm24
+// CHECK: encoding: [0x62,0x61,0xfd,0x18,0x7b,0x42,0x80]
+ vcvtpd2qq -1024(%rdx){1to2}, %xmm24
+
+// CHECK: vcvtpd2qq -1032(%rdx){1to2}, %xmm24
+// CHECK: encoding: [0x62,0x61,0xfd,0x18,0x7b,0x82,0xf8,0xfb,0xff,0xff]
+ vcvtpd2qq -1032(%rdx){1to2}, %xmm24
+
+// CHECK: vcvtpd2qq %ymm27, %ymm24
+// CHECK: encoding: [0x62,0x01,0xfd,0x28,0x7b,0xc3]
+ vcvtpd2qq %ymm27, %ymm24
+
+// CHECK: vcvtpd2qq %ymm27, %ymm24 {%k7}
+// CHECK: encoding: [0x62,0x01,0xfd,0x2f,0x7b,0xc3]
+ vcvtpd2qq %ymm27, %ymm24 {%k7}
+
+// CHECK: vcvtpd2qq %ymm27, %ymm24 {%k7} {z}
+// CHECK: encoding: [0x62,0x01,0xfd,0xaf,0x7b,0xc3]
+ vcvtpd2qq %ymm27, %ymm24 {%k7} {z}
+
+// CHECK: vcvtpd2qq (%rcx), %ymm24
+// CHECK: encoding: [0x62,0x61,0xfd,0x28,0x7b,0x01]
+ vcvtpd2qq (%rcx), %ymm24
+
+// CHECK: vcvtpd2qq 291(%rax,%r14,8), %ymm24
+// CHECK: encoding: [0x62,0x21,0xfd,0x28,0x7b,0x84,0xf0,0x23,0x01,0x00,0x00]
+ vcvtpd2qq 291(%rax,%r14,8), %ymm24
+
+// CHECK: vcvtpd2qq (%rcx){1to4}, %ymm24
+// CHECK: encoding: [0x62,0x61,0xfd,0x38,0x7b,0x01]
+ vcvtpd2qq (%rcx){1to4}, %ymm24
+
+// CHECK: vcvtpd2qq 4064(%rdx), %ymm24
+// CHECK: encoding: [0x62,0x61,0xfd,0x28,0x7b,0x42,0x7f]
+ vcvtpd2qq 4064(%rdx), %ymm24
+
+// CHECK: vcvtpd2qq 4096(%rdx), %ymm24
+// CHECK: encoding: [0x62,0x61,0xfd,0x28,0x7b,0x82,0x00,0x10,0x00,0x00]
+ vcvtpd2qq 4096(%rdx), %ymm24
+
+// CHECK: vcvtpd2qq -4096(%rdx), %ymm24
+// CHECK: encoding: [0x62,0x61,0xfd,0x28,0x7b,0x42,0x80]
+ vcvtpd2qq -4096(%rdx), %ymm24
+
+// CHECK: vcvtpd2qq -4128(%rdx), %ymm24
+// CHECK: encoding: [0x62,0x61,0xfd,0x28,0x7b,0x82,0xe0,0xef,0xff,0xff]
+ vcvtpd2qq -4128(%rdx), %ymm24
+
+// CHECK: vcvtpd2qq 1016(%rdx){1to4}, %ymm24
+// CHECK: encoding: [0x62,0x61,0xfd,0x38,0x7b,0x42,0x7f]
+ vcvtpd2qq 1016(%rdx){1to4}, %ymm24
+
+// CHECK: vcvtpd2qq 1024(%rdx){1to4}, %ymm24
+// CHECK: encoding: [0x62,0x61,0xfd,0x38,0x7b,0x82,0x00,0x04,0x00,0x00]
+ vcvtpd2qq 1024(%rdx){1to4}, %ymm24
+
+// CHECK: vcvtpd2qq -1024(%rdx){1to4}, %ymm24
+// CHECK: encoding: [0x62,0x61,0xfd,0x38,0x7b,0x42,0x80]
+ vcvtpd2qq -1024(%rdx){1to4}, %ymm24
+
+// CHECK: vcvtpd2qq -1032(%rdx){1to4}, %ymm24
+// CHECK: encoding: [0x62,0x61,0xfd,0x38,0x7b,0x82,0xf8,0xfb,0xff,0xff]
+ vcvtpd2qq -1032(%rdx){1to4}, %ymm24
+
+// CHECK: vcvtpd2uqq %xmm20, %xmm22
+// CHECK: encoding: [0x62,0xa1,0xfd,0x08,0x79,0xf4]
+ vcvtpd2uqq %xmm20, %xmm22
+
+// CHECK: vcvtpd2uqq %xmm20, %xmm22 {%k3}
+// CHECK: encoding: [0x62,0xa1,0xfd,0x0b,0x79,0xf4]
+ vcvtpd2uqq %xmm20, %xmm22 {%k3}
+
+// CHECK: vcvtpd2uqq %xmm20, %xmm22 {%k3} {z}
+// CHECK: encoding: [0x62,0xa1,0xfd,0x8b,0x79,0xf4]
+ vcvtpd2uqq %xmm20, %xmm22 {%k3} {z}
+
+// CHECK: vcvtpd2uqq (%rcx), %xmm22
+// CHECK: encoding: [0x62,0xe1,0xfd,0x08,0x79,0x31]
+ vcvtpd2uqq (%rcx), %xmm22
+
+// CHECK: vcvtpd2uqq 291(%rax,%r14,8), %xmm22
+// CHECK: encoding: [0x62,0xa1,0xfd,0x08,0x79,0xb4,0xf0,0x23,0x01,0x00,0x00]
+ vcvtpd2uqq 291(%rax,%r14,8), %xmm22
+
+// CHECK: vcvtpd2uqq (%rcx){1to2}, %xmm22
+// CHECK: encoding: [0x62,0xe1,0xfd,0x18,0x79,0x31]
+ vcvtpd2uqq (%rcx){1to2}, %xmm22
+
+// CHECK: vcvtpd2uqq 2032(%rdx), %xmm22
+// CHECK: encoding: [0x62,0xe1,0xfd,0x08,0x79,0x72,0x7f]
+ vcvtpd2uqq 2032(%rdx), %xmm22
+
+// CHECK: vcvtpd2uqq 2048(%rdx), %xmm22
+// CHECK: encoding: [0x62,0xe1,0xfd,0x08,0x79,0xb2,0x00,0x08,0x00,0x00]
+ vcvtpd2uqq 2048(%rdx), %xmm22
+
+// CHECK: vcvtpd2uqq -2048(%rdx), %xmm22
+// CHECK: encoding: [0x62,0xe1,0xfd,0x08,0x79,0x72,0x80]
+ vcvtpd2uqq -2048(%rdx), %xmm22
+
+// CHECK: vcvtpd2uqq -2064(%rdx), %xmm22
+// CHECK: encoding: [0x62,0xe1,0xfd,0x08,0x79,0xb2,0xf0,0xf7,0xff,0xff]
+ vcvtpd2uqq -2064(%rdx), %xmm22
+
+// CHECK: vcvtpd2uqq 1016(%rdx){1to2}, %xmm22
+// CHECK: encoding: [0x62,0xe1,0xfd,0x18,0x79,0x72,0x7f]
+ vcvtpd2uqq 1016(%rdx){1to2}, %xmm22
+
+// CHECK: vcvtpd2uqq 1024(%rdx){1to2}, %xmm22
+// CHECK: encoding: [0x62,0xe1,0xfd,0x18,0x79,0xb2,0x00,0x04,0x00,0x00]
+ vcvtpd2uqq 1024(%rdx){1to2}, %xmm22
+
+// CHECK: vcvtpd2uqq -1024(%rdx){1to2}, %xmm22
+// CHECK: encoding: [0x62,0xe1,0xfd,0x18,0x79,0x72,0x80]
+ vcvtpd2uqq -1024(%rdx){1to2}, %xmm22
+
+// CHECK: vcvtpd2uqq -1032(%rdx){1to2}, %xmm22
+// CHECK: encoding: [0x62,0xe1,0xfd,0x18,0x79,0xb2,0xf8,0xfb,0xff,0xff]
+ vcvtpd2uqq -1032(%rdx){1to2}, %xmm22
+
+// CHECK: vcvtpd2uqq %ymm24, %ymm21
+// CHECK: encoding: [0x62,0x81,0xfd,0x28,0x79,0xe8]
+ vcvtpd2uqq %ymm24, %ymm21
+
+// CHECK: vcvtpd2uqq %ymm24, %ymm21 {%k6}
+// CHECK: encoding: [0x62,0x81,0xfd,0x2e,0x79,0xe8]
+ vcvtpd2uqq %ymm24, %ymm21 {%k6}
+
+// CHECK: vcvtpd2uqq %ymm24, %ymm21 {%k6} {z}
+// CHECK: encoding: [0x62,0x81,0xfd,0xae,0x79,0xe8]
+ vcvtpd2uqq %ymm24, %ymm21 {%k6} {z}
+
+// CHECK: vcvtpd2uqq (%rcx), %ymm21
+// CHECK: encoding: [0x62,0xe1,0xfd,0x28,0x79,0x29]
+ vcvtpd2uqq (%rcx), %ymm21
+
+// CHECK: vcvtpd2uqq 291(%rax,%r14,8), %ymm21
+// CHECK: encoding: [0x62,0xa1,0xfd,0x28,0x79,0xac,0xf0,0x23,0x01,0x00,0x00]
+ vcvtpd2uqq 291(%rax,%r14,8), %ymm21
+
+// CHECK: vcvtpd2uqq (%rcx){1to4}, %ymm21
+// CHECK: encoding: [0x62,0xe1,0xfd,0x38,0x79,0x29]
+ vcvtpd2uqq (%rcx){1to4}, %ymm21
+
+// CHECK: vcvtpd2uqq 4064(%rdx), %ymm21
+// CHECK: encoding: [0x62,0xe1,0xfd,0x28,0x79,0x6a,0x7f]
+ vcvtpd2uqq 4064(%rdx), %ymm21
+
+// CHECK: vcvtpd2uqq 4096(%rdx), %ymm21
+// CHECK: encoding: [0x62,0xe1,0xfd,0x28,0x79,0xaa,0x00,0x10,0x00,0x00]
+ vcvtpd2uqq 4096(%rdx), %ymm21
+
+// CHECK: vcvtpd2uqq -4096(%rdx), %ymm21
+// CHECK: encoding: [0x62,0xe1,0xfd,0x28,0x79,0x6a,0x80]
+ vcvtpd2uqq -4096(%rdx), %ymm21
+
+// CHECK: vcvtpd2uqq -4128(%rdx), %ymm21
+// CHECK: encoding: [0x62,0xe1,0xfd,0x28,0x79,0xaa,0xe0,0xef,0xff,0xff]
+ vcvtpd2uqq -4128(%rdx), %ymm21
+
+// CHECK: vcvtpd2uqq 1016(%rdx){1to4}, %ymm21
+// CHECK: encoding: [0x62,0xe1,0xfd,0x38,0x79,0x6a,0x7f]
+ vcvtpd2uqq 1016(%rdx){1to4}, %ymm21
+
+// CHECK: vcvtpd2uqq 1024(%rdx){1to4}, %ymm21
+// CHECK: encoding: [0x62,0xe1,0xfd,0x38,0x79,0xaa,0x00,0x04,0x00,0x00]
+ vcvtpd2uqq 1024(%rdx){1to4}, %ymm21
+
+// CHECK: vcvtpd2uqq -1024(%rdx){1to4}, %ymm21
+// CHECK: encoding: [0x62,0xe1,0xfd,0x38,0x79,0x6a,0x80]
+ vcvtpd2uqq -1024(%rdx){1to4}, %ymm21
+
+// CHECK: vcvtpd2uqq -1032(%rdx){1to4}, %ymm21
+// CHECK: encoding: [0x62,0xe1,0xfd,0x38,0x79,0xaa,0xf8,0xfb,0xff,0xff]
+ vcvtpd2uqq -1032(%rdx){1to4}, %ymm21
+
+// CHECK: vcvtps2qq %xmm28, %xmm17
+// CHECK: encoding: [0x62,0x81,0x7d,0x08,0x7b,0xcc]
+ vcvtps2qq %xmm28, %xmm17
+
+// CHECK: vcvtps2qq %xmm28, %xmm17 {%k4}
+// CHECK: encoding: [0x62,0x81,0x7d,0x0c,0x7b,0xcc]
+ vcvtps2qq %xmm28, %xmm17 {%k4}
+
+// CHECK: vcvtps2qq %xmm28, %xmm17 {%k4} {z}
+// CHECK: encoding: [0x62,0x81,0x7d,0x8c,0x7b,0xcc]
+ vcvtps2qq %xmm28, %xmm17 {%k4} {z}
+
+// CHECK: vcvtps2qq (%rcx), %xmm17
+// CHECK: encoding: [0x62,0xe1,0x7d,0x08,0x7b,0x09]
+ vcvtps2qq (%rcx), %xmm17
+
+// CHECK: vcvtps2qq 291(%rax,%r14,8), %xmm17
+// CHECK: encoding: [0x62,0xa1,0x7d,0x08,0x7b,0x8c,0xf0,0x23,0x01,0x00,0x00]
+ vcvtps2qq 291(%rax,%r14,8), %xmm17
+
+// CHECK: vcvtps2qq (%rcx){1to2}, %xmm17
+// CHECK: encoding: [0x62,0xe1,0x7d,0x18,0x7b,0x09]
+ vcvtps2qq (%rcx){1to2}, %xmm17
+
+// CHECK: vcvtps2qq 1016(%rdx), %xmm17
+// CHECK: encoding: [0x62,0xe1,0x7d,0x08,0x7b,0x4a,0x7f]
+ vcvtps2qq 1016(%rdx), %xmm17
+
+// CHECK: vcvtps2qq 1024(%rdx), %xmm17
+// CHECK: encoding: [0x62,0xe1,0x7d,0x08,0x7b,0x8a,0x00,0x04,0x00,0x00]
+ vcvtps2qq 1024(%rdx), %xmm17
+
+// CHECK: vcvtps2qq -1024(%rdx), %xmm17
+// CHECK: encoding: [0x62,0xe1,0x7d,0x08,0x7b,0x4a,0x80]
+ vcvtps2qq -1024(%rdx), %xmm17
+
+// CHECK: vcvtps2qq -1032(%rdx), %xmm17
+// CHECK: encoding: [0x62,0xe1,0x7d,0x08,0x7b,0x8a,0xf8,0xfb,0xff,0xff]
+ vcvtps2qq -1032(%rdx), %xmm17
+
+// CHECK: vcvtps2qq 508(%rdx){1to2}, %xmm17
+// CHECK: encoding: [0x62,0xe1,0x7d,0x18,0x7b,0x4a,0x7f]
+ vcvtps2qq 508(%rdx){1to2}, %xmm17
+
+// CHECK: vcvtps2qq 512(%rdx){1to2}, %xmm17
+// CHECK: encoding: [0x62,0xe1,0x7d,0x18,0x7b,0x8a,0x00,0x02,0x00,0x00]
+ vcvtps2qq 512(%rdx){1to2}, %xmm17
+
+// CHECK: vcvtps2qq -512(%rdx){1to2}, %xmm17
+// CHECK: encoding: [0x62,0xe1,0x7d,0x18,0x7b,0x4a,0x80]
+ vcvtps2qq -512(%rdx){1to2}, %xmm17
+
+// CHECK: vcvtps2qq -516(%rdx){1to2}, %xmm17
+// CHECK: encoding: [0x62,0xe1,0x7d,0x18,0x7b,0x8a,0xfc,0xfd,0xff,0xff]
+ vcvtps2qq -516(%rdx){1to2}, %xmm17
+
+// CHECK: vcvtps2qq %xmm27, %ymm25
+// CHECK: encoding: [0x62,0x01,0x7d,0x28,0x7b,0xcb]
+ vcvtps2qq %xmm27, %ymm25
+
+// CHECK: vcvtps2qq %xmm27, %ymm25 {%k7}
+// CHECK: encoding: [0x62,0x01,0x7d,0x2f,0x7b,0xcb]
+ vcvtps2qq %xmm27, %ymm25 {%k7}
+
+// CHECK: vcvtps2qq %xmm27, %ymm25 {%k7} {z}
+// CHECK: encoding: [0x62,0x01,0x7d,0xaf,0x7b,0xcb]
+ vcvtps2qq %xmm27, %ymm25 {%k7} {z}
+
+// CHECK: vcvtps2qq (%rcx), %ymm25
+// CHECK: encoding: [0x62,0x61,0x7d,0x28,0x7b,0x09]
+ vcvtps2qq (%rcx), %ymm25
+
+// CHECK: vcvtps2qq 291(%rax,%r14,8), %ymm25
+// CHECK: encoding: [0x62,0x21,0x7d,0x28,0x7b,0x8c,0xf0,0x23,0x01,0x00,0x00]
+ vcvtps2qq 291(%rax,%r14,8), %ymm25
+
+// CHECK: vcvtps2qq (%rcx){1to4}, %ymm25
+// CHECK: encoding: [0x62,0x61,0x7d,0x38,0x7b,0x09]
+ vcvtps2qq (%rcx){1to4}, %ymm25
+
+// CHECK: vcvtps2qq 2032(%rdx), %ymm25
+// CHECK: encoding: [0x62,0x61,0x7d,0x28,0x7b,0x4a,0x7f]
+ vcvtps2qq 2032(%rdx), %ymm25
+
+// CHECK: vcvtps2qq 2048(%rdx), %ymm25
+// CHECK: encoding: [0x62,0x61,0x7d,0x28,0x7b,0x8a,0x00,0x08,0x00,0x00]
+ vcvtps2qq 2048(%rdx), %ymm25
+
+// CHECK: vcvtps2qq -2048(%rdx), %ymm25
+// CHECK: encoding: [0x62,0x61,0x7d,0x28,0x7b,0x4a,0x80]
+ vcvtps2qq -2048(%rdx), %ymm25
+
+// CHECK: vcvtps2qq -2064(%rdx), %ymm25
+// CHECK: encoding: [0x62,0x61,0x7d,0x28,0x7b,0x8a,0xf0,0xf7,0xff,0xff]
+ vcvtps2qq -2064(%rdx), %ymm25
+
+// CHECK: vcvtps2qq 508(%rdx){1to4}, %ymm25
+// CHECK: encoding: [0x62,0x61,0x7d,0x38,0x7b,0x4a,0x7f]
+ vcvtps2qq 508(%rdx){1to4}, %ymm25
+
+// CHECK: vcvtps2qq 512(%rdx){1to4}, %ymm25
+// CHECK: encoding: [0x62,0x61,0x7d,0x38,0x7b,0x8a,0x00,0x02,0x00,0x00]
+ vcvtps2qq 512(%rdx){1to4}, %ymm25
+
+// CHECK: vcvtps2qq -512(%rdx){1to4}, %ymm25
+// CHECK: encoding: [0x62,0x61,0x7d,0x38,0x7b,0x4a,0x80]
+ vcvtps2qq -512(%rdx){1to4}, %ymm25
+
+// CHECK: vcvtps2qq -516(%rdx){1to4}, %ymm25
+// CHECK: encoding: [0x62,0x61,0x7d,0x38,0x7b,0x8a,0xfc,0xfd,0xff,0xff]
+ vcvtps2qq -516(%rdx){1to4}, %ymm25
+
+// CHECK: vcvtps2uqq %xmm29, %xmm29
+// CHECK: encoding: [0x62,0x01,0x7d,0x08,0x79,0xed]
+ vcvtps2uqq %xmm29, %xmm29
+
+// CHECK: vcvtps2uqq %xmm29, %xmm29 {%k1}
+// CHECK: encoding: [0x62,0x01,0x7d,0x09,0x79,0xed]
+ vcvtps2uqq %xmm29, %xmm29 {%k1}
+
+// CHECK: vcvtps2uqq %xmm29, %xmm29 {%k1} {z}
+// CHECK: encoding: [0x62,0x01,0x7d,0x89,0x79,0xed]
+ vcvtps2uqq %xmm29, %xmm29 {%k1} {z}
+
+// CHECK: vcvtps2uqq (%rcx), %xmm29
+// CHECK: encoding: [0x62,0x61,0x7d,0x08,0x79,0x29]
+ vcvtps2uqq (%rcx), %xmm29
+
+// CHECK: vcvtps2uqq 291(%rax,%r14,8), %xmm29
+// CHECK: encoding: [0x62,0x21,0x7d,0x08,0x79,0xac,0xf0,0x23,0x01,0x00,0x00]
+ vcvtps2uqq 291(%rax,%r14,8), %xmm29
+
+// CHECK: vcvtps2uqq (%rcx){1to2}, %xmm29
+// CHECK: encoding: [0x62,0x61,0x7d,0x18,0x79,0x29]
+ vcvtps2uqq (%rcx){1to2}, %xmm29
+
+// CHECK: vcvtps2uqq 1016(%rdx), %xmm29
+// CHECK: encoding: [0x62,0x61,0x7d,0x08,0x79,0x6a,0x7f]
+ vcvtps2uqq 1016(%rdx), %xmm29
+
+// CHECK: vcvtps2uqq 1024(%rdx), %xmm29
+// CHECK: encoding: [0x62,0x61,0x7d,0x08,0x79,0xaa,0x00,0x04,0x00,0x00]
+ vcvtps2uqq 1024(%rdx), %xmm29
+
+// CHECK: vcvtps2uqq -1024(%rdx), %xmm29
+// CHECK: encoding: [0x62,0x61,0x7d,0x08,0x79,0x6a,0x80]
+ vcvtps2uqq -1024(%rdx), %xmm29
+
+// CHECK: vcvtps2uqq -1032(%rdx), %xmm29
+// CHECK: encoding: [0x62,0x61,0x7d,0x08,0x79,0xaa,0xf8,0xfb,0xff,0xff]
+ vcvtps2uqq -1032(%rdx), %xmm29
+
+// CHECK: vcvtps2uqq 508(%rdx){1to2}, %xmm29
+// CHECK: encoding: [0x62,0x61,0x7d,0x18,0x79,0x6a,0x7f]
+ vcvtps2uqq 508(%rdx){1to2}, %xmm29
+
+// CHECK: vcvtps2uqq 512(%rdx){1to2}, %xmm29
+// CHECK: encoding: [0x62,0x61,0x7d,0x18,0x79,0xaa,0x00,0x02,0x00,0x00]
+ vcvtps2uqq 512(%rdx){1to2}, %xmm29
+
+// CHECK: vcvtps2uqq -512(%rdx){1to2}, %xmm29
+// CHECK: encoding: [0x62,0x61,0x7d,0x18,0x79,0x6a,0x80]
+ vcvtps2uqq -512(%rdx){1to2}, %xmm29
+
+// CHECK: vcvtps2uqq -516(%rdx){1to2}, %xmm29
+// CHECK: encoding: [0x62,0x61,0x7d,0x18,0x79,0xaa,0xfc,0xfd,0xff,0xff]
+ vcvtps2uqq -516(%rdx){1to2}, %xmm29
+
+// CHECK: vcvtps2uqq %xmm19, %ymm23
+// CHECK: encoding: [0x62,0xa1,0x7d,0x28,0x79,0xfb]
+ vcvtps2uqq %xmm19, %ymm23
+
+// CHECK: vcvtps2uqq %xmm19, %ymm23 {%k2}
+// CHECK: encoding: [0x62,0xa1,0x7d,0x2a,0x79,0xfb]
+ vcvtps2uqq %xmm19, %ymm23 {%k2}
+
+// CHECK: vcvtps2uqq %xmm19, %ymm23 {%k2} {z}
+// CHECK: encoding: [0x62,0xa1,0x7d,0xaa,0x79,0xfb]
+ vcvtps2uqq %xmm19, %ymm23 {%k2} {z}
+
+// CHECK: vcvtps2uqq (%rcx), %ymm23
+// CHECK: encoding: [0x62,0xe1,0x7d,0x28,0x79,0x39]
+ vcvtps2uqq (%rcx), %ymm23
+
+// CHECK: vcvtps2uqq 291(%rax,%r14,8), %ymm23
+// CHECK: encoding: [0x62,0xa1,0x7d,0x28,0x79,0xbc,0xf0,0x23,0x01,0x00,0x00]
+ vcvtps2uqq 291(%rax,%r14,8), %ymm23
+
+// CHECK: vcvtps2uqq (%rcx){1to4}, %ymm23
+// CHECK: encoding: [0x62,0xe1,0x7d,0x38,0x79,0x39]
+ vcvtps2uqq (%rcx){1to4}, %ymm23
+
+// CHECK: vcvtps2uqq 2032(%rdx), %ymm23
+// CHECK: encoding: [0x62,0xe1,0x7d,0x28,0x79,0x7a,0x7f]
+ vcvtps2uqq 2032(%rdx), %ymm23
+
+// CHECK: vcvtps2uqq 2048(%rdx), %ymm23
+// CHECK: encoding: [0x62,0xe1,0x7d,0x28,0x79,0xba,0x00,0x08,0x00,0x00]
+ vcvtps2uqq 2048(%rdx), %ymm23
+
+// CHECK: vcvtps2uqq -2048(%rdx), %ymm23
+// CHECK: encoding: [0x62,0xe1,0x7d,0x28,0x79,0x7a,0x80]
+ vcvtps2uqq -2048(%rdx), %ymm23
+
+// CHECK: vcvtps2uqq -2064(%rdx), %ymm23
+// CHECK: encoding: [0x62,0xe1,0x7d,0x28,0x79,0xba,0xf0,0xf7,0xff,0xff]
+ vcvtps2uqq -2064(%rdx), %ymm23
+
+// CHECK: vcvtps2uqq 508(%rdx){1to4}, %ymm23
+// CHECK: encoding: [0x62,0xe1,0x7d,0x38,0x79,0x7a,0x7f]
+ vcvtps2uqq 508(%rdx){1to4}, %ymm23
+
+// CHECK: vcvtps2uqq 512(%rdx){1to4}, %ymm23
+// CHECK: encoding: [0x62,0xe1,0x7d,0x38,0x79,0xba,0x00,0x02,0x00,0x00]
+ vcvtps2uqq 512(%rdx){1to4}, %ymm23
+
+// CHECK: vcvtps2uqq -512(%rdx){1to4}, %ymm23
+// CHECK: encoding: [0x62,0xe1,0x7d,0x38,0x79,0x7a,0x80]
+ vcvtps2uqq -512(%rdx){1to4}, %ymm23
+
+// CHECK: vcvtps2uqq -516(%rdx){1to4}, %ymm23
+// CHECK: encoding: [0x62,0xe1,0x7d,0x38,0x79,0xba,0xfc,0xfd,0xff,0xff]
+ vcvtps2uqq -516(%rdx){1to4}, %ymm23
+
+// CHECK: vcvtqq2pd %xmm29, %xmm22
+// CHECK: encoding: [0x62,0x81,0xfe,0x08,0xe6,0xf5]
+ vcvtqq2pd %xmm29, %xmm22
+
+// CHECK: vcvtqq2pd %xmm29, %xmm22 {%k7}
+// CHECK: encoding: [0x62,0x81,0xfe,0x0f,0xe6,0xf5]
+ vcvtqq2pd %xmm29, %xmm22 {%k7}
+
+// CHECK: vcvtqq2pd %xmm29, %xmm22 {%k7} {z}
+// CHECK: encoding: [0x62,0x81,0xfe,0x8f,0xe6,0xf5]
+ vcvtqq2pd %xmm29, %xmm22 {%k7} {z}
+
+// CHECK: vcvtqq2pd (%rcx), %xmm22
+// CHECK: encoding: [0x62,0xe1,0xfe,0x08,0xe6,0x31]
+ vcvtqq2pd (%rcx), %xmm22
+
+// CHECK: vcvtqq2pd 291(%rax,%r14,8), %xmm22
+// CHECK: encoding: [0x62,0xa1,0xfe,0x08,0xe6,0xb4,0xf0,0x23,0x01,0x00,0x00]
+ vcvtqq2pd 291(%rax,%r14,8), %xmm22
+
+// CHECK: vcvtqq2pd (%rcx){1to2}, %xmm22
+// CHECK: encoding: [0x62,0xe1,0xfe,0x18,0xe6,0x31]
+ vcvtqq2pd (%rcx){1to2}, %xmm22
+
+// CHECK: vcvtqq2pd 2032(%rdx), %xmm22
+// CHECK: encoding: [0x62,0xe1,0xfe,0x08,0xe6,0x72,0x7f]
+ vcvtqq2pd 2032(%rdx), %xmm22
+
+// CHECK: vcvtqq2pd 2048(%rdx), %xmm22
+// CHECK: encoding: [0x62,0xe1,0xfe,0x08,0xe6,0xb2,0x00,0x08,0x00,0x00]
+ vcvtqq2pd 2048(%rdx), %xmm22
+
+// CHECK: vcvtqq2pd -2048(%rdx), %xmm22
+// CHECK: encoding: [0x62,0xe1,0xfe,0x08,0xe6,0x72,0x80]
+ vcvtqq2pd -2048(%rdx), %xmm22
+
+// CHECK: vcvtqq2pd -2064(%rdx), %xmm22
+// CHECK: encoding: [0x62,0xe1,0xfe,0x08,0xe6,0xb2,0xf0,0xf7,0xff,0xff]
+ vcvtqq2pd -2064(%rdx), %xmm22
+
+// CHECK: vcvtqq2pd 1016(%rdx){1to2}, %xmm22
+// CHECK: encoding: [0x62,0xe1,0xfe,0x18,0xe6,0x72,0x7f]
+ vcvtqq2pd 1016(%rdx){1to2}, %xmm22
+
+// CHECK: vcvtqq2pd 1024(%rdx){1to2}, %xmm22
+// CHECK: encoding: [0x62,0xe1,0xfe,0x18,0xe6,0xb2,0x00,0x04,0x00,0x00]
+ vcvtqq2pd 1024(%rdx){1to2}, %xmm22
+
+// CHECK: vcvtqq2pd -1024(%rdx){1to2}, %xmm22
+// CHECK: encoding: [0x62,0xe1,0xfe,0x18,0xe6,0x72,0x80]
+ vcvtqq2pd -1024(%rdx){1to2}, %xmm22
+
+// CHECK: vcvtqq2pd -1032(%rdx){1to2}, %xmm22
+// CHECK: encoding: [0x62,0xe1,0xfe,0x18,0xe6,0xb2,0xf8,0xfb,0xff,0xff]
+ vcvtqq2pd -1032(%rdx){1to2}, %xmm22
+
+// CHECK: vcvtqq2pd %ymm20, %ymm21
+// CHECK: encoding: [0x62,0xa1,0xfe,0x28,0xe6,0xec]
+ vcvtqq2pd %ymm20, %ymm21
+
+// CHECK: vcvtqq2pd %ymm20, %ymm21 {%k5}
+// CHECK: encoding: [0x62,0xa1,0xfe,0x2d,0xe6,0xec]
+ vcvtqq2pd %ymm20, %ymm21 {%k5}
+
+// CHECK: vcvtqq2pd %ymm20, %ymm21 {%k5} {z}
+// CHECK: encoding: [0x62,0xa1,0xfe,0xad,0xe6,0xec]
+ vcvtqq2pd %ymm20, %ymm21 {%k5} {z}
+
+// CHECK: vcvtqq2pd (%rcx), %ymm21
+// CHECK: encoding: [0x62,0xe1,0xfe,0x28,0xe6,0x29]
+ vcvtqq2pd (%rcx), %ymm21
+
+// CHECK: vcvtqq2pd 291(%rax,%r14,8), %ymm21
+// CHECK: encoding: [0x62,0xa1,0xfe,0x28,0xe6,0xac,0xf0,0x23,0x01,0x00,0x00]
+ vcvtqq2pd 291(%rax,%r14,8), %ymm21
+
+// CHECK: vcvtqq2pd (%rcx){1to4}, %ymm21
+// CHECK: encoding: [0x62,0xe1,0xfe,0x38,0xe6,0x29]
+ vcvtqq2pd (%rcx){1to4}, %ymm21
+
+// CHECK: vcvtqq2pd 4064(%rdx), %ymm21
+// CHECK: encoding: [0x62,0xe1,0xfe,0x28,0xe6,0x6a,0x7f]
+ vcvtqq2pd 4064(%rdx), %ymm21
+
+// CHECK: vcvtqq2pd 4096(%rdx), %ymm21
+// CHECK: encoding: [0x62,0xe1,0xfe,0x28,0xe6,0xaa,0x00,0x10,0x00,0x00]
+ vcvtqq2pd 4096(%rdx), %ymm21
+
+// CHECK: vcvtqq2pd -4096(%rdx), %ymm21
+// CHECK: encoding: [0x62,0xe1,0xfe,0x28,0xe6,0x6a,0x80]
+ vcvtqq2pd -4096(%rdx), %ymm21
+
+// CHECK: vcvtqq2pd -4128(%rdx), %ymm21
+// CHECK: encoding: [0x62,0xe1,0xfe,0x28,0xe6,0xaa,0xe0,0xef,0xff,0xff]
+ vcvtqq2pd -4128(%rdx), %ymm21
+
+// CHECK: vcvtqq2pd 1016(%rdx){1to4}, %ymm21
+// CHECK: encoding: [0x62,0xe1,0xfe,0x38,0xe6,0x6a,0x7f]
+ vcvtqq2pd 1016(%rdx){1to4}, %ymm21
+
+// CHECK: vcvtqq2pd 1024(%rdx){1to4}, %ymm21
+// CHECK: encoding: [0x62,0xe1,0xfe,0x38,0xe6,0xaa,0x00,0x04,0x00,0x00]
+ vcvtqq2pd 1024(%rdx){1to4}, %ymm21
+
+// CHECK: vcvtqq2pd -1024(%rdx){1to4}, %ymm21
+// CHECK: encoding: [0x62,0xe1,0xfe,0x38,0xe6,0x6a,0x80]
+ vcvtqq2pd -1024(%rdx){1to4}, %ymm21
+
+// CHECK: vcvtqq2pd -1032(%rdx){1to4}, %ymm21
+// CHECK: encoding: [0x62,0xe1,0xfe,0x38,0xe6,0xaa,0xf8,0xfb,0xff,0xff]
+ vcvtqq2pd -1032(%rdx){1to4}, %ymm21
+
+// CHECK: vcvtqq2ps %xmm28, %xmm25
+// CHECK: encoding: [0x62,0x01,0xfc,0x08,0x5b,0xcc]
+ vcvtqq2ps %xmm28, %xmm25
+
+// CHECK: vcvtqq2ps %xmm28, %xmm25 {%k6}
+// CHECK: encoding: [0x62,0x01,0xfc,0x0e,0x5b,0xcc]
+ vcvtqq2ps %xmm28, %xmm25 {%k6}
+
+// CHECK: vcvtqq2ps %xmm28, %xmm25 {%k6} {z}
+// CHECK: encoding: [0x62,0x01,0xfc,0x8e,0x5b,0xcc]
+ vcvtqq2ps %xmm28, %xmm25 {%k6} {z}
+
+// CHECK: vcvtqq2psx (%rcx), %xmm25
+// CHECK: encoding: [0x62,0x61,0xfc,0x08,0x5b,0x09]
+ vcvtqq2psx (%rcx), %xmm25
+
+// CHECK: vcvtqq2psx 291(%rax,%r14,8), %xmm25
+// CHECK: encoding: [0x62,0x21,0xfc,0x08,0x5b,0x8c,0xf0,0x23,0x01,0x00,0x00]
+ vcvtqq2psx 291(%rax,%r14,8), %xmm25
+
+// CHECK: vcvtqq2ps (%rcx){1to2}, %xmm25
+// CHECK: encoding: [0x62,0x61,0xfc,0x18,0x5b,0x09]
+ vcvtqq2ps (%rcx){1to2}, %xmm25
+
+// CHECK: vcvtqq2psx 2032(%rdx), %xmm25
+// CHECK: encoding: [0x62,0x61,0xfc,0x08,0x5b,0x4a,0x7f]
+ vcvtqq2psx 2032(%rdx), %xmm25
+
+// CHECK: vcvtqq2psx 2048(%rdx), %xmm25
+// CHECK: encoding: [0x62,0x61,0xfc,0x08,0x5b,0x8a,0x00,0x08,0x00,0x00]
+ vcvtqq2psx 2048(%rdx), %xmm25
+
+// CHECK: vcvtqq2psx -2048(%rdx), %xmm25
+// CHECK: encoding: [0x62,0x61,0xfc,0x08,0x5b,0x4a,0x80]
+ vcvtqq2psx -2048(%rdx), %xmm25
+
+// CHECK: vcvtqq2psx -2064(%rdx), %xmm25
+// CHECK: encoding: [0x62,0x61,0xfc,0x08,0x5b,0x8a,0xf0,0xf7,0xff,0xff]
+ vcvtqq2psx -2064(%rdx), %xmm25
+
+// CHECK: vcvtqq2ps 1016(%rdx){1to2}, %xmm25
+// CHECK: encoding: [0x62,0x61,0xfc,0x18,0x5b,0x4a,0x7f]
+ vcvtqq2ps 1016(%rdx){1to2}, %xmm25
+
+// CHECK: vcvtqq2ps 1024(%rdx){1to2}, %xmm25
+// CHECK: encoding: [0x62,0x61,0xfc,0x18,0x5b,0x8a,0x00,0x04,0x00,0x00]
+ vcvtqq2ps 1024(%rdx){1to2}, %xmm25
+
+// CHECK: vcvtqq2ps -1024(%rdx){1to2}, %xmm25
+// CHECK: encoding: [0x62,0x61,0xfc,0x18,0x5b,0x4a,0x80]
+ vcvtqq2ps -1024(%rdx){1to2}, %xmm25
+
+// CHECK: vcvtqq2ps -1032(%rdx){1to2}, %xmm25
+// CHECK: encoding: [0x62,0x61,0xfc,0x18,0x5b,0x8a,0xf8,0xfb,0xff,0xff]
+ vcvtqq2ps -1032(%rdx){1to2}, %xmm25
+
+// CHECK: vcvtqq2ps %ymm22, %xmm27
+// CHECK: encoding: [0x62,0x21,0xfc,0x28,0x5b,0xde]
+ vcvtqq2ps %ymm22, %xmm27
+
+// CHECK: vcvtqq2ps %ymm22, %xmm27 {%k3}
+// CHECK: encoding: [0x62,0x21,0xfc,0x2b,0x5b,0xde]
+ vcvtqq2ps %ymm22, %xmm27 {%k3}
+
+// CHECK: vcvtqq2ps %ymm22, %xmm27 {%k3} {z}
+// CHECK: encoding: [0x62,0x21,0xfc,0xab,0x5b,0xde]
+ vcvtqq2ps %ymm22, %xmm27 {%k3} {z}
+
+// CHECK: vcvtqq2psy (%rcx), %xmm27
+// CHECK: encoding: [0x62,0x61,0xfc,0x28,0x5b,0x19]
+ vcvtqq2psy (%rcx), %xmm27
+
+// CHECK: vcvtqq2psy 291(%rax,%r14,8), %xmm27
+// CHECK: encoding: [0x62,0x21,0xfc,0x28,0x5b,0x9c,0xf0,0x23,0x01,0x00,0x00]
+ vcvtqq2psy 291(%rax,%r14,8), %xmm27
+
+// CHECK: vcvtqq2ps (%rcx){1to4}, %xmm27
+// CHECK: encoding: [0x62,0x61,0xfc,0x38,0x5b,0x19]
+ vcvtqq2ps (%rcx){1to4}, %xmm27
+
+// CHECK: vcvtqq2psy 4064(%rdx), %xmm27
+// CHECK: encoding: [0x62,0x61,0xfc,0x28,0x5b,0x5a,0x7f]
+ vcvtqq2psy 4064(%rdx), %xmm27
+
+// CHECK: vcvtqq2psy 4096(%rdx), %xmm27
+// CHECK: encoding: [0x62,0x61,0xfc,0x28,0x5b,0x9a,0x00,0x10,0x00,0x00]
+ vcvtqq2psy 4096(%rdx), %xmm27
+
+// CHECK: vcvtqq2psy -4096(%rdx), %xmm27
+// CHECK: encoding: [0x62,0x61,0xfc,0x28,0x5b,0x5a,0x80]
+ vcvtqq2psy -4096(%rdx), %xmm27
+
+// CHECK: vcvtqq2psy -4128(%rdx), %xmm27
+// CHECK: encoding: [0x62,0x61,0xfc,0x28,0x5b,0x9a,0xe0,0xef,0xff,0xff]
+ vcvtqq2psy -4128(%rdx), %xmm27
+
+// CHECK: vcvtqq2ps 1016(%rdx){1to4}, %xmm27
+// CHECK: encoding: [0x62,0x61,0xfc,0x38,0x5b,0x5a,0x7f]
+ vcvtqq2ps 1016(%rdx){1to4}, %xmm27
+
+// CHECK: vcvtqq2ps 1024(%rdx){1to4}, %xmm27
+// CHECK: encoding: [0x62,0x61,0xfc,0x38,0x5b,0x9a,0x00,0x04,0x00,0x00]
+ vcvtqq2ps 1024(%rdx){1to4}, %xmm27
+
+// CHECK: vcvtqq2ps -1024(%rdx){1to4}, %xmm27
+// CHECK: encoding: [0x62,0x61,0xfc,0x38,0x5b,0x5a,0x80]
+ vcvtqq2ps -1024(%rdx){1to4}, %xmm27
+
+// CHECK: vcvtqq2ps -1032(%rdx){1to4}, %xmm27
+// CHECK: encoding: [0x62,0x61,0xfc,0x38,0x5b,0x9a,0xf8,0xfb,0xff,0xff]
+ vcvtqq2ps -1032(%rdx){1to4}, %xmm27
+
+// CHECK: vcvtuqq2pd %xmm20, %xmm19
+// CHECK: encoding: [0x62,0xa1,0xfe,0x08,0x7a,0xdc]
+ vcvtuqq2pd %xmm20, %xmm19
+
+// CHECK: vcvtuqq2pd %xmm20, %xmm19 {%k3}
+// CHECK: encoding: [0x62,0xa1,0xfe,0x0b,0x7a,0xdc]
+ vcvtuqq2pd %xmm20, %xmm19 {%k3}
+
+// CHECK: vcvtuqq2pd %xmm20, %xmm19 {%k3} {z}
+// CHECK: encoding: [0x62,0xa1,0xfe,0x8b,0x7a,0xdc]
+ vcvtuqq2pd %xmm20, %xmm19 {%k3} {z}
+
+// CHECK: vcvtuqq2pd (%rcx), %xmm19
+// CHECK: encoding: [0x62,0xe1,0xfe,0x08,0x7a,0x19]
+ vcvtuqq2pd (%rcx), %xmm19
+
+// CHECK: vcvtuqq2pd 291(%rax,%r14,8), %xmm19
+// CHECK: encoding: [0x62,0xa1,0xfe,0x08,0x7a,0x9c,0xf0,0x23,0x01,0x00,0x00]
+ vcvtuqq2pd 291(%rax,%r14,8), %xmm19
+
+// CHECK: vcvtuqq2pd (%rcx){1to2}, %xmm19
+// CHECK: encoding: [0x62,0xe1,0xfe,0x18,0x7a,0x19]
+ vcvtuqq2pd (%rcx){1to2}, %xmm19
+
+// CHECK: vcvtuqq2pd 2032(%rdx), %xmm19
+// CHECK: encoding: [0x62,0xe1,0xfe,0x08,0x7a,0x5a,0x7f]
+ vcvtuqq2pd 2032(%rdx), %xmm19
+
+// CHECK: vcvtuqq2pd 2048(%rdx), %xmm19
+// CHECK: encoding: [0x62,0xe1,0xfe,0x08,0x7a,0x9a,0x00,0x08,0x00,0x00]
+ vcvtuqq2pd 2048(%rdx), %xmm19
+
+// CHECK: vcvtuqq2pd -2048(%rdx), %xmm19
+// CHECK: encoding: [0x62,0xe1,0xfe,0x08,0x7a,0x5a,0x80]
+ vcvtuqq2pd -2048(%rdx), %xmm19
+
+// CHECK: vcvtuqq2pd -2064(%rdx), %xmm19
+// CHECK: encoding: [0x62,0xe1,0xfe,0x08,0x7a,0x9a,0xf0,0xf7,0xff,0xff]
+ vcvtuqq2pd -2064(%rdx), %xmm19
+
+// CHECK: vcvtuqq2pd 1016(%rdx){1to2}, %xmm19
+// CHECK: encoding: [0x62,0xe1,0xfe,0x18,0x7a,0x5a,0x7f]
+ vcvtuqq2pd 1016(%rdx){1to2}, %xmm19
+
+// CHECK: vcvtuqq2pd 1024(%rdx){1to2}, %xmm19
+// CHECK: encoding: [0x62,0xe1,0xfe,0x18,0x7a,0x9a,0x00,0x04,0x00,0x00]
+ vcvtuqq2pd 1024(%rdx){1to2}, %xmm19
+
+// CHECK: vcvtuqq2pd -1024(%rdx){1to2}, %xmm19
+// CHECK: encoding: [0x62,0xe1,0xfe,0x18,0x7a,0x5a,0x80]
+ vcvtuqq2pd -1024(%rdx){1to2}, %xmm19
+
+// CHECK: vcvtuqq2pd -1032(%rdx){1to2}, %xmm19
+// CHECK: encoding: [0x62,0xe1,0xfe,0x18,0x7a,0x9a,0xf8,0xfb,0xff,0xff]
+ vcvtuqq2pd -1032(%rdx){1to2}, %xmm19
+
+// CHECK: vcvtuqq2pd %ymm26, %ymm28
+// CHECK: encoding: [0x62,0x01,0xfe,0x28,0x7a,0xe2]
+ vcvtuqq2pd %ymm26, %ymm28
+
+// CHECK: vcvtuqq2pd %ymm26, %ymm28 {%k4}
+// CHECK: encoding: [0x62,0x01,0xfe,0x2c,0x7a,0xe2]
+ vcvtuqq2pd %ymm26, %ymm28 {%k4}
+
+// CHECK: vcvtuqq2pd %ymm26, %ymm28 {%k4} {z}
+// CHECK: encoding: [0x62,0x01,0xfe,0xac,0x7a,0xe2]
+ vcvtuqq2pd %ymm26, %ymm28 {%k4} {z}
+
+// CHECK: vcvtuqq2pd (%rcx), %ymm28
+// CHECK: encoding: [0x62,0x61,0xfe,0x28,0x7a,0x21]
+ vcvtuqq2pd (%rcx), %ymm28
+
+// CHECK: vcvtuqq2pd 291(%rax,%r14,8), %ymm28
+// CHECK: encoding: [0x62,0x21,0xfe,0x28,0x7a,0xa4,0xf0,0x23,0x01,0x00,0x00]
+ vcvtuqq2pd 291(%rax,%r14,8), %ymm28
+
+// CHECK: vcvtuqq2pd (%rcx){1to4}, %ymm28
+// CHECK: encoding: [0x62,0x61,0xfe,0x38,0x7a,0x21]
+ vcvtuqq2pd (%rcx){1to4}, %ymm28
+
+// CHECK: vcvtuqq2pd 4064(%rdx), %ymm28
+// CHECK: encoding: [0x62,0x61,0xfe,0x28,0x7a,0x62,0x7f]
+ vcvtuqq2pd 4064(%rdx), %ymm28
+
+// CHECK: vcvtuqq2pd 4096(%rdx), %ymm28
+// CHECK: encoding: [0x62,0x61,0xfe,0x28,0x7a,0xa2,0x00,0x10,0x00,0x00]
+ vcvtuqq2pd 4096(%rdx), %ymm28
+
+// CHECK: vcvtuqq2pd -4096(%rdx), %ymm28
+// CHECK: encoding: [0x62,0x61,0xfe,0x28,0x7a,0x62,0x80]
+ vcvtuqq2pd -4096(%rdx), %ymm28
+
+// CHECK: vcvtuqq2pd -4128(%rdx), %ymm28
+// CHECK: encoding: [0x62,0x61,0xfe,0x28,0x7a,0xa2,0xe0,0xef,0xff,0xff]
+ vcvtuqq2pd -4128(%rdx), %ymm28
+
+// CHECK: vcvtuqq2pd 1016(%rdx){1to4}, %ymm28
+// CHECK: encoding: [0x62,0x61,0xfe,0x38,0x7a,0x62,0x7f]
+ vcvtuqq2pd 1016(%rdx){1to4}, %ymm28
+
+// CHECK: vcvtuqq2pd 1024(%rdx){1to4}, %ymm28
+// CHECK: encoding: [0x62,0x61,0xfe,0x38,0x7a,0xa2,0x00,0x04,0x00,0x00]
+ vcvtuqq2pd 1024(%rdx){1to4}, %ymm28
+
+// CHECK: vcvtuqq2pd -1024(%rdx){1to4}, %ymm28
+// CHECK: encoding: [0x62,0x61,0xfe,0x38,0x7a,0x62,0x80]
+ vcvtuqq2pd -1024(%rdx){1to4}, %ymm28
+
+// CHECK: vcvtuqq2pd -1032(%rdx){1to4}, %ymm28
+// CHECK: encoding: [0x62,0x61,0xfe,0x38,0x7a,0xa2,0xf8,0xfb,0xff,0xff]
+ vcvtuqq2pd -1032(%rdx){1to4}, %ymm28
+
+// CHECK: vcvtuqq2ps %xmm27, %xmm21
+// CHECK: encoding: [0x62,0x81,0xff,0x08,0x7a,0xeb]
+ vcvtuqq2ps %xmm27, %xmm21
+
+// CHECK: vcvtuqq2ps %xmm27, %xmm21 {%k7}
+// CHECK: encoding: [0x62,0x81,0xff,0x0f,0x7a,0xeb]
+ vcvtuqq2ps %xmm27, %xmm21 {%k7}
+
+// CHECK: vcvtuqq2ps %xmm27, %xmm21 {%k7} {z}
+// CHECK: encoding: [0x62,0x81,0xff,0x8f,0x7a,0xeb]
+ vcvtuqq2ps %xmm27, %xmm21 {%k7} {z}
+
+// CHECK: vcvtuqq2psx (%rcx), %xmm21
+// CHECK: encoding: [0x62,0xe1,0xff,0x08,0x7a,0x29]
+ vcvtuqq2psx (%rcx), %xmm21
+
+// CHECK: vcvtuqq2psx 291(%rax,%r14,8), %xmm21
+// CHECK: encoding: [0x62,0xa1,0xff,0x08,0x7a,0xac,0xf0,0x23,0x01,0x00,0x00]
+ vcvtuqq2psx 291(%rax,%r14,8), %xmm21
+
+// CHECK: vcvtuqq2ps (%rcx){1to2}, %xmm21
+// CHECK: encoding: [0x62,0xe1,0xff,0x18,0x7a,0x29]
+ vcvtuqq2ps (%rcx){1to2}, %xmm21
+
+// CHECK: vcvtuqq2psx 2032(%rdx), %xmm21
+// CHECK: encoding: [0x62,0xe1,0xff,0x08,0x7a,0x6a,0x7f]
+ vcvtuqq2psx 2032(%rdx), %xmm21
+
+// CHECK: vcvtuqq2psx 2048(%rdx), %xmm21
+// CHECK: encoding: [0x62,0xe1,0xff,0x08,0x7a,0xaa,0x00,0x08,0x00,0x00]
+ vcvtuqq2psx 2048(%rdx), %xmm21
+
+// CHECK: vcvtuqq2psx -2048(%rdx), %xmm21
+// CHECK: encoding: [0x62,0xe1,0xff,0x08,0x7a,0x6a,0x80]
+ vcvtuqq2psx -2048(%rdx), %xmm21
+
+// CHECK: vcvtuqq2psx -2064(%rdx), %xmm21
+// CHECK: encoding: [0x62,0xe1,0xff,0x08,0x7a,0xaa,0xf0,0xf7,0xff,0xff]
+ vcvtuqq2psx -2064(%rdx), %xmm21
+
+// CHECK: vcvtuqq2ps 1016(%rdx){1to2}, %xmm21
+// CHECK: encoding: [0x62,0xe1,0xff,0x18,0x7a,0x6a,0x7f]
+ vcvtuqq2ps 1016(%rdx){1to2}, %xmm21
+
+// CHECK: vcvtuqq2ps 1024(%rdx){1to2}, %xmm21
+// CHECK: encoding: [0x62,0xe1,0xff,0x18,0x7a,0xaa,0x00,0x04,0x00,0x00]
+ vcvtuqq2ps 1024(%rdx){1to2}, %xmm21
+
+// CHECK: vcvtuqq2ps -1024(%rdx){1to2}, %xmm21
+// CHECK: encoding: [0x62,0xe1,0xff,0x18,0x7a,0x6a,0x80]
+ vcvtuqq2ps -1024(%rdx){1to2}, %xmm21
+
+// CHECK: vcvtuqq2ps -1032(%rdx){1to2}, %xmm21
+// CHECK: encoding: [0x62,0xe1,0xff,0x18,0x7a,0xaa,0xf8,0xfb,0xff,0xff]
+ vcvtuqq2ps -1032(%rdx){1to2}, %xmm21
+
+// CHECK: vcvtuqq2ps %ymm24, %xmm28
+// CHECK: encoding: [0x62,0x01,0xff,0x28,0x7a,0xe0]
+ vcvtuqq2ps %ymm24, %xmm28
+
+// CHECK: vcvtuqq2ps %ymm24, %xmm28 {%k3}
+// CHECK: encoding: [0x62,0x01,0xff,0x2b,0x7a,0xe0]
+ vcvtuqq2ps %ymm24, %xmm28 {%k3}
+
+// CHECK: vcvtuqq2ps %ymm24, %xmm28 {%k3} {z}
+// CHECK: encoding: [0x62,0x01,0xff,0xab,0x7a,0xe0]
+ vcvtuqq2ps %ymm24, %xmm28 {%k3} {z}
+
+// CHECK: vcvtuqq2psy (%rcx), %xmm28
+// CHECK: encoding: [0x62,0x61,0xff,0x28,0x7a,0x21]
+ vcvtuqq2psy (%rcx), %xmm28
+
+// CHECK: vcvtuqq2psy 291(%rax,%r14,8), %xmm28
+// CHECK: encoding: [0x62,0x21,0xff,0x28,0x7a,0xa4,0xf0,0x23,0x01,0x00,0x00]
+ vcvtuqq2psy 291(%rax,%r14,8), %xmm28
+
+// CHECK: vcvtuqq2ps (%rcx){1to4}, %xmm28
+// CHECK: encoding: [0x62,0x61,0xff,0x38,0x7a,0x21]
+ vcvtuqq2ps (%rcx){1to4}, %xmm28
+
+// CHECK: vcvtuqq2psy 4064(%rdx), %xmm28
+// CHECK: encoding: [0x62,0x61,0xff,0x28,0x7a,0x62,0x7f]
+ vcvtuqq2psy 4064(%rdx), %xmm28
+
+// CHECK: vcvtuqq2psy 4096(%rdx), %xmm28
+// CHECK: encoding: [0x62,0x61,0xff,0x28,0x7a,0xa2,0x00,0x10,0x00,0x00]
+ vcvtuqq2psy 4096(%rdx), %xmm28
+
+// CHECK: vcvtuqq2psy -4096(%rdx), %xmm28
+// CHECK: encoding: [0x62,0x61,0xff,0x28,0x7a,0x62,0x80]
+ vcvtuqq2psy -4096(%rdx), %xmm28
+
+// CHECK: vcvtuqq2psy -4128(%rdx), %xmm28
+// CHECK: encoding: [0x62,0x61,0xff,0x28,0x7a,0xa2,0xe0,0xef,0xff,0xff]
+ vcvtuqq2psy -4128(%rdx), %xmm28
+
+// CHECK: vcvtuqq2ps 1016(%rdx){1to4}, %xmm28
+// CHECK: encoding: [0x62,0x61,0xff,0x38,0x7a,0x62,0x7f]
+ vcvtuqq2ps 1016(%rdx){1to4}, %xmm28
+
+// CHECK: vcvtuqq2ps 1024(%rdx){1to4}, %xmm28
+// CHECK: encoding: [0x62,0x61,0xff,0x38,0x7a,0xa2,0x00,0x04,0x00,0x00]
+ vcvtuqq2ps 1024(%rdx){1to4}, %xmm28
+
+// CHECK: vcvtuqq2ps -1024(%rdx){1to4}, %xmm28
+// CHECK: encoding: [0x62,0x61,0xff,0x38,0x7a,0x62,0x80]
+ vcvtuqq2ps -1024(%rdx){1to4}, %xmm28
+
+// CHECK: vcvtuqq2ps -1032(%rdx){1to4}, %xmm28
+// CHECK: encoding: [0x62,0x61,0xff,0x38,0x7a,0xa2,0xf8,0xfb,0xff,0xff]
+ vcvtuqq2ps -1032(%rdx){1to4}, %xmm28
+
diff --git a/test/MC/X86/x86-64-avx512f_vl.s b/test/MC/X86/x86-64-avx512f_vl.s
index c587f8a75aaa..c746e6627f7a 100644
--- a/test/MC/X86/x86-64-avx512f_vl.s
+++ b/test/MC/X86/x86-64-avx512f_vl.s
@@ -16284,3 +16284,1122 @@ vaddpd {rz-sae}, %zmm2, %zmm1, %zmm1
// CHECK: vscalefps -516(%rdx){1to8}, %ymm22, %ymm25
// CHECK: encoding: [0x62,0x62,0x4d,0x30,0x2c,0x8a,0xfc,0xfd,0xff,0xff]
vscalefps -516(%rdx){1to8}, %ymm22, %ymm25
+
+// CHECK: vcvtps2pd %xmm27, %xmm20
+// CHECK: encoding: [0x62,0x81,0x7c,0x08,0x5a,0xe3]
+ vcvtps2pd %xmm27, %xmm20
+
+// CHECK: vcvtps2pd %xmm27, %xmm20 {%k3}
+// CHECK: encoding: [0x62,0x81,0x7c,0x0b,0x5a,0xe3]
+ vcvtps2pd %xmm27, %xmm20 {%k3}
+
+// CHECK: vcvtps2pd %xmm27, %xmm20 {%k3} {z}
+// CHECK: encoding: [0x62,0x81,0x7c,0x8b,0x5a,0xe3]
+ vcvtps2pd %xmm27, %xmm20 {%k3} {z}
+
+// CHECK: vcvtps2pd (%rcx), %xmm20
+// CHECK: encoding: [0x62,0xe1,0x7c,0x08,0x5a,0x21]
+ vcvtps2pd (%rcx), %xmm20
+
+// CHECK: vcvtps2pd 291(%rax,%r14,8), %xmm20
+// CHECK: encoding: [0x62,0xa1,0x7c,0x08,0x5a,0xa4,0xf0,0x23,0x01,0x00,0x00]
+ vcvtps2pd 291(%rax,%r14,8), %xmm20
+
+// CHECK: vcvtps2pd (%rcx){1to2}, %xmm20
+// CHECK: encoding: [0x62,0xe1,0x7c,0x18,0x5a,0x21]
+ vcvtps2pd (%rcx){1to2}, %xmm20
+
+// CHECK: vcvtps2pd 1016(%rdx), %xmm20
+// CHECK: encoding: [0x62,0xe1,0x7c,0x08,0x5a,0x62,0x7f]
+ vcvtps2pd 1016(%rdx), %xmm20
+
+// CHECK: vcvtps2pd 1024(%rdx), %xmm20
+// CHECK: encoding: [0x62,0xe1,0x7c,0x08,0x5a,0xa2,0x00,0x04,0x00,0x00]
+ vcvtps2pd 1024(%rdx), %xmm20
+
+// CHECK: vcvtps2pd -1024(%rdx), %xmm20
+// CHECK: encoding: [0x62,0xe1,0x7c,0x08,0x5a,0x62,0x80]
+ vcvtps2pd -1024(%rdx), %xmm20
+
+// CHECK: vcvtps2pd -1032(%rdx), %xmm20
+// CHECK: encoding: [0x62,0xe1,0x7c,0x08,0x5a,0xa2,0xf8,0xfb,0xff,0xff]
+ vcvtps2pd -1032(%rdx), %xmm20
+
+// CHECK: vcvtps2pd 508(%rdx){1to2}, %xmm20
+// CHECK: encoding: [0x62,0xe1,0x7c,0x18,0x5a,0x62,0x7f]
+ vcvtps2pd 508(%rdx){1to2}, %xmm20
+
+// CHECK: vcvtps2pd 512(%rdx){1to2}, %xmm20
+// CHECK: encoding: [0x62,0xe1,0x7c,0x18,0x5a,0xa2,0x00,0x02,0x00,0x00]
+ vcvtps2pd 512(%rdx){1to2}, %xmm20
+
+// CHECK: vcvtps2pd -512(%rdx){1to2}, %xmm20
+// CHECK: encoding: [0x62,0xe1,0x7c,0x18,0x5a,0x62,0x80]
+ vcvtps2pd -512(%rdx){1to2}, %xmm20
+
+// CHECK: vcvtps2pd -516(%rdx){1to2}, %xmm20
+// CHECK: encoding: [0x62,0xe1,0x7c,0x18,0x5a,0xa2,0xfc,0xfd,0xff,0xff]
+ vcvtps2pd -516(%rdx){1to2}, %xmm20
+
+// CHECK: vcvtps2pd %xmm18, %ymm22
+// CHECK: encoding: [0x62,0xa1,0x7c,0x28,0x5a,0xf2]
+ vcvtps2pd %xmm18, %ymm22
+
+// CHECK: vcvtps2pd %xmm18, %ymm22 {%k2}
+// CHECK: encoding: [0x62,0xa1,0x7c,0x2a,0x5a,0xf2]
+ vcvtps2pd %xmm18, %ymm22 {%k2}
+
+// CHECK: vcvtps2pd %xmm18, %ymm22 {%k2} {z}
+// CHECK: encoding: [0x62,0xa1,0x7c,0xaa,0x5a,0xf2]
+ vcvtps2pd %xmm18, %ymm22 {%k2} {z}
+
+// CHECK: vcvtps2pd (%rcx), %ymm22
+// CHECK: encoding: [0x62,0xe1,0x7c,0x28,0x5a,0x31]
+ vcvtps2pd (%rcx), %ymm22
+
+// CHECK: vcvtps2pd 291(%rax,%r14,8), %ymm22
+// CHECK: encoding: [0x62,0xa1,0x7c,0x28,0x5a,0xb4,0xf0,0x23,0x01,0x00,0x00]
+ vcvtps2pd 291(%rax,%r14,8), %ymm22
+
+// CHECK: vcvtps2pd (%rcx){1to4}, %ymm22
+// CHECK: encoding: [0x62,0xe1,0x7c,0x38,0x5a,0x31]
+ vcvtps2pd (%rcx){1to4}, %ymm22
+
+// CHECK: vcvtps2pd 2032(%rdx), %ymm22
+// CHECK: encoding: [0x62,0xe1,0x7c,0x28,0x5a,0x72,0x7f]
+ vcvtps2pd 2032(%rdx), %ymm22
+
+// CHECK: vcvtps2pd 2048(%rdx), %ymm22
+// CHECK: encoding: [0x62,0xe1,0x7c,0x28,0x5a,0xb2,0x00,0x08,0x00,0x00]
+ vcvtps2pd 2048(%rdx), %ymm22
+
+// CHECK: vcvtps2pd -2048(%rdx), %ymm22
+// CHECK: encoding: [0x62,0xe1,0x7c,0x28,0x5a,0x72,0x80]
+ vcvtps2pd -2048(%rdx), %ymm22
+
+// CHECK: vcvtps2pd -2064(%rdx), %ymm22
+// CHECK: encoding: [0x62,0xe1,0x7c,0x28,0x5a,0xb2,0xf0,0xf7,0xff,0xff]
+ vcvtps2pd -2064(%rdx), %ymm22
+
+// CHECK: vcvtps2pd 508(%rdx){1to4}, %ymm22
+// CHECK: encoding: [0x62,0xe1,0x7c,0x38,0x5a,0x72,0x7f]
+ vcvtps2pd 508(%rdx){1to4}, %ymm22
+
+// CHECK: vcvtps2pd 512(%rdx){1to4}, %ymm22
+// CHECK: encoding: [0x62,0xe1,0x7c,0x38,0x5a,0xb2,0x00,0x02,0x00,0x00]
+ vcvtps2pd 512(%rdx){1to4}, %ymm22
+
+// CHECK: vcvtps2pd -512(%rdx){1to4}, %ymm22
+// CHECK: encoding: [0x62,0xe1,0x7c,0x38,0x5a,0x72,0x80]
+ vcvtps2pd -512(%rdx){1to4}, %ymm22
+
+// CHECK: vcvtps2pd -516(%rdx){1to4}, %ymm22
+// CHECK: encoding: [0x62,0xe1,0x7c,0x38,0x5a,0xb2,0xfc,0xfd,0xff,0xff]
+ vcvtps2pd -516(%rdx){1to4}, %ymm22
+
+// CHECK: vcvtpd2ps %xmm27, %xmm27
+// CHECK: encoding: [0x62,0x01,0xfd,0x08,0x5a,0xdb]
+ vcvtpd2ps %xmm27, %xmm27
+
+// CHECK: vcvtpd2ps %xmm27, %xmm27 {%k7}
+// CHECK: encoding: [0x62,0x01,0xfd,0x0f,0x5a,0xdb]
+ vcvtpd2ps %xmm27, %xmm27 {%k7}
+
+// CHECK: vcvtpd2ps %xmm27, %xmm27 {%k7} {z}
+// CHECK: encoding: [0x62,0x01,0xfd,0x8f,0x5a,0xdb]
+ vcvtpd2ps %xmm27, %xmm27 {%k7} {z}
+
+// CHECK: vcvtpd2psx (%rcx), %xmm27
+// CHECK: encoding: [0x62,0x61,0xfd,0x08,0x5a,0x19]
+ vcvtpd2psx (%rcx), %xmm27
+
+// CHECK: vcvtpd2psx 291(%rax,%r14,8), %xmm27
+// CHECK: encoding: [0x62,0x21,0xfd,0x08,0x5a,0x9c,0xf0,0x23,0x01,0x00,0x00]
+ vcvtpd2psx 291(%rax,%r14,8), %xmm27
+
+// CHECK: vcvtpd2ps (%rcx){1to2}, %xmm27
+// CHECK: encoding: [0x62,0x61,0xfd,0x18,0x5a,0x19]
+ vcvtpd2ps (%rcx){1to2}, %xmm27
+
+// CHECK: vcvtpd2psx 2032(%rdx), %xmm27
+// CHECK: encoding: [0x62,0x61,0xfd,0x08,0x5a,0x5a,0x7f]
+ vcvtpd2psx 2032(%rdx), %xmm27
+
+// CHECK: vcvtpd2psx 2048(%rdx), %xmm27
+// CHECK: encoding: [0x62,0x61,0xfd,0x08,0x5a,0x9a,0x00,0x08,0x00,0x00]
+ vcvtpd2psx 2048(%rdx), %xmm27
+
+// CHECK: vcvtpd2psx -2048(%rdx), %xmm27
+// CHECK: encoding: [0x62,0x61,0xfd,0x08,0x5a,0x5a,0x80]
+ vcvtpd2psx -2048(%rdx), %xmm27
+
+// CHECK: vcvtpd2psx -2064(%rdx), %xmm27
+// CHECK: encoding: [0x62,0x61,0xfd,0x08,0x5a,0x9a,0xf0,0xf7,0xff,0xff]
+ vcvtpd2psx -2064(%rdx), %xmm27
+
+// CHECK: vcvtpd2ps 1016(%rdx){1to2}, %xmm27
+// CHECK: encoding: [0x62,0x61,0xfd,0x18,0x5a,0x5a,0x7f]
+ vcvtpd2ps 1016(%rdx){1to2}, %xmm27
+
+// CHECK: vcvtpd2ps 1024(%rdx){1to2}, %xmm27
+// CHECK: encoding: [0x62,0x61,0xfd,0x18,0x5a,0x9a,0x00,0x04,0x00,0x00]
+ vcvtpd2ps 1024(%rdx){1to2}, %xmm27
+
+// CHECK: vcvtpd2ps -1024(%rdx){1to2}, %xmm27
+// CHECK: encoding: [0x62,0x61,0xfd,0x18,0x5a,0x5a,0x80]
+ vcvtpd2ps -1024(%rdx){1to2}, %xmm27
+
+// CHECK: vcvtpd2ps -1032(%rdx){1to2}, %xmm27
+// CHECK: encoding: [0x62,0x61,0xfd,0x18,0x5a,0x9a,0xf8,0xfb,0xff,0xff]
+ vcvtpd2ps -1032(%rdx){1to2}, %xmm27
+
+// CHECK: vcvtpd2ps %ymm20, %xmm20
+// CHECK: encoding: [0x62,0xa1,0xfd,0x28,0x5a,0xe4]
+ vcvtpd2ps %ymm20, %xmm20
+
+// CHECK: vcvtpd2ps %ymm20, %xmm20 {%k6}
+// CHECK: encoding: [0x62,0xa1,0xfd,0x2e,0x5a,0xe4]
+ vcvtpd2ps %ymm20, %xmm20 {%k6}
+
+// CHECK: vcvtpd2ps %ymm20, %xmm20 {%k6} {z}
+// CHECK: encoding: [0x62,0xa1,0xfd,0xae,0x5a,0xe4]
+ vcvtpd2ps %ymm20, %xmm20 {%k6} {z}
+
+// CHECK: vcvtpd2psy (%rcx), %xmm20
+// CHECK: encoding: [0x62,0xe1,0xfd,0x28,0x5a,0x21]
+ vcvtpd2psy (%rcx), %xmm20
+
+// CHECK: vcvtpd2psy 291(%rax,%r14,8), %xmm20
+// CHECK: encoding: [0x62,0xa1,0xfd,0x28,0x5a,0xa4,0xf0,0x23,0x01,0x00,0x00]
+ vcvtpd2psy 291(%rax,%r14,8), %xmm20
+
+// CHECK: vcvtpd2ps (%rcx){1to4}, %xmm20
+// CHECK: encoding: [0x62,0xe1,0xfd,0x38,0x5a,0x21]
+ vcvtpd2ps (%rcx){1to4}, %xmm20
+
+// CHECK: vcvtpd2psy 4064(%rdx), %xmm20
+// CHECK: encoding: [0x62,0xe1,0xfd,0x28,0x5a,0x62,0x7f]
+ vcvtpd2psy 4064(%rdx), %xmm20
+
+// CHECK: vcvtpd2psy 4096(%rdx), %xmm20
+// CHECK: encoding: [0x62,0xe1,0xfd,0x28,0x5a,0xa2,0x00,0x10,0x00,0x00]
+ vcvtpd2psy 4096(%rdx), %xmm20
+
+// CHECK: vcvtpd2psy -4096(%rdx), %xmm20
+// CHECK: encoding: [0x62,0xe1,0xfd,0x28,0x5a,0x62,0x80]
+ vcvtpd2psy -4096(%rdx), %xmm20
+
+// CHECK: vcvtpd2psy -4128(%rdx), %xmm20
+// CHECK: encoding: [0x62,0xe1,0xfd,0x28,0x5a,0xa2,0xe0,0xef,0xff,0xff]
+ vcvtpd2psy -4128(%rdx), %xmm20
+
+// CHECK: vcvtpd2ps 1016(%rdx){1to4}, %xmm20
+// CHECK: encoding: [0x62,0xe1,0xfd,0x38,0x5a,0x62,0x7f]
+ vcvtpd2ps 1016(%rdx){1to4}, %xmm20
+
+// CHECK: vcvtpd2ps 1024(%rdx){1to4}, %xmm20
+// CHECK: encoding: [0x62,0xe1,0xfd,0x38,0x5a,0xa2,0x00,0x04,0x00,0x00]
+ vcvtpd2ps 1024(%rdx){1to4}, %xmm20
+
+// CHECK: vcvtpd2ps -1024(%rdx){1to4}, %xmm20
+// CHECK: encoding: [0x62,0xe1,0xfd,0x38,0x5a,0x62,0x80]
+ vcvtpd2ps -1024(%rdx){1to4}, %xmm20
+
+// CHECK: vcvtpd2ps -1032(%rdx){1to4}, %xmm20
+// CHECK: encoding: [0x62,0xe1,0xfd,0x38,0x5a,0xa2,0xf8,0xfb,0xff,0xff]
+ vcvtpd2ps -1032(%rdx){1to4}, %xmm20
+
+// CHECK: vcvtpd2udq %xmm22, %xmm28
+// CHECK: encoding: [0x62,0x21,0xfc,0x08,0x79,0xe6]
+ vcvtpd2udq %xmm22, %xmm28
+
+// CHECK: vcvtpd2udq %xmm22, %xmm28 {%k3}
+// CHECK: encoding: [0x62,0x21,0xfc,0x0b,0x79,0xe6]
+ vcvtpd2udq %xmm22, %xmm28 {%k3}
+
+// CHECK: vcvtpd2udq %xmm22, %xmm28 {%k3} {z}
+// CHECK: encoding: [0x62,0x21,0xfc,0x8b,0x79,0xe6]
+ vcvtpd2udq %xmm22, %xmm28 {%k3} {z}
+
+// CHECK: vcvtpd2udqx (%rcx), %xmm28
+// CHECK: encoding: [0x62,0x61,0xfc,0x08,0x79,0x21]
+ vcvtpd2udqx (%rcx), %xmm28
+
+// CHECK: vcvtpd2udqx 291(%rax,%r14,8), %xmm28
+// CHECK: encoding: [0x62,0x21,0xfc,0x08,0x79,0xa4,0xf0,0x23,0x01,0x00,0x00]
+ vcvtpd2udqx 291(%rax,%r14,8), %xmm28
+
+// CHECK: vcvtpd2udq (%rcx){1to2}, %xmm28
+// CHECK: encoding: [0x62,0x61,0xfc,0x18,0x79,0x21]
+ vcvtpd2udq (%rcx){1to2}, %xmm28
+
+// CHECK: vcvtpd2udqx 2032(%rdx), %xmm28
+// CHECK: encoding: [0x62,0x61,0xfc,0x08,0x79,0x62,0x7f]
+ vcvtpd2udqx 2032(%rdx), %xmm28
+
+// CHECK: vcvtpd2udqx 2048(%rdx), %xmm28
+// CHECK: encoding: [0x62,0x61,0xfc,0x08,0x79,0xa2,0x00,0x08,0x00,0x00]
+ vcvtpd2udqx 2048(%rdx), %xmm28
+
+// CHECK: vcvtpd2udqx -2048(%rdx), %xmm28
+// CHECK: encoding: [0x62,0x61,0xfc,0x08,0x79,0x62,0x80]
+ vcvtpd2udqx -2048(%rdx), %xmm28
+
+// CHECK: vcvtpd2udqx -2064(%rdx), %xmm28
+// CHECK: encoding: [0x62,0x61,0xfc,0x08,0x79,0xa2,0xf0,0xf7,0xff,0xff]
+ vcvtpd2udqx -2064(%rdx), %xmm28
+
+// CHECK: vcvtpd2udq 1016(%rdx){1to2}, %xmm28
+// CHECK: encoding: [0x62,0x61,0xfc,0x18,0x79,0x62,0x7f]
+ vcvtpd2udq 1016(%rdx){1to2}, %xmm28
+
+// CHECK: vcvtpd2udq 1024(%rdx){1to2}, %xmm28
+// CHECK: encoding: [0x62,0x61,0xfc,0x18,0x79,0xa2,0x00,0x04,0x00,0x00]
+ vcvtpd2udq 1024(%rdx){1to2}, %xmm28
+
+// CHECK: vcvtpd2udq -1024(%rdx){1to2}, %xmm28
+// CHECK: encoding: [0x62,0x61,0xfc,0x18,0x79,0x62,0x80]
+ vcvtpd2udq -1024(%rdx){1to2}, %xmm28
+
+// CHECK: vcvtpd2udq -1032(%rdx){1to2}, %xmm28
+// CHECK: encoding: [0x62,0x61,0xfc,0x18,0x79,0xa2,0xf8,0xfb,0xff,0xff]
+ vcvtpd2udq -1032(%rdx){1to2}, %xmm28
+
+// CHECK: vcvtpd2udq %ymm20, %xmm21
+// CHECK: encoding: [0x62,0xa1,0xfc,0x28,0x79,0xec]
+ vcvtpd2udq %ymm20, %xmm21
+
+// CHECK: vcvtpd2udq %ymm20, %xmm21 {%k2}
+// CHECK: encoding: [0x62,0xa1,0xfc,0x2a,0x79,0xec]
+ vcvtpd2udq %ymm20, %xmm21 {%k2}
+
+// CHECK: vcvtpd2udq %ymm20, %xmm21 {%k2} {z}
+// CHECK: encoding: [0x62,0xa1,0xfc,0xaa,0x79,0xec]
+ vcvtpd2udq %ymm20, %xmm21 {%k2} {z}
+
+// CHECK: vcvtpd2udqy (%rcx), %xmm21
+// CHECK: encoding: [0x62,0xe1,0xfc,0x28,0x79,0x29]
+ vcvtpd2udqy (%rcx), %xmm21
+
+// CHECK: vcvtpd2udqy 291(%rax,%r14,8), %xmm21
+// CHECK: encoding: [0x62,0xa1,0xfc,0x28,0x79,0xac,0xf0,0x23,0x01,0x00,0x00]
+ vcvtpd2udqy 291(%rax,%r14,8), %xmm21
+
+// CHECK: vcvtpd2udq (%rcx){1to4}, %xmm21
+// CHECK: encoding: [0x62,0xe1,0xfc,0x38,0x79,0x29]
+ vcvtpd2udq (%rcx){1to4}, %xmm21
+
+// CHECK: vcvtpd2udqy 4064(%rdx), %xmm21
+// CHECK: encoding: [0x62,0xe1,0xfc,0x28,0x79,0x6a,0x7f]
+ vcvtpd2udqy 4064(%rdx), %xmm21
+
+// CHECK: vcvtpd2udqy 4096(%rdx), %xmm21
+// CHECK: encoding: [0x62,0xe1,0xfc,0x28,0x79,0xaa,0x00,0x10,0x00,0x00]
+ vcvtpd2udqy 4096(%rdx), %xmm21
+
+// CHECK: vcvtpd2udqy -4096(%rdx), %xmm21
+// CHECK: encoding: [0x62,0xe1,0xfc,0x28,0x79,0x6a,0x80]
+ vcvtpd2udqy -4096(%rdx), %xmm21
+
+// CHECK: vcvtpd2udqy -4128(%rdx), %xmm21
+// CHECK: encoding: [0x62,0xe1,0xfc,0x28,0x79,0xaa,0xe0,0xef,0xff,0xff]
+ vcvtpd2udqy -4128(%rdx), %xmm21
+
+// CHECK: vcvtpd2udq 1016(%rdx){1to4}, %xmm21
+// CHECK: encoding: [0x62,0xe1,0xfc,0x38,0x79,0x6a,0x7f]
+ vcvtpd2udq 1016(%rdx){1to4}, %xmm21
+
+// CHECK: vcvtpd2udq 1024(%rdx){1to4}, %xmm21
+// CHECK: encoding: [0x62,0xe1,0xfc,0x38,0x79,0xaa,0x00,0x04,0x00,0x00]
+ vcvtpd2udq 1024(%rdx){1to4}, %xmm21
+
+// CHECK: vcvtpd2udq -1024(%rdx){1to4}, %xmm21
+// CHECK: encoding: [0x62,0xe1,0xfc,0x38,0x79,0x6a,0x80]
+ vcvtpd2udq -1024(%rdx){1to4}, %xmm21
+
+// CHECK: vcvtpd2udq -1032(%rdx){1to4}, %xmm21
+// CHECK: encoding: [0x62,0xe1,0xfc,0x38,0x79,0xaa,0xf8,0xfb,0xff,0xff]
+ vcvtpd2udq -1032(%rdx){1to4}, %xmm21
+
+// CHECK: vcvtps2udq %xmm19, %xmm30
+// CHECK: encoding: [0x62,0x21,0x7c,0x08,0x79,0xf3]
+ vcvtps2udq %xmm19, %xmm30
+
+// CHECK: vcvtps2udq %xmm19, %xmm30 {%k3}
+// CHECK: encoding: [0x62,0x21,0x7c,0x0b,0x79,0xf3]
+ vcvtps2udq %xmm19, %xmm30 {%k3}
+
+// CHECK: vcvtps2udq %xmm19, %xmm30 {%k3} {z}
+// CHECK: encoding: [0x62,0x21,0x7c,0x8b,0x79,0xf3]
+ vcvtps2udq %xmm19, %xmm30 {%k3} {z}
+
+// CHECK: vcvtps2udq (%rcx), %xmm30
+// CHECK: encoding: [0x62,0x61,0x7c,0x08,0x79,0x31]
+ vcvtps2udq (%rcx), %xmm30
+
+// CHECK: vcvtps2udq 291(%rax,%r14,8), %xmm30
+// CHECK: encoding: [0x62,0x21,0x7c,0x08,0x79,0xb4,0xf0,0x23,0x01,0x00,0x00]
+ vcvtps2udq 291(%rax,%r14,8), %xmm30
+
+// CHECK: vcvtps2udq (%rcx){1to4}, %xmm30
+// CHECK: encoding: [0x62,0x61,0x7c,0x18,0x79,0x31]
+ vcvtps2udq (%rcx){1to4}, %xmm30
+
+// CHECK: vcvtps2udq 2032(%rdx), %xmm30
+// CHECK: encoding: [0x62,0x61,0x7c,0x08,0x79,0x72,0x7f]
+ vcvtps2udq 2032(%rdx), %xmm30
+
+// CHECK: vcvtps2udq 2048(%rdx), %xmm30
+// CHECK: encoding: [0x62,0x61,0x7c,0x08,0x79,0xb2,0x00,0x08,0x00,0x00]
+ vcvtps2udq 2048(%rdx), %xmm30
+
+// CHECK: vcvtps2udq -2048(%rdx), %xmm30
+// CHECK: encoding: [0x62,0x61,0x7c,0x08,0x79,0x72,0x80]
+ vcvtps2udq -2048(%rdx), %xmm30
+
+// CHECK: vcvtps2udq -2064(%rdx), %xmm30
+// CHECK: encoding: [0x62,0x61,0x7c,0x08,0x79,0xb2,0xf0,0xf7,0xff,0xff]
+ vcvtps2udq -2064(%rdx), %xmm30
+
+// CHECK: vcvtps2udq 508(%rdx){1to4}, %xmm30
+// CHECK: encoding: [0x62,0x61,0x7c,0x18,0x79,0x72,0x7f]
+ vcvtps2udq 508(%rdx){1to4}, %xmm30
+
+// CHECK: vcvtps2udq 512(%rdx){1to4}, %xmm30
+// CHECK: encoding: [0x62,0x61,0x7c,0x18,0x79,0xb2,0x00,0x02,0x00,0x00]
+ vcvtps2udq 512(%rdx){1to4}, %xmm30
+
+// CHECK: vcvtps2udq -512(%rdx){1to4}, %xmm30
+// CHECK: encoding: [0x62,0x61,0x7c,0x18,0x79,0x72,0x80]
+ vcvtps2udq -512(%rdx){1to4}, %xmm30
+
+// CHECK: vcvtps2udq -516(%rdx){1to4}, %xmm30
+// CHECK: encoding: [0x62,0x61,0x7c,0x18,0x79,0xb2,0xfc,0xfd,0xff,0xff]
+ vcvtps2udq -516(%rdx){1to4}, %xmm30
+
+// CHECK: vcvtps2udq %ymm23, %ymm25
+// CHECK: encoding: [0x62,0x21,0x7c,0x28,0x79,0xcf]
+ vcvtps2udq %ymm23, %ymm25
+
+// CHECK: vcvtps2udq %ymm23, %ymm25 {%k4}
+// CHECK: encoding: [0x62,0x21,0x7c,0x2c,0x79,0xcf]
+ vcvtps2udq %ymm23, %ymm25 {%k4}
+
+// CHECK: vcvtps2udq %ymm23, %ymm25 {%k4} {z}
+// CHECK: encoding: [0x62,0x21,0x7c,0xac,0x79,0xcf]
+ vcvtps2udq %ymm23, %ymm25 {%k4} {z}
+
+// CHECK: vcvtps2udq (%rcx), %ymm25
+// CHECK: encoding: [0x62,0x61,0x7c,0x28,0x79,0x09]
+ vcvtps2udq (%rcx), %ymm25
+
+// CHECK: vcvtps2udq 291(%rax,%r14,8), %ymm25
+// CHECK: encoding: [0x62,0x21,0x7c,0x28,0x79,0x8c,0xf0,0x23,0x01,0x00,0x00]
+ vcvtps2udq 291(%rax,%r14,8), %ymm25
+
+// CHECK: vcvtps2udq (%rcx){1to8}, %ymm25
+// CHECK: encoding: [0x62,0x61,0x7c,0x38,0x79,0x09]
+ vcvtps2udq (%rcx){1to8}, %ymm25
+
+// CHECK: vcvtps2udq 4064(%rdx), %ymm25
+// CHECK: encoding: [0x62,0x61,0x7c,0x28,0x79,0x4a,0x7f]
+ vcvtps2udq 4064(%rdx), %ymm25
+
+// CHECK: vcvtps2udq 4096(%rdx), %ymm25
+// CHECK: encoding: [0x62,0x61,0x7c,0x28,0x79,0x8a,0x00,0x10,0x00,0x00]
+ vcvtps2udq 4096(%rdx), %ymm25
+
+// CHECK: vcvtps2udq -4096(%rdx), %ymm25
+// CHECK: encoding: [0x62,0x61,0x7c,0x28,0x79,0x4a,0x80]
+ vcvtps2udq -4096(%rdx), %ymm25
+
+// CHECK: vcvtps2udq -4128(%rdx), %ymm25
+// CHECK: encoding: [0x62,0x61,0x7c,0x28,0x79,0x8a,0xe0,0xef,0xff,0xff]
+ vcvtps2udq -4128(%rdx), %ymm25
+
+// CHECK: vcvtps2udq 508(%rdx){1to8}, %ymm25
+// CHECK: encoding: [0x62,0x61,0x7c,0x38,0x79,0x4a,0x7f]
+ vcvtps2udq 508(%rdx){1to8}, %ymm25
+
+// CHECK: vcvtps2udq 512(%rdx){1to8}, %ymm25
+// CHECK: encoding: [0x62,0x61,0x7c,0x38,0x79,0x8a,0x00,0x02,0x00,0x00]
+ vcvtps2udq 512(%rdx){1to8}, %ymm25
+
+// CHECK: vcvtps2udq -512(%rdx){1to8}, %ymm25
+// CHECK: encoding: [0x62,0x61,0x7c,0x38,0x79,0x4a,0x80]
+ vcvtps2udq -512(%rdx){1to8}, %ymm25
+
+// CHECK: vcvtps2udq -516(%rdx){1to8}, %ymm25
+// CHECK: encoding: [0x62,0x61,0x7c,0x38,0x79,0x8a,0xfc,0xfd,0xff,0xff]
+ vcvtps2udq -516(%rdx){1to8}, %ymm25
+
+// CHECK: vcvttpd2dq %xmm23, %xmm29
+// CHECK: encoding: [0x62,0x21,0xfd,0x08,0xe6,0xef]
+ vcvttpd2dq %xmm23, %xmm29
+
+// CHECK: vcvttpd2dq %xmm23, %xmm29 {%k6}
+// CHECK: encoding: [0x62,0x21,0xfd,0x0e,0xe6,0xef]
+ vcvttpd2dq %xmm23, %xmm29 {%k6}
+
+// CHECK: vcvttpd2dq %xmm23, %xmm29 {%k6} {z}
+// CHECK: encoding: [0x62,0x21,0xfd,0x8e,0xe6,0xef]
+ vcvttpd2dq %xmm23, %xmm29 {%k6} {z}
+
+// CHECK: vcvttpd2dqx (%rcx), %xmm29
+// CHECK: encoding: [0x62,0x61,0xfd,0x08,0xe6,0x29]
+ vcvttpd2dqx (%rcx), %xmm29
+
+// CHECK: vcvttpd2dqx 291(%rax,%r14,8), %xmm29
+// CHECK: encoding: [0x62,0x21,0xfd,0x08,0xe6,0xac,0xf0,0x23,0x01,0x00,0x00]
+ vcvttpd2dqx 291(%rax,%r14,8), %xmm29
+
+// CHECK: vcvttpd2dq (%rcx){1to2}, %xmm29
+// CHECK: encoding: [0x62,0x61,0xfd,0x18,0xe6,0x29]
+ vcvttpd2dq (%rcx){1to2}, %xmm29
+
+// CHECK: vcvttpd2dqx 2032(%rdx), %xmm29
+// CHECK: encoding: [0x62,0x61,0xfd,0x08,0xe6,0x6a,0x7f]
+ vcvttpd2dqx 2032(%rdx), %xmm29
+
+// CHECK: vcvttpd2dqx 2048(%rdx), %xmm29
+// CHECK: encoding: [0x62,0x61,0xfd,0x08,0xe6,0xaa,0x00,0x08,0x00,0x00]
+ vcvttpd2dqx 2048(%rdx), %xmm29
+
+// CHECK: vcvttpd2dqx -2048(%rdx), %xmm29
+// CHECK: encoding: [0x62,0x61,0xfd,0x08,0xe6,0x6a,0x80]
+ vcvttpd2dqx -2048(%rdx), %xmm29
+
+// CHECK: vcvttpd2dqx -2064(%rdx), %xmm29
+// CHECK: encoding: [0x62,0x61,0xfd,0x08,0xe6,0xaa,0xf0,0xf7,0xff,0xff]
+ vcvttpd2dqx -2064(%rdx), %xmm29
+
+// CHECK: vcvttpd2dq 1016(%rdx){1to2}, %xmm29
+// CHECK: encoding: [0x62,0x61,0xfd,0x18,0xe6,0x6a,0x7f]
+ vcvttpd2dq 1016(%rdx){1to2}, %xmm29
+
+// CHECK: vcvttpd2dq 1024(%rdx){1to2}, %xmm29
+// CHECK: encoding: [0x62,0x61,0xfd,0x18,0xe6,0xaa,0x00,0x04,0x00,0x00]
+ vcvttpd2dq 1024(%rdx){1to2}, %xmm29
+
+// CHECK: vcvttpd2dq -1024(%rdx){1to2}, %xmm29
+// CHECK: encoding: [0x62,0x61,0xfd,0x18,0xe6,0x6a,0x80]
+ vcvttpd2dq -1024(%rdx){1to2}, %xmm29
+
+// CHECK: vcvttpd2dq -1032(%rdx){1to2}, %xmm29
+// CHECK: encoding: [0x62,0x61,0xfd,0x18,0xe6,0xaa,0xf8,0xfb,0xff,0xff]
+ vcvttpd2dq -1032(%rdx){1to2}, %xmm29
+
+// CHECK: vcvttpd2dq %ymm29, %xmm30
+// CHECK: encoding: [0x62,0x01,0xfd,0x28,0xe6,0xf5]
+ vcvttpd2dq %ymm29, %xmm30
+
+// CHECK: vcvttpd2dq %ymm29, %xmm30 {%k6}
+// CHECK: encoding: [0x62,0x01,0xfd,0x2e,0xe6,0xf5]
+ vcvttpd2dq %ymm29, %xmm30 {%k6}
+
+// CHECK: vcvttpd2dq %ymm29, %xmm30 {%k6} {z}
+// CHECK: encoding: [0x62,0x01,0xfd,0xae,0xe6,0xf5]
+ vcvttpd2dq %ymm29, %xmm30 {%k6} {z}
+
+// CHECK: vcvttpd2dqy (%rcx), %xmm30
+// CHECK: encoding: [0x62,0x61,0xfd,0x28,0xe6,0x31]
+ vcvttpd2dqy (%rcx), %xmm30
+
+// CHECK: vcvttpd2dqy 291(%rax,%r14,8), %xmm30
+// CHECK: encoding: [0x62,0x21,0xfd,0x28,0xe6,0xb4,0xf0,0x23,0x01,0x00,0x00]
+ vcvttpd2dqy 291(%rax,%r14,8), %xmm30
+
+// CHECK: vcvttpd2dq (%rcx){1to4}, %xmm30
+// CHECK: encoding: [0x62,0x61,0xfd,0x38,0xe6,0x31]
+ vcvttpd2dq (%rcx){1to4}, %xmm30
+
+// CHECK: vcvttpd2dqy 4064(%rdx), %xmm30
+// CHECK: encoding: [0x62,0x61,0xfd,0x28,0xe6,0x72,0x7f]
+ vcvttpd2dqy 4064(%rdx), %xmm30
+
+// CHECK: vcvttpd2dqy 4096(%rdx), %xmm30
+// CHECK: encoding: [0x62,0x61,0xfd,0x28,0xe6,0xb2,0x00,0x10,0x00,0x00]
+ vcvttpd2dqy 4096(%rdx), %xmm30
+
+// CHECK: vcvttpd2dqy -4096(%rdx), %xmm30
+// CHECK: encoding: [0x62,0x61,0xfd,0x28,0xe6,0x72,0x80]
+ vcvttpd2dqy -4096(%rdx), %xmm30
+
+// CHECK: vcvttpd2dqy -4128(%rdx), %xmm30
+// CHECK: encoding: [0x62,0x61,0xfd,0x28,0xe6,0xb2,0xe0,0xef,0xff,0xff]
+ vcvttpd2dqy -4128(%rdx), %xmm30
+
+// CHECK: vcvttpd2dq 1016(%rdx){1to4}, %xmm30
+// CHECK: encoding: [0x62,0x61,0xfd,0x38,0xe6,0x72,0x7f]
+ vcvttpd2dq 1016(%rdx){1to4}, %xmm30
+
+// CHECK: vcvttpd2dq 1024(%rdx){1to4}, %xmm30
+// CHECK: encoding: [0x62,0x61,0xfd,0x38,0xe6,0xb2,0x00,0x04,0x00,0x00]
+ vcvttpd2dq 1024(%rdx){1to4}, %xmm30
+
+// CHECK: vcvttpd2dq -1024(%rdx){1to4}, %xmm30
+// CHECK: encoding: [0x62,0x61,0xfd,0x38,0xe6,0x72,0x80]
+ vcvttpd2dq -1024(%rdx){1to4}, %xmm30
+
+// CHECK: vcvttpd2dq -1032(%rdx){1to4}, %xmm30
+// CHECK: encoding: [0x62,0x61,0xfd,0x38,0xe6,0xb2,0xf8,0xfb,0xff,0xff]
+ vcvttpd2dq -1032(%rdx){1to4}, %xmm30
+
+// CHECK: vcvttps2dq %xmm22, %xmm17
+// CHECK: encoding: [0x62,0xa1,0x7e,0x08,0x5b,0xce]
+ vcvttps2dq %xmm22, %xmm17
+
+// CHECK: vcvttps2dq %xmm22, %xmm17 {%k4}
+// CHECK: encoding: [0x62,0xa1,0x7e,0x0c,0x5b,0xce]
+ vcvttps2dq %xmm22, %xmm17 {%k4}
+
+// CHECK: vcvttps2dq %xmm22, %xmm17 {%k4} {z}
+// CHECK: encoding: [0x62,0xa1,0x7e,0x8c,0x5b,0xce]
+ vcvttps2dq %xmm22, %xmm17 {%k4} {z}
+
+// CHECK: vcvttps2dq (%rcx), %xmm17
+// CHECK: encoding: [0x62,0xe1,0x7e,0x08,0x5b,0x09]
+ vcvttps2dq (%rcx), %xmm17
+
+// CHECK: vcvttps2dq 291(%rax,%r14,8), %xmm17
+// CHECK: encoding: [0x62,0xa1,0x7e,0x08,0x5b,0x8c,0xf0,0x23,0x01,0x00,0x00]
+ vcvttps2dq 291(%rax,%r14,8), %xmm17
+
+// CHECK: vcvttps2dq (%rcx){1to4}, %xmm17
+// CHECK: encoding: [0x62,0xe1,0x7e,0x18,0x5b,0x09]
+ vcvttps2dq (%rcx){1to4}, %xmm17
+
+// CHECK: vcvttps2dq 2032(%rdx), %xmm17
+// CHECK: encoding: [0x62,0xe1,0x7e,0x08,0x5b,0x4a,0x7f]
+ vcvttps2dq 2032(%rdx), %xmm17
+
+// CHECK: vcvttps2dq 2048(%rdx), %xmm17
+// CHECK: encoding: [0x62,0xe1,0x7e,0x08,0x5b,0x8a,0x00,0x08,0x00,0x00]
+ vcvttps2dq 2048(%rdx), %xmm17
+
+// CHECK: vcvttps2dq -2048(%rdx), %xmm17
+// CHECK: encoding: [0x62,0xe1,0x7e,0x08,0x5b,0x4a,0x80]
+ vcvttps2dq -2048(%rdx), %xmm17
+
+// CHECK: vcvttps2dq -2064(%rdx), %xmm17
+// CHECK: encoding: [0x62,0xe1,0x7e,0x08,0x5b,0x8a,0xf0,0xf7,0xff,0xff]
+ vcvttps2dq -2064(%rdx), %xmm17
+
+// CHECK: vcvttps2dq 508(%rdx){1to4}, %xmm17
+// CHECK: encoding: [0x62,0xe1,0x7e,0x18,0x5b,0x4a,0x7f]
+ vcvttps2dq 508(%rdx){1to4}, %xmm17
+
+// CHECK: vcvttps2dq 512(%rdx){1to4}, %xmm17
+// CHECK: encoding: [0x62,0xe1,0x7e,0x18,0x5b,0x8a,0x00,0x02,0x00,0x00]
+ vcvttps2dq 512(%rdx){1to4}, %xmm17
+
+// CHECK: vcvttps2dq -512(%rdx){1to4}, %xmm17
+// CHECK: encoding: [0x62,0xe1,0x7e,0x18,0x5b,0x4a,0x80]
+ vcvttps2dq -512(%rdx){1to4}, %xmm17
+
+// CHECK: vcvttps2dq -516(%rdx){1to4}, %xmm17
+// CHECK: encoding: [0x62,0xe1,0x7e,0x18,0x5b,0x8a,0xfc,0xfd,0xff,0xff]
+ vcvttps2dq -516(%rdx){1to4}, %xmm17
+
+// CHECK: vcvttps2dq %ymm28, %ymm25
+// CHECK: encoding: [0x62,0x01,0x7e,0x28,0x5b,0xcc]
+ vcvttps2dq %ymm28, %ymm25
+
+// CHECK: vcvttps2dq %ymm28, %ymm25 {%k2}
+// CHECK: encoding: [0x62,0x01,0x7e,0x2a,0x5b,0xcc]
+ vcvttps2dq %ymm28, %ymm25 {%k2}
+
+// CHECK: vcvttps2dq %ymm28, %ymm25 {%k2} {z}
+// CHECK: encoding: [0x62,0x01,0x7e,0xaa,0x5b,0xcc]
+ vcvttps2dq %ymm28, %ymm25 {%k2} {z}
+
+// CHECK: vcvttps2dq (%rcx), %ymm25
+// CHECK: encoding: [0x62,0x61,0x7e,0x28,0x5b,0x09]
+ vcvttps2dq (%rcx), %ymm25
+
+// CHECK: vcvttps2dq 291(%rax,%r14,8), %ymm25
+// CHECK: encoding: [0x62,0x21,0x7e,0x28,0x5b,0x8c,0xf0,0x23,0x01,0x00,0x00]
+ vcvttps2dq 291(%rax,%r14,8), %ymm25
+
+// CHECK: vcvttps2dq (%rcx){1to8}, %ymm25
+// CHECK: encoding: [0x62,0x61,0x7e,0x38,0x5b,0x09]
+ vcvttps2dq (%rcx){1to8}, %ymm25
+
+// CHECK: vcvttps2dq 4064(%rdx), %ymm25
+// CHECK: encoding: [0x62,0x61,0x7e,0x28,0x5b,0x4a,0x7f]
+ vcvttps2dq 4064(%rdx), %ymm25
+
+// CHECK: vcvttps2dq 4096(%rdx), %ymm25
+// CHECK: encoding: [0x62,0x61,0x7e,0x28,0x5b,0x8a,0x00,0x10,0x00,0x00]
+ vcvttps2dq 4096(%rdx), %ymm25
+
+// CHECK: vcvttps2dq -4096(%rdx), %ymm25
+// CHECK: encoding: [0x62,0x61,0x7e,0x28,0x5b,0x4a,0x80]
+ vcvttps2dq -4096(%rdx), %ymm25
+
+// CHECK: vcvttps2dq -4128(%rdx), %ymm25
+// CHECK: encoding: [0x62,0x61,0x7e,0x28,0x5b,0x8a,0xe0,0xef,0xff,0xff]
+ vcvttps2dq -4128(%rdx), %ymm25
+
+// CHECK: vcvttps2dq 508(%rdx){1to8}, %ymm25
+// CHECK: encoding: [0x62,0x61,0x7e,0x38,0x5b,0x4a,0x7f]
+ vcvttps2dq 508(%rdx){1to8}, %ymm25
+
+// CHECK: vcvttps2dq 512(%rdx){1to8}, %ymm25
+// CHECK: encoding: [0x62,0x61,0x7e,0x38,0x5b,0x8a,0x00,0x02,0x00,0x00]
+ vcvttps2dq 512(%rdx){1to8}, %ymm25
+
+// CHECK: vcvttps2dq -512(%rdx){1to8}, %ymm25
+// CHECK: encoding: [0x62,0x61,0x7e,0x38,0x5b,0x4a,0x80]
+ vcvttps2dq -512(%rdx){1to8}, %ymm25
+
+// CHECK: vcvttps2dq -516(%rdx){1to8}, %ymm25
+// CHECK: encoding: [0x62,0x61,0x7e,0x38,0x5b,0x8a,0xfc,0xfd,0xff,0xff]
+ vcvttps2dq -516(%rdx){1to8}, %ymm25
+
+// CHECK: vcvtudq2pd %xmm19, %xmm25
+// CHECK: encoding: [0x62,0x21,0x7e,0x08,0x7a,0xcb]
+ vcvtudq2pd %xmm19, %xmm25
+
+// CHECK: vcvtudq2pd %xmm19, %xmm25 {%k4}
+// CHECK: encoding: [0x62,0x21,0x7e,0x0c,0x7a,0xcb]
+ vcvtudq2pd %xmm19, %xmm25 {%k4}
+
+// CHECK: vcvtudq2pd %xmm19, %xmm25 {%k4} {z}
+// CHECK: encoding: [0x62,0x21,0x7e,0x8c,0x7a,0xcb]
+ vcvtudq2pd %xmm19, %xmm25 {%k4} {z}
+
+// CHECK: vcvtudq2pd (%rcx), %xmm25
+// CHECK: encoding: [0x62,0x61,0x7e,0x08,0x7a,0x09]
+ vcvtudq2pd (%rcx), %xmm25
+
+// CHECK: vcvtudq2pd 291(%rax,%r14,8), %xmm25
+// CHECK: encoding: [0x62,0x21,0x7e,0x08,0x7a,0x8c,0xf0,0x23,0x01,0x00,0x00]
+ vcvtudq2pd 291(%rax,%r14,8), %xmm25
+
+// CHECK: vcvtudq2pd (%rcx){1to2}, %xmm25
+// CHECK: encoding: [0x62,0x61,0x7e,0x18,0x7a,0x09]
+ vcvtudq2pd (%rcx){1to2}, %xmm25
+
+// CHECK: vcvtudq2pd 1016(%rdx), %xmm25
+// CHECK: encoding: [0x62,0x61,0x7e,0x08,0x7a,0x4a,0x7f]
+ vcvtudq2pd 1016(%rdx), %xmm25
+
+// CHECK: vcvtudq2pd 1024(%rdx), %xmm25
+// CHECK: encoding: [0x62,0x61,0x7e,0x08,0x7a,0x8a,0x00,0x04,0x00,0x00]
+ vcvtudq2pd 1024(%rdx), %xmm25
+
+// CHECK: vcvtudq2pd -1024(%rdx), %xmm25
+// CHECK: encoding: [0x62,0x61,0x7e,0x08,0x7a,0x4a,0x80]
+ vcvtudq2pd -1024(%rdx), %xmm25
+
+// CHECK: vcvtudq2pd -1032(%rdx), %xmm25
+// CHECK: encoding: [0x62,0x61,0x7e,0x08,0x7a,0x8a,0xf8,0xfb,0xff,0xff]
+ vcvtudq2pd -1032(%rdx), %xmm25
+
+// CHECK: vcvtudq2pd 508(%rdx){1to2}, %xmm25
+// CHECK: encoding: [0x62,0x61,0x7e,0x18,0x7a,0x4a,0x7f]
+ vcvtudq2pd 508(%rdx){1to2}, %xmm25
+
+// CHECK: vcvtudq2pd 512(%rdx){1to2}, %xmm25
+// CHECK: encoding: [0x62,0x61,0x7e,0x18,0x7a,0x8a,0x00,0x02,0x00,0x00]
+ vcvtudq2pd 512(%rdx){1to2}, %xmm25
+
+// CHECK: vcvtudq2pd -512(%rdx){1to2}, %xmm25
+// CHECK: encoding: [0x62,0x61,0x7e,0x18,0x7a,0x4a,0x80]
+ vcvtudq2pd -512(%rdx){1to2}, %xmm25
+
+// CHECK: vcvtudq2pd -516(%rdx){1to2}, %xmm25
+// CHECK: encoding: [0x62,0x61,0x7e,0x18,0x7a,0x8a,0xfc,0xfd,0xff,0xff]
+ vcvtudq2pd -516(%rdx){1to2}, %xmm25
+
+// CHECK: vcvtudq2pd %xmm20, %ymm25
+// CHECK: encoding: [0x62,0x21,0x7e,0x28,0x7a,0xcc]
+ vcvtudq2pd %xmm20, %ymm25
+
+// CHECK: vcvtudq2pd %xmm20, %ymm25 {%k6}
+// CHECK: encoding: [0x62,0x21,0x7e,0x2e,0x7a,0xcc]
+ vcvtudq2pd %xmm20, %ymm25 {%k6}
+
+// CHECK: vcvtudq2pd %xmm20, %ymm25 {%k6} {z}
+// CHECK: encoding: [0x62,0x21,0x7e,0xae,0x7a,0xcc]
+ vcvtudq2pd %xmm20, %ymm25 {%k6} {z}
+
+// CHECK: vcvtudq2pd (%rcx), %ymm25
+// CHECK: encoding: [0x62,0x61,0x7e,0x28,0x7a,0x09]
+ vcvtudq2pd (%rcx), %ymm25
+
+// CHECK: vcvtudq2pd 291(%rax,%r14,8), %ymm25
+// CHECK: encoding: [0x62,0x21,0x7e,0x28,0x7a,0x8c,0xf0,0x23,0x01,0x00,0x00]
+ vcvtudq2pd 291(%rax,%r14,8), %ymm25
+
+// CHECK: vcvtudq2pd (%rcx){1to4}, %ymm25
+// CHECK: encoding: [0x62,0x61,0x7e,0x38,0x7a,0x09]
+ vcvtudq2pd (%rcx){1to4}, %ymm25
+
+// CHECK: vcvtudq2pd 2032(%rdx), %ymm25
+// CHECK: encoding: [0x62,0x61,0x7e,0x28,0x7a,0x4a,0x7f]
+ vcvtudq2pd 2032(%rdx), %ymm25
+
+// CHECK: vcvtudq2pd 2048(%rdx), %ymm25
+// CHECK: encoding: [0x62,0x61,0x7e,0x28,0x7a,0x8a,0x00,0x08,0x00,0x00]
+ vcvtudq2pd 2048(%rdx), %ymm25
+
+// CHECK: vcvtudq2pd -2048(%rdx), %ymm25
+// CHECK: encoding: [0x62,0x61,0x7e,0x28,0x7a,0x4a,0x80]
+ vcvtudq2pd -2048(%rdx), %ymm25
+
+// CHECK: vcvtudq2pd -2064(%rdx), %ymm25
+// CHECK: encoding: [0x62,0x61,0x7e,0x28,0x7a,0x8a,0xf0,0xf7,0xff,0xff]
+ vcvtudq2pd -2064(%rdx), %ymm25
+
+// CHECK: vcvtudq2pd 508(%rdx){1to4}, %ymm25
+// CHECK: encoding: [0x62,0x61,0x7e,0x38,0x7a,0x4a,0x7f]
+ vcvtudq2pd 508(%rdx){1to4}, %ymm25
+
+// CHECK: vcvtudq2pd 512(%rdx){1to4}, %ymm25
+// CHECK: encoding: [0x62,0x61,0x7e,0x38,0x7a,0x8a,0x00,0x02,0x00,0x00]
+ vcvtudq2pd 512(%rdx){1to4}, %ymm25
+
+// CHECK: vcvtudq2pd -512(%rdx){1to4}, %ymm25
+// CHECK: encoding: [0x62,0x61,0x7e,0x38,0x7a,0x4a,0x80]
+ vcvtudq2pd -512(%rdx){1to4}, %ymm25
+
+// CHECK: vcvtudq2pd -516(%rdx){1to4}, %ymm25
+// CHECK: encoding: [0x62,0x61,0x7e,0x38,0x7a,0x8a,0xfc,0xfd,0xff,0xff]
+ vcvtudq2pd -516(%rdx){1to4}, %ymm25
+
+// CHECK: vcvtudq2ps %xmm23, %xmm23
+// CHECK: encoding: [0x62,0xa1,0x7f,0x08,0x7a,0xff]
+ vcvtudq2ps %xmm23, %xmm23
+
+// CHECK: vcvtudq2ps %xmm23, %xmm23 {%k1}
+// CHECK: encoding: [0x62,0xa1,0x7f,0x09,0x7a,0xff]
+ vcvtudq2ps %xmm23, %xmm23 {%k1}
+
+// CHECK: vcvtudq2ps %xmm23, %xmm23 {%k1} {z}
+// CHECK: encoding: [0x62,0xa1,0x7f,0x89,0x7a,0xff]
+ vcvtudq2ps %xmm23, %xmm23 {%k1} {z}
+
+// CHECK: vcvtudq2ps (%rcx), %xmm23
+// CHECK: encoding: [0x62,0xe1,0x7f,0x08,0x7a,0x39]
+ vcvtudq2ps (%rcx), %xmm23
+
+// CHECK: vcvtudq2ps 291(%rax,%r14,8), %xmm23
+// CHECK: encoding: [0x62,0xa1,0x7f,0x08,0x7a,0xbc,0xf0,0x23,0x01,0x00,0x00]
+ vcvtudq2ps 291(%rax,%r14,8), %xmm23
+
+// CHECK: vcvtudq2ps (%rcx){1to4}, %xmm23
+// CHECK: encoding: [0x62,0xe1,0x7f,0x18,0x7a,0x39]
+ vcvtudq2ps (%rcx){1to4}, %xmm23
+
+// CHECK: vcvtudq2ps 2032(%rdx), %xmm23
+// CHECK: encoding: [0x62,0xe1,0x7f,0x08,0x7a,0x7a,0x7f]
+ vcvtudq2ps 2032(%rdx), %xmm23
+
+// CHECK: vcvtudq2ps 2048(%rdx), %xmm23
+// CHECK: encoding: [0x62,0xe1,0x7f,0x08,0x7a,0xba,0x00,0x08,0x00,0x00]
+ vcvtudq2ps 2048(%rdx), %xmm23
+
+// CHECK: vcvtudq2ps -2048(%rdx), %xmm23
+// CHECK: encoding: [0x62,0xe1,0x7f,0x08,0x7a,0x7a,0x80]
+ vcvtudq2ps -2048(%rdx), %xmm23
+
+// CHECK: vcvtudq2ps -2064(%rdx), %xmm23
+// CHECK: encoding: [0x62,0xe1,0x7f,0x08,0x7a,0xba,0xf0,0xf7,0xff,0xff]
+ vcvtudq2ps -2064(%rdx), %xmm23
+
+// CHECK: vcvtudq2ps 508(%rdx){1to4}, %xmm23
+// CHECK: encoding: [0x62,0xe1,0x7f,0x18,0x7a,0x7a,0x7f]
+ vcvtudq2ps 508(%rdx){1to4}, %xmm23
+
+// CHECK: vcvtudq2ps 512(%rdx){1to4}, %xmm23
+// CHECK: encoding: [0x62,0xe1,0x7f,0x18,0x7a,0xba,0x00,0x02,0x00,0x00]
+ vcvtudq2ps 512(%rdx){1to4}, %xmm23
+
+// CHECK: vcvtudq2ps -512(%rdx){1to4}, %xmm23
+// CHECK: encoding: [0x62,0xe1,0x7f,0x18,0x7a,0x7a,0x80]
+ vcvtudq2ps -512(%rdx){1to4}, %xmm23
+
+// CHECK: vcvtudq2ps -516(%rdx){1to4}, %xmm23
+// CHECK: encoding: [0x62,0xe1,0x7f,0x18,0x7a,0xba,0xfc,0xfd,0xff,0xff]
+ vcvtudq2ps -516(%rdx){1to4}, %xmm23
+
+// CHECK: vcvtudq2ps %ymm25, %ymm23
+// CHECK: encoding: [0x62,0x81,0x7f,0x28,0x7a,0xf9]
+ vcvtudq2ps %ymm25, %ymm23
+
+// CHECK: vcvtudq2ps %ymm25, %ymm23 {%k3}
+// CHECK: encoding: [0x62,0x81,0x7f,0x2b,0x7a,0xf9]
+ vcvtudq2ps %ymm25, %ymm23 {%k3}
+
+// CHECK: vcvtudq2ps %ymm25, %ymm23 {%k3} {z}
+// CHECK: encoding: [0x62,0x81,0x7f,0xab,0x7a,0xf9]
+ vcvtudq2ps %ymm25, %ymm23 {%k3} {z}
+
+// CHECK: vcvtudq2ps (%rcx), %ymm23
+// CHECK: encoding: [0x62,0xe1,0x7f,0x28,0x7a,0x39]
+ vcvtudq2ps (%rcx), %ymm23
+
+// CHECK: vcvtudq2ps 291(%rax,%r14,8), %ymm23
+// CHECK: encoding: [0x62,0xa1,0x7f,0x28,0x7a,0xbc,0xf0,0x23,0x01,0x00,0x00]
+ vcvtudq2ps 291(%rax,%r14,8), %ymm23
+
+// CHECK: vcvtudq2ps (%rcx){1to8}, %ymm23
+// CHECK: encoding: [0x62,0xe1,0x7f,0x38,0x7a,0x39]
+ vcvtudq2ps (%rcx){1to8}, %ymm23
+
+// CHECK: vcvtudq2ps 4064(%rdx), %ymm23
+// CHECK: encoding: [0x62,0xe1,0x7f,0x28,0x7a,0x7a,0x7f]
+ vcvtudq2ps 4064(%rdx), %ymm23
+
+// CHECK: vcvtudq2ps 4096(%rdx), %ymm23
+// CHECK: encoding: [0x62,0xe1,0x7f,0x28,0x7a,0xba,0x00,0x10,0x00,0x00]
+ vcvtudq2ps 4096(%rdx), %ymm23
+
+// CHECK: vcvtudq2ps -4096(%rdx), %ymm23
+// CHECK: encoding: [0x62,0xe1,0x7f,0x28,0x7a,0x7a,0x80]
+ vcvtudq2ps -4096(%rdx), %ymm23
+
+// CHECK: vcvtudq2ps -4128(%rdx), %ymm23
+// CHECK: encoding: [0x62,0xe1,0x7f,0x28,0x7a,0xba,0xe0,0xef,0xff,0xff]
+ vcvtudq2ps -4128(%rdx), %ymm23
+
+// CHECK: vcvtudq2ps 508(%rdx){1to8}, %ymm23
+// CHECK: encoding: [0x62,0xe1,0x7f,0x38,0x7a,0x7a,0x7f]
+ vcvtudq2ps 508(%rdx){1to8}, %ymm23
+
+// CHECK: vcvtudq2ps 512(%rdx){1to8}, %ymm23
+// CHECK: encoding: [0x62,0xe1,0x7f,0x38,0x7a,0xba,0x00,0x02,0x00,0x00]
+ vcvtudq2ps 512(%rdx){1to8}, %ymm23
+
+// CHECK: vcvtudq2ps -512(%rdx){1to8}, %ymm23
+// CHECK: encoding: [0x62,0xe1,0x7f,0x38,0x7a,0x7a,0x80]
+ vcvtudq2ps -512(%rdx){1to8}, %ymm23
+
+// CHECK: vcvtudq2ps -516(%rdx){1to8}, %ymm23
+// CHECK: encoding: [0x62,0xe1,0x7f,0x38,0x7a,0xba,0xfc,0xfd,0xff,0xff]
+ vcvtudq2ps -516(%rdx){1to8}, %ymm23
+
+// CHECK: vcvtdq2pd %xmm21, %xmm20
+// CHECK: encoding: [0x62,0xa1,0x7e,0x08,0xe6,0xe5]
+ vcvtdq2pd %xmm21, %xmm20
+
+// CHECK: vcvtdq2pd %xmm21, %xmm20 {%k5}
+// CHECK: encoding: [0x62,0xa1,0x7e,0x0d,0xe6,0xe5]
+ vcvtdq2pd %xmm21, %xmm20 {%k5}
+
+// CHECK: vcvtdq2pd %xmm21, %xmm20 {%k5} {z}
+// CHECK: encoding: [0x62,0xa1,0x7e,0x8d,0xe6,0xe5]
+ vcvtdq2pd %xmm21, %xmm20 {%k5} {z}
+
+// CHECK: vcvtdq2pd (%rcx), %xmm20
+// CHECK: encoding: [0x62,0xe1,0x7e,0x08,0xe6,0x21]
+ vcvtdq2pd (%rcx), %xmm20
+
+// CHECK: vcvtdq2pd 291(%rax,%r14,8), %xmm20
+// CHECK: encoding: [0x62,0xa1,0x7e,0x08,0xe6,0xa4,0xf0,0x23,0x01,0x00,0x00]
+ vcvtdq2pd 291(%rax,%r14,8), %xmm20
+
+// CHECK: vcvtdq2pd (%rcx){1to2}, %xmm20
+// CHECK: encoding: [0x62,0xe1,0x7e,0x18,0xe6,0x21]
+ vcvtdq2pd (%rcx){1to2}, %xmm20
+
+// CHECK: vcvtdq2pd 1016(%rdx), %xmm20
+// CHECK: encoding: [0x62,0xe1,0x7e,0x08,0xe6,0x62,0x7f]
+ vcvtdq2pd 1016(%rdx), %xmm20
+
+// CHECK: vcvtdq2pd 1024(%rdx), %xmm20
+// CHECK: encoding: [0x62,0xe1,0x7e,0x08,0xe6,0xa2,0x00,0x04,0x00,0x00]
+ vcvtdq2pd 1024(%rdx), %xmm20
+
+// CHECK: vcvtdq2pd -1024(%rdx), %xmm20
+// CHECK: encoding: [0x62,0xe1,0x7e,0x08,0xe6,0x62,0x80]
+ vcvtdq2pd -1024(%rdx), %xmm20
+
+// CHECK: vcvtdq2pd -1032(%rdx), %xmm20
+// CHECK: encoding: [0x62,0xe1,0x7e,0x08,0xe6,0xa2,0xf8,0xfb,0xff,0xff]
+ vcvtdq2pd -1032(%rdx), %xmm20
+
+// CHECK: vcvtdq2pd 508(%rdx){1to2}, %xmm20
+// CHECK: encoding: [0x62,0xe1,0x7e,0x18,0xe6,0x62,0x7f]
+ vcvtdq2pd 508(%rdx){1to2}, %xmm20
+
+// CHECK: vcvtdq2pd 512(%rdx){1to2}, %xmm20
+// CHECK: encoding: [0x62,0xe1,0x7e,0x18,0xe6,0xa2,0x00,0x02,0x00,0x00]
+ vcvtdq2pd 512(%rdx){1to2}, %xmm20
+
+// CHECK: vcvtdq2pd -512(%rdx){1to2}, %xmm20
+// CHECK: encoding: [0x62,0xe1,0x7e,0x18,0xe6,0x62,0x80]
+ vcvtdq2pd -512(%rdx){1to2}, %xmm20
+
+// CHECK: vcvtdq2pd -516(%rdx){1to2}, %xmm20
+// CHECK: encoding: [0x62,0xe1,0x7e,0x18,0xe6,0xa2,0xfc,0xfd,0xff,0xff]
+ vcvtdq2pd -516(%rdx){1to2}, %xmm20
+
+// CHECK: vcvtdq2pd %xmm23, %ymm28
+// CHECK: encoding: [0x62,0x21,0x7e,0x28,0xe6,0xe7]
+ vcvtdq2pd %xmm23, %ymm28
+
+// CHECK: vcvtdq2pd %xmm23, %ymm28 {%k2}
+// CHECK: encoding: [0x62,0x21,0x7e,0x2a,0xe6,0xe7]
+ vcvtdq2pd %xmm23, %ymm28 {%k2}
+
+// CHECK: vcvtdq2pd %xmm23, %ymm28 {%k2} {z}
+// CHECK: encoding: [0x62,0x21,0x7e,0xaa,0xe6,0xe7]
+ vcvtdq2pd %xmm23, %ymm28 {%k2} {z}
+
+// CHECK: vcvtdq2pd (%rcx), %ymm28
+// CHECK: encoding: [0x62,0x61,0x7e,0x28,0xe6,0x21]
+ vcvtdq2pd (%rcx), %ymm28
+
+// CHECK: vcvtdq2pd 291(%rax,%r14,8), %ymm28
+// CHECK: encoding: [0x62,0x21,0x7e,0x28,0xe6,0xa4,0xf0,0x23,0x01,0x00,0x00]
+ vcvtdq2pd 291(%rax,%r14,8), %ymm28
+
+// CHECK: vcvtdq2pd (%rcx){1to4}, %ymm28
+// CHECK: encoding: [0x62,0x61,0x7e,0x38,0xe6,0x21]
+ vcvtdq2pd (%rcx){1to4}, %ymm28
+
+// CHECK: vcvtdq2pd 2032(%rdx), %ymm28
+// CHECK: encoding: [0x62,0x61,0x7e,0x28,0xe6,0x62,0x7f]
+ vcvtdq2pd 2032(%rdx), %ymm28
+
+// CHECK: vcvtdq2pd 2048(%rdx), %ymm28
+// CHECK: encoding: [0x62,0x61,0x7e,0x28,0xe6,0xa2,0x00,0x08,0x00,0x00]
+ vcvtdq2pd 2048(%rdx), %ymm28
+
+// CHECK: vcvtdq2pd -2048(%rdx), %ymm28
+// CHECK: encoding: [0x62,0x61,0x7e,0x28,0xe6,0x62,0x80]
+ vcvtdq2pd -2048(%rdx), %ymm28
+
+// CHECK: vcvtdq2pd -2064(%rdx), %ymm28
+// CHECK: encoding: [0x62,0x61,0x7e,0x28,0xe6,0xa2,0xf0,0xf7,0xff,0xff]
+ vcvtdq2pd -2064(%rdx), %ymm28
+
+// CHECK: vcvtdq2pd 508(%rdx){1to4}, %ymm28
+// CHECK: encoding: [0x62,0x61,0x7e,0x38,0xe6,0x62,0x7f]
+ vcvtdq2pd 508(%rdx){1to4}, %ymm28
+
+// CHECK: vcvtdq2pd 512(%rdx){1to4}, %ymm28
+// CHECK: encoding: [0x62,0x61,0x7e,0x38,0xe6,0xa2,0x00,0x02,0x00,0x00]
+ vcvtdq2pd 512(%rdx){1to4}, %ymm28
+
+// CHECK: vcvtdq2pd -512(%rdx){1to4}, %ymm28
+// CHECK: encoding: [0x62,0x61,0x7e,0x38,0xe6,0x62,0x80]
+ vcvtdq2pd -512(%rdx){1to4}, %ymm28
+
+// CHECK: vcvtdq2pd -516(%rdx){1to4}, %ymm28
+// CHECK: encoding: [0x62,0x61,0x7e,0x38,0xe6,0xa2,0xfc,0xfd,0xff,0xff]
+ vcvtdq2pd -516(%rdx){1to4}, %ymm28
+
+// CHECK: vcvtdq2ps %xmm22, %xmm28
+// CHECK: encoding: [0x62,0x21,0x7c,0x08,0x5b,0xe6]
+ vcvtdq2ps %xmm22, %xmm28
+
+// CHECK: vcvtdq2ps %xmm22, %xmm28 {%k1}
+// CHECK: encoding: [0x62,0x21,0x7c,0x09,0x5b,0xe6]
+ vcvtdq2ps %xmm22, %xmm28 {%k1}
+
+// CHECK: vcvtdq2ps %xmm22, %xmm28 {%k1} {z}
+// CHECK: encoding: [0x62,0x21,0x7c,0x89,0x5b,0xe6]
+ vcvtdq2ps %xmm22, %xmm28 {%k1} {z}
+
+// CHECK: vcvtdq2ps (%rcx), %xmm28
+// CHECK: encoding: [0x62,0x61,0x7c,0x08,0x5b,0x21]
+ vcvtdq2ps (%rcx), %xmm28
+
+// CHECK: vcvtdq2ps 291(%rax,%r14,8), %xmm28
+// CHECK: encoding: [0x62,0x21,0x7c,0x08,0x5b,0xa4,0xf0,0x23,0x01,0x00,0x00]
+ vcvtdq2ps 291(%rax,%r14,8), %xmm28
+
+// CHECK: vcvtdq2ps (%rcx){1to4}, %xmm28
+// CHECK: encoding: [0x62,0x61,0x7c,0x18,0x5b,0x21]
+ vcvtdq2ps (%rcx){1to4}, %xmm28
+
+// CHECK: vcvtdq2ps 2032(%rdx), %xmm28
+// CHECK: encoding: [0x62,0x61,0x7c,0x08,0x5b,0x62,0x7f]
+ vcvtdq2ps 2032(%rdx), %xmm28
+
+// CHECK: vcvtdq2ps 2048(%rdx), %xmm28
+// CHECK: encoding: [0x62,0x61,0x7c,0x08,0x5b,0xa2,0x00,0x08,0x00,0x00]
+ vcvtdq2ps 2048(%rdx), %xmm28
+
+// CHECK: vcvtdq2ps -2048(%rdx), %xmm28
+// CHECK: encoding: [0x62,0x61,0x7c,0x08,0x5b,0x62,0x80]
+ vcvtdq2ps -2048(%rdx), %xmm28
+
+// CHECK: vcvtdq2ps -2064(%rdx), %xmm28
+// CHECK: encoding: [0x62,0x61,0x7c,0x08,0x5b,0xa2,0xf0,0xf7,0xff,0xff]
+ vcvtdq2ps -2064(%rdx), %xmm28
+
+// CHECK: vcvtdq2ps 508(%rdx){1to4}, %xmm28
+// CHECK: encoding: [0x62,0x61,0x7c,0x18,0x5b,0x62,0x7f]
+ vcvtdq2ps 508(%rdx){1to4}, %xmm28
+
+// CHECK: vcvtdq2ps 512(%rdx){1to4}, %xmm28
+// CHECK: encoding: [0x62,0x61,0x7c,0x18,0x5b,0xa2,0x00,0x02,0x00,0x00]
+ vcvtdq2ps 512(%rdx){1to4}, %xmm28
+
+// CHECK: vcvtdq2ps -512(%rdx){1to4}, %xmm28
+// CHECK: encoding: [0x62,0x61,0x7c,0x18,0x5b,0x62,0x80]
+ vcvtdq2ps -512(%rdx){1to4}, %xmm28
+
+// CHECK: vcvtdq2ps -516(%rdx){1to4}, %xmm28
+// CHECK: encoding: [0x62,0x61,0x7c,0x18,0x5b,0xa2,0xfc,0xfd,0xff,0xff]
+ vcvtdq2ps -516(%rdx){1to4}, %xmm28
+
+// CHECK: vcvtdq2ps %ymm23, %ymm24
+// CHECK: encoding: [0x62,0x21,0x7c,0x28,0x5b,0xc7]
+ vcvtdq2ps %ymm23, %ymm24
+
+// CHECK: vcvtdq2ps %ymm23, %ymm24 {%k3}
+// CHECK: encoding: [0x62,0x21,0x7c,0x2b,0x5b,0xc7]
+ vcvtdq2ps %ymm23, %ymm24 {%k3}
+
+// CHECK: vcvtdq2ps %ymm23, %ymm24 {%k3} {z}
+// CHECK: encoding: [0x62,0x21,0x7c,0xab,0x5b,0xc7]
+ vcvtdq2ps %ymm23, %ymm24 {%k3} {z}
+
+// CHECK: vcvtdq2ps (%rcx), %ymm24
+// CHECK: encoding: [0x62,0x61,0x7c,0x28,0x5b,0x01]
+ vcvtdq2ps (%rcx), %ymm24
+
+// CHECK: vcvtdq2ps 291(%rax,%r14,8), %ymm24
+// CHECK: encoding: [0x62,0x21,0x7c,0x28,0x5b,0x84,0xf0,0x23,0x01,0x00,0x00]
+ vcvtdq2ps 291(%rax,%r14,8), %ymm24
+
+// CHECK: vcvtdq2ps (%rcx){1to8}, %ymm24
+// CHECK: encoding: [0x62,0x61,0x7c,0x38,0x5b,0x01]
+ vcvtdq2ps (%rcx){1to8}, %ymm24
+
+// CHECK: vcvtdq2ps 4064(%rdx), %ymm24
+// CHECK: encoding: [0x62,0x61,0x7c,0x28,0x5b,0x42,0x7f]
+ vcvtdq2ps 4064(%rdx), %ymm24
+
+// CHECK: vcvtdq2ps 4096(%rdx), %ymm24
+// CHECK: encoding: [0x62,0x61,0x7c,0x28,0x5b,0x82,0x00,0x10,0x00,0x00]
+ vcvtdq2ps 4096(%rdx), %ymm24
+
+// CHECK: vcvtdq2ps -4096(%rdx), %ymm24
+// CHECK: encoding: [0x62,0x61,0x7c,0x28,0x5b,0x42,0x80]
+ vcvtdq2ps -4096(%rdx), %ymm24
+
+// CHECK: vcvtdq2ps -4128(%rdx), %ymm24
+// CHECK: encoding: [0x62,0x61,0x7c,0x28,0x5b,0x82,0xe0,0xef,0xff,0xff]
+ vcvtdq2ps -4128(%rdx), %ymm24
+
+// CHECK: vcvtdq2ps 508(%rdx){1to8}, %ymm24
+// CHECK: encoding: [0x62,0x61,0x7c,0x38,0x5b,0x42,0x7f]
+ vcvtdq2ps 508(%rdx){1to8}, %ymm24
+
+// CHECK: vcvtdq2ps 512(%rdx){1to8}, %ymm24
+// CHECK: encoding: [0x62,0x61,0x7c,0x38,0x5b,0x82,0x00,0x02,0x00,0x00]
+ vcvtdq2ps 512(%rdx){1to8}, %ymm24
+
+// CHECK: vcvtdq2ps -512(%rdx){1to8}, %ymm24
+// CHECK: encoding: [0x62,0x61,0x7c,0x38,0x5b,0x42,0x80]
+ vcvtdq2ps -512(%rdx){1to8}, %ymm24
+
+// CHECK: vcvtdq2ps -516(%rdx){1to8}, %ymm24
+// CHECK: encoding: [0x62,0x61,0x7c,0x38,0x5b,0x82,0xfc,0xfd,0xff,0xff]
+ vcvtdq2ps -516(%rdx){1to8}, %ymm24
diff --git a/test/MC/X86/x86_errors.s b/test/MC/X86/x86_errors.s
index fa87ef676f3e..4a5bd977d9ff 100644
--- a/test/MC/X86/x86_errors.s
+++ b/test/MC/X86/x86_errors.s
@@ -58,3 +58,9 @@ cmpps $-129, %xmm0, %xmm0
// 32: error: invalid operand for instruction
// 64: error: invalid operand for instruction
cmppd $256, %xmm0, %xmm0
+
+// 32: error: instruction requires: 64-bit mode
+jrcxz 1
+
+// 64: error: instruction requires: Not 64-bit mode
+jcxz 1
diff --git a/test/Object/Inputs/elf-mip64-reloc.o b/test/Object/Inputs/elf-mip64-reloc.o
new file mode 100644
index 000000000000..5e689254a43e
--- /dev/null
+++ b/test/Object/Inputs/elf-mip64-reloc.o
Binary files differ
diff --git a/test/Object/Inputs/invalid-bad-section-address.coff b/test/Object/Inputs/invalid-bad-section-address.coff
new file mode 100644
index 000000000000..8d96e0137515
--- /dev/null
+++ b/test/Object/Inputs/invalid-bad-section-address.coff
Binary files differ
diff --git a/test/Object/Inputs/no-section-table.so b/test/Object/Inputs/no-section-table.so
new file mode 100644
index 000000000000..fd176ebf7ce0
--- /dev/null
+++ b/test/Object/Inputs/no-section-table.so
Binary files differ
diff --git a/test/Object/Inputs/symtab-only.a b/test/Object/Inputs/symtab-only.a
new file mode 100644
index 000000000000..611236bf3a15
--- /dev/null
+++ b/test/Object/Inputs/symtab-only.a
Binary files differ
diff --git a/test/Object/Inputs/thin-path.a b/test/Object/Inputs/thin-path.a
new file mode 100644
index 000000000000..d48fd8746898
--- /dev/null
+++ b/test/Object/Inputs/thin-path.a
Binary files differ
diff --git a/test/Object/Inputs/trivial-object-test2.macho-x86-64 b/test/Object/Inputs/trivial-object-test2.macho-x86-64
new file mode 100644
index 000000000000..75eedb6dd312
--- /dev/null
+++ b/test/Object/Inputs/trivial-object-test2.macho-x86-64
Binary files differ
diff --git a/test/Object/Mips/elf-mips64-rel.yaml b/test/Object/Mips/elf-mips64-rel.yaml
index 8b595099b9e5..7c41b78e6adc 100644
--- a/test/Object/Mips/elf-mips64-rel.yaml
+++ b/test/Object/Mips/elf-mips64-rel.yaml
@@ -58,6 +58,7 @@ Sections:
- Name: .rela.text
Type: SHT_RELA
Flags: [ SHF_INFO_LINK ]
+ AddressAlign: 0x8
Info: .text
Relocations:
- Offset: 0x14
diff --git a/test/Object/Mips/reloc-visit.test b/test/Object/Mips/reloc-visit.test
new file mode 100644
index 000000000000..d75551a387ce
--- /dev/null
+++ b/test/Object/Mips/reloc-visit.test
@@ -0,0 +1,6 @@
+RUN: llvm-dwarfdump -debug-dump=info %p/../Inputs/elf-mip64-reloc.o 2>&1 | \
+RUN: FileCheck %s
+
+CHECK: DW_AT_name [DW_FORM_strp] ( .debug_str[0x00000037] = "<stdin>")
+CHECK: DW_AT_name [DW_FORM_strp] ( .debug_str[0x00000054] = "foo")
+CHECK: DW_AT_name [DW_FORM_strp] ( .debug_str[0x00000058] = "int")
diff --git a/test/Object/X86/nm-coff.s b/test/Object/X86/nm-coff.s
new file mode 100644
index 000000000000..b8f28c7e52af
--- /dev/null
+++ b/test/Object/X86/nm-coff.s
@@ -0,0 +1,9 @@
+// RUN: llvm-mc %s -o %t -filetype=obj -triple=x86_64-pc-win32
+// RUN: llvm-nm --undefined-only %t | FileCheck %s
+// CHECK: w foo
+
+g:
+ movl foo(%rip), %eax
+ retq
+
+ .weak foo
diff --git a/test/Object/X86/nm-macho.s b/test/Object/X86/nm-macho.s
new file mode 100644
index 000000000000..7bdfa34ca366
--- /dev/null
+++ b/test/Object/X86/nm-macho.s
@@ -0,0 +1,9 @@
+// RUN: llvm-mc %s -o %t -filetype=obj -triple=x86_64-apple-darwin
+// RUN: llvm-nm -n %t | FileCheck %s
+// CHECK: 0000000000000000 t _f
+// CHECK: 0000000000000004 C _a
+
+_f:
+ retq
+
+ .comm _a,4
diff --git a/test/Object/X86/nm-print-size.s b/test/Object/X86/nm-print-size.s
index b833601969c2..a755ffa3ded6 100644
--- a/test/Object/X86/nm-print-size.s
+++ b/test/Object/X86/nm-print-size.s
@@ -3,6 +3,8 @@
// CHECK: 0000000000000000 ffffffffffffffff n a
// CHECK: 0000000000000000 0000000000000000 N b
+// CHECK: 0000000000000004 0000000000000004 C c
+// CHECK: ffffffffffffffff 0000000000000000 a d
.section foo
a:
@@ -10,3 +12,7 @@ a:
.global b
b:
+
+ .comm c,4,8
+
+d = 0xffffffffffffffff
diff --git a/test/Object/extract.ll b/test/Object/archive-extract.test
index 8d77f09fe88c..a77adf2cabbd 100644
--- a/test/Object/extract.ll
+++ b/test/Object/archive-extract.test
@@ -1,5 +1,3 @@
-; This isn't really an assembly file, its just here to run the test.
-
; This test just makes sure that llvm-ar can extract bytecode members
; from various style archives.
@@ -39,7 +37,20 @@
; RUN: rm -f very_long_bytecode_file_name.bc
; RUN: llvm-ar xo %p/Inputs/GNU.a very_long_bytecode_file_name.bc
; RUN: rm -f %t.a
-; RUN: llvm-ar rc %t.a very_long_bytecode_file_name.bc
+; RUN: llvm-ar rcU %t.a very_long_bytecode_file_name.bc
; RUN: env TZ=GMT llvm-ar tv %t.a | FileCheck %s
CHECK: 1465 2004-11-19 03:01:31.000000000 very_long_bytecode_file_name.bc
+
+
+RUN: not llvm-ar x %p/Inputs/GNU.a foo.o 2>&1 | FileCheck --check-prefix=NOTFOUND %s
+NOTFOUND: foo.o was not found
+
+RUN: not llvm-ar x %p/Inputs/thin.a foo.o 2>&1 | FileCheck %s --check-prefix=THINEXTRACT
+THINEXTRACT: extracting from a thin archive is not supported
+
+RUN: llvm-ar p %p/Inputs/thin.a evenlen | FileCheck %s --check-prefix=EVENLEN
+EVENLEN: evenlen
+
+RUN: not llvm-ar p %p/Inputs/thin-path.a t/test2.o | FileCheck %s --check-prefix=MISSING
+MISSING: No such file or directory.
diff --git a/test/Object/archive-format.test b/test/Object/archive-format.test
index f076123d2ea1..40af9a33d8d7 100644
--- a/test/Object/archive-format.test
+++ b/test/Object/archive-format.test
@@ -7,13 +7,23 @@ RUN: cd %t
RUN: echo -n bar. > 0123456789abcde
RUN: echo -n zed. > 0123456789abcdef
-RUN: rm -f test.a
-RUN: llvm-ar rc test.a 0123456789abcde 0123456789abcdef
-RUN: cat test.a | FileCheck -strict-whitespace %s
+RUN: rm -f %t.a
+RUN: llvm-ar --format=gnu rc %t.a 0123456789abcde 0123456789abcdef
+RUN: cat %t.a | FileCheck -strict-whitespace %s
CHECK: !<arch>
CHECK-NEXT: // 18 `
CHECK-NEXT: 0123456789abcdef/
-CHECK-NEXT: 0123456789abcde/{{................................}}4 `
-CHECK-NEXT: bar./0 {{................................}}4 `
+CHECK-NEXT: 0123456789abcde/0 0 0 644 4 `
+CHECK-NEXT: bar./0 0 0 0 644 4 `
CHECK-NEXT: zed.
+
+RUN: rm -f %t.a
+RUN: llvm-ar --format=bsd rc %t.a 0123456789abcde 0123456789abcdef
+RUN: cat %t.a | FileCheck -strict-whitespace --check-prefix=BSD %s
+
+BSD: !<arch>
+BSD-NEXT: #1/20 0 0 0 644 24 `
+BSD-NEXT: 0123456789abcde{{.....}}bar.
+BSD-SAME: #1/16 0 0 0 644 20 `
+BSD-NEXT: 0123456789abcdefzed.
diff --git a/test/Object/archive-symtab.test b/test/Object/archive-symtab.test
index 01f17bcc8b61..6e4c76fb3768 100644
--- a/test/Object/archive-symtab.test
+++ b/test/Object/archive-symtab.test
@@ -1,5 +1,5 @@
RUN: rm -f %t.a
-RUN: llvm-ar rcs %t.a %p/Inputs/trivial-object-test.elf-x86-64 %p/Inputs/trivial-object-test2.elf-x86-64
+RUN: llvm-ar rcsU %t.a %p/Inputs/trivial-object-test.elf-x86-64 %p/Inputs/trivial-object-test2.elf-x86-64
RUN: llvm-nm -M %t.a | FileCheck %s
CHECK: Archive map
@@ -19,7 +19,7 @@ CHECK-NEXT: 0000000000000006 T foo
CHECK-NEXT: 0000000000000016 T main
RUN: rm -f %t.a
-RUN: llvm-ar rcS %t.a %p/Inputs/trivial-object-test.elf-x86-64 %p/Inputs/trivial-object-test2.elf-x86-64
+RUN: llvm-ar rcSU %t.a %p/Inputs/trivial-object-test.elf-x86-64 %p/Inputs/trivial-object-test2.elf-x86-64
RUN: llvm-nm -M %t.a | FileCheck %s --check-prefix=NOMAP
NOMAP-NOT: Archive map
@@ -54,7 +54,7 @@ RUN: llvm-nm -M %t.a | FileCheck %s --check-prefix=CORRUPT
repeat the test with llvm-ranlib
RUN: rm -f %t.a
-RUN: llvm-ar rcS %t.a %p/Inputs/trivial-object-test.elf-x86-64 %p/Inputs/trivial-object-test2.elf-x86-64
+RUN: llvm-ar rcSU %t.a %p/Inputs/trivial-object-test.elf-x86-64 %p/Inputs/trivial-object-test2.elf-x86-64
RUN: llvm-nm -M %t.a | FileCheck %s --check-prefix=NOMAP
RUN: llvm-ranlib %t.a
@@ -66,3 +66,32 @@ RUN: llvm-nm -M %p/Inputs/macho-archive-unsorted-x86_64.a | FileCheck %s --check
BSD-MachO: Archive map
BSD-MachO: _bar in bar.o
BSD-MachO: _foo in foo.o
+
+RUN: rm -f %t.a
+RUN: llvm-ar --format=bsd rcsU %t.a %p/Inputs/trivial-object-test.macho-x86-64 %p/Inputs/trivial-object-test2.macho-x86-64
+RUN: llvm-nm -M %t.a | FileCheck --check-prefix=MACHO %s
+
+MACHO: Archive map
+MACHO-NEXT: _main in trivial-object-test.macho-x86-64
+MACHO-NEXT: _foo in trivial-object-test2.macho-x86-64
+MACHO-NEXT: _main in trivial-object-test2.macho-x86-64
+MACHO-NOT: bar
+
+MACHO: trivial-object-test.macho-x86-64
+MACHO-NEXT: 0000000000000028 s L_.str
+MACHO-NEXT: U _SomeOtherFunction
+MACHO-NEXT: 0000000000000000 T _main
+MACHO-NEXT: U _puts
+
+MACHO: trivial-object-test2.macho-x86-64
+MACHO-NEXT: 0000000000000000 t _bar
+MACHO-NEXT: 0000000000000001 T _foo
+MACHO-NEXT: 0000000000000002 T _main
+
+Test that we pad the symbol table so that it ends on a 4-byte boundary:
+8 + 60 + 36 == 104
+RUN: rm -f %t.a
+RUN: llvm-ar --format=bsd rcsU %t.a %p/Inputs/trivial-object-test.macho-x86-64
+RUN: FileCheck --check-prefix=MACHO-SYMTAB-ALIGN %s < %t.a
+MACHO-SYMTAB-ALIGN: !<arch>
+MACHO-SYMTAB-ALIGN-NEXT: #1/12 {{..........}} 0 0 0 36 `
diff --git a/test/Object/archive-toc.test b/test/Object/archive-toc.test
index 79a6e0e0ba80..cca4b868c46b 100644
--- a/test/Object/archive-toc.test
+++ b/test/Object/archive-toc.test
@@ -34,3 +34,13 @@ THIN: rw-r--r-- 1000/1000 8 2014-12-16 00:56:27.000000000 evenlen
THIN-NEXT: rw-r--r-- 1000/1000 7 2014-12-16 00:56:27.000000000 oddlen
THIN-NEXT: rwxr-xr-x 1000/1000 1465 2014-12-16 00:56:27.000000000 very_long_bytecode_file_name.bc
THIN-NEXT: rw-r--r-- 1000/1000 2280 2014-12-16 00:56:27.000000000 IsNAN.o
+
+Test reading an archive with just a symbol table. We used to reject them.
+RUN: llvm-ar tv %p/Inputs/symtab-only.a | FileCheck --allow-empty --check-prefix=EMPTY %s
+EMPTY-NOT: {{.}}
+
+Test reading a thin archive with directory names.
+RUN: env TZ=GMT llvm-ar tv %p/Inputs/thin-path.a | FileCheck %s --check-prefix=THINPATH -strict-whitespace
+
+THINPATH: rw-r--r-- 0/0 1224 1970-01-01 00:00:00.000000000 test.o
+THINPATH-NEXT: rw-r--r-- 0/0 1224 1970-01-01 00:00:00.000000000 t/test2.o
diff --git a/test/Object/archive-update.test b/test/Object/archive-update.test
index 91ca8102c7e1..147db90ba951 100644
--- a/test/Object/archive-update.test
+++ b/test/Object/archive-update.test
@@ -8,27 +8,28 @@ RUN: mkdir -p %t.older
RUN: echo older > %t.older/evenlen
Either the shell supports the 'touch' command with a flag to manually set the
-mtime or we sleep for over a second so that the mtime is definitely observable.
-RUN: touch -m -t 200001010000 %t.older/evenlen || sleep 1.1
+mtime or we sleep for over two seconds so that the mtime is definitely
+observable.
+RUN: touch -m -t 200001010000 %t.older/evenlen || sleep 2.1
RUN: mkdir -p %t.newer
RUN: echo newer > %t.newer/evenlen
RUN: touch %t.newer/evenlen
Create an archive with the newest file
-RUN: llvm-ar r %t.a %t.newer/evenlen
+RUN: llvm-ar rU %t.a %t.newer/evenlen
RUN: llvm-ar p %t.a | FileCheck --check-prefix=NEWER %s
Check that without the 'u' option the member is replaced with an older file.
-RUN: llvm-ar r %t.a %t.older/evenlen
+RUN: llvm-ar rU %t.a %t.older/evenlen
RUN: llvm-ar p %t.a | FileCheck --check-prefix=OLDER %s
Check that with the 'u' option the member is replaced with a newer file.
-RUN: llvm-ar ru %t.a %t.newer/evenlen
+RUN: llvm-ar ruU %t.a %t.newer/evenlen
RUN: llvm-ar p %t.a | FileCheck --check-prefix=NEWER %s
Check that with the 'u' option the member is not replaced with an older file.
-RUN: llvm-ar ru %t.a %t.older/evenlen
+RUN: llvm-ar ruU %t.a %t.older/evenlen
RUN: llvm-ar p %t.a | FileCheck --check-prefix=NEWER %s
NEWER: newer
diff --git a/test/Object/coff-archive.test b/test/Object/coff-archive.test
index 239a96b4c351..c8051ebe1bf3 100644
--- a/test/Object/coff-archive.test
+++ b/test/Object/coff-archive.test
@@ -66,7 +66,26 @@ CHECKIDX: 00000000 N .debug$T
CHECKIDX: 00000000 i .drectve
CHECKIDX: 00000001 a @feat.00
CHECKIDX: 00ab9d1b a @comp.id
+
CHECKIDX: Debug\mymath.obj:
+CHECKIDX: U ??2@YAPAXI@Z
+CHECKIDX: U ??3@YAXPAX@Z
+CHECKIDX: U ??_7type_info@@6B@
+CHECKIDX: w ??_Einvalid_argument@std@@UAEPAXI@Z
+CHECKIDX: w ??_Elogic_error@std@@UAEPAXI@Z
+CHECKIDX: U ??_Ginvalid_argument@std@@UAEPAXI@Z
+CHECKIDX: U ??_Glogic_error@std@@UAEPAXI@Z
+CHECKIDX: U ?what@exception@std@@UBEPBDXZ
+CHECKIDX: U @__security_check_cookie@4
+CHECKIDX: U __CxxThrowException@8
+CHECKIDX: U __RTC_CheckEsp
+CHECKIDX: U __RTC_InitBase
+CHECKIDX: U __RTC_Shutdown
+CHECKIDX: U ___CxxFrameHandler3
+CHECKIDX: U ___security_cookie
+CHECKIDX: U __fltused
+CHECKIDX: U __imp_??0exception@std@@QAE@ABQBD@Z
+CHECKIDX: U __imp_??1exception@std@@UAE@XZ
CHECKIDX: 00000000 d .data
CHECKIDX: 00000000 d .data
CHECKIDX: 00000000 d .data
@@ -205,21 +224,3 @@ CHECKIDX: 00000004 R ??_7logic_error@std@@6B@
CHECKIDX: 00000008 r __ehfuncinfo$?Divide@MyMathFuncs@MathFuncs@@SANNN@Z
CHECKIDX: 0000000e t __ehhandler$?Divide@MyMathFuncs@MathFuncs@@SANNN@Z
CHECKIDX: 00ab9d1b a @comp.id
-CHECKIDX: U ??2@YAPAXI@Z
-CHECKIDX: U ??3@YAXPAX@Z
-CHECKIDX: U ??_7type_info@@6B@
-CHECKIDX: w ??_Einvalid_argument@std@@UAEPAXI@Z
-CHECKIDX: w ??_Elogic_error@std@@UAEPAXI@Z
-CHECKIDX: U ??_Ginvalid_argument@std@@UAEPAXI@Z
-CHECKIDX: U ??_Glogic_error@std@@UAEPAXI@Z
-CHECKIDX: U ?what@exception@std@@UBEPBDXZ
-CHECKIDX: U @__security_check_cookie@4
-CHECKIDX: U __CxxThrowException@8
-CHECKIDX: U __RTC_CheckEsp
-CHECKIDX: U __RTC_InitBase
-CHECKIDX: U __RTC_Shutdown
-CHECKIDX: U ___CxxFrameHandler3
-CHECKIDX: U ___security_cookie
-CHECKIDX: U __fltused
-CHECKIDX: U __imp_??0exception@std@@QAE@ABQBD@Z
-CHECKIDX: U __imp_??1exception@std@@UAE@XZ
diff --git a/test/Object/coff-invalid.test b/test/Object/coff-invalid.test
new file mode 100644
index 000000000000..b85543dcfe83
--- /dev/null
+++ b/test/Object/coff-invalid.test
@@ -0,0 +1,13 @@
+RUN: llvm-readobj -s %p/Inputs/invalid-bad-section-address.coff 2>&1 | \
+RUN: FileCheck --check-prefix=SECTIONS %s
+
+SECTIONS: Section {
+SECTIONS-NEXT: Number: 1
+SECTIONS-NEXT: Name: .text (2E 74 65 78 74 00 00 00)
+SECTIONS-NEXT: VirtualSize: 0x0
+SECTIONS-NEXT: VirtualAddress: 0x1000000
+
+RUN: not llvm-readobj -r %p/Inputs/invalid-bad-section-address.coff 2>&1 | \
+RUN: FileCheck %s
+
+CHECK: Sections with relocations should have an address of 0
diff --git a/test/Object/no-section-table.test b/test/Object/no-section-table.test
new file mode 100644
index 000000000000..77fb98a4d406
--- /dev/null
+++ b/test/Object/no-section-table.test
@@ -0,0 +1,36 @@
+RUN: llvm-readobj %p/Inputs/no-section-table.so -hash-table -dynamic-table \
+RUN: | FileCheck %s
+
+CHECK: DynamicSection [ (24 entries)
+CHECK: Tag Type Name/Value
+CHECK: 0x0000000000000001 NEEDED SharedLibrary (libc.so.6)
+CHECK: 0x000000000000000C INIT 0x4B8
+CHECK: 0x000000000000000D FINI 0x618
+CHECK: 0x0000000000000019 INIT_ARRAY 0x2006C0
+CHECK: 0x000000000000001B INIT_ARRAYSZ 8 (bytes)
+CHECK: 0x000000000000001A FINI_ARRAY 0x2006C8
+CHECK: 0x000000000000001C FINI_ARRAYSZ 8 (bytes)
+CHECK: 0x0000000000000004 HASH 0x158
+CHECK: 0x0000000000000005 STRTAB 0x2D8
+CHECK: 0x0000000000000006 SYMTAB 0x1A0
+CHECK: 0x000000000000000A STRSZ 179 (bytes)
+CHECK: 0x000000000000000B SYMENT 24 (bytes)
+CHECK: 0x0000000000000003 PLTGOT 0x2008C0
+CHECK: 0x0000000000000002 PLTRELSZ 48 (bytes)
+CHECK: 0x0000000000000014 PLTREL RELA
+CHECK: 0x0000000000000017 JMPREL 0x488
+CHECK: 0x0000000000000007 RELA 0x3C8
+CHECK: 0x0000000000000008 RELASZ 192 (bytes)
+CHECK: 0x0000000000000009 RELAENT 24 (bytes)
+CHECK: 0x000000006FFFFFFE VERNEED 0x3A8
+CHECK: 0x000000006FFFFFFF VERNEEDNUM 1
+CHECK: 0x000000006FFFFFF0 VERSYM 0x38C
+CHECK: 0x000000006FFFFFF9 unknown 0x3
+CHECK: 0x0000000000000000 NULL 0x0
+CHECK: ]
+CHECK: HashTable {
+CHECK: Num Buckets: 3
+CHECK: Num Chains: 13
+CHECK: Buckets: [12, 10, 11]
+CHECK: Chains: [0, 0, 0, 0, 2, 3, 4, 0, 7, 5, 6, 8, 9]
+CHECK: }
diff --git a/test/Object/obj2yaml.test b/test/Object/obj2yaml.test
index 08000f66581b..8054b23eb560 100644
--- a/test/Object/obj2yaml.test
+++ b/test/Object/obj2yaml.test
@@ -234,7 +234,7 @@ ELF-MIPSEL-NEXT: - Name: .bss
ELF-MIPSEL-NEXT: Type: SHT_NOBITS
ELF-MIPSEL-NEXT: Flags: [ SHF_WRITE, SHF_ALLOC ]
ELF-MIPSEL-NEXT: AddressAlign: 0x0000000000000004
-ELF-MIPSEL-NEXT: Content: 48656C6C
+ELF-MIPSEL-NEXT: Size: 0x0000000000000004
ELF-MIPSEL-NEXT: - Name: .mdebug.abi32
ELF-MIPSEL-NEXT: Type: SHT_PROGBITS
ELF-MIPSEL-NEXT: AddressAlign: 0x0000000000000001
@@ -324,7 +324,6 @@ ELF-MIPS64EL-NEXT: - Name: .bss
ELF-MIPS64EL-NEXT: Type: SHT_NOBITS
ELF-MIPS64EL-NEXT: Flags: [ SHF_WRITE, SHF_ALLOC ]
ELF-MIPS64EL-NEXT: AddressAlign: 0x0000000000000010
-ELF-MIPS64EL-NEXT: Content: ''
ELF-MIPS64EL-NEXT: - Name: .MIPS.options
ELF-MIPS64EL-NEXT: Type: SHT_MIPS_OPTIONS
ELF-MIPS64EL-NEXT: Flags: [ SHF_ALLOC ]
diff --git a/test/Object/yaml2obj-elf-alignment.yaml b/test/Object/yaml2obj-elf-alignment.yaml
new file mode 100644
index 000000000000..8f2f985177f7
--- /dev/null
+++ b/test/Object/yaml2obj-elf-alignment.yaml
@@ -0,0 +1,53 @@
+# Check that yaml2obj takes into account the section AddressAlign field.
+
+# RUN: yaml2obj -format=elf %s > %t
+# RUN: llvm-readobj -s %t | FileCheck %s
+
+# CHECK: Section {
+# CHECK: Index: 2
+# CHECK-NEXT: Name: .data
+# CHECK-NEXT: Type: SHT_PROGBITS
+# CHECK-NEXT: Flags [
+# CHECK-NEXT: SHF_ALLOC
+# CHECK-NEXT: SHF_WRITE
+# CHECK-NEXT: ]
+# CHECK-NEXT: Address: 0x0
+# CHECK-NEXT: Offset: 0x{{[0-9A-F]*}}00
+# CHECK-NEXT: Size: 4
+# CHECK-NEXT: Link: 0
+# CHECK-NEXT: Info: 0
+# CHECK-NEXT: AddressAlignment: 256
+# CHECK-NEXT: EntrySize: 0
+# CHECK-NEXT: }
+
+---
+FileHeader:
+ Class: ELFCLASS32
+ Data: ELFDATA2LSB
+ Type: ET_REL
+ Machine: EM_MIPS
+ Flags: [ EF_MIPS_CPIC, EF_MIPS_ABI_O32, EF_MIPS_ARCH_32 ]
+
+Sections:
+ - Name: .text
+ Type: SHT_PROGBITS
+ Flags: [ SHF_ALLOC, SHF_EXECINSTR ]
+ AddressAlign: 8
+ Size: 4
+ - Name: .data
+ Type: SHT_PROGBITS
+ Flags: [ SHF_WRITE, SHF_ALLOC ]
+ AddressAlign: 256
+ Size: 4
+
+Symbols:
+ Global:
+ - Name: T0
+ Type: STT_FUNC
+ Section: .text
+ Size: 4
+ - Name: D0
+ Type: STT_OBJECT
+ Section: .data
+ Size: 4
+...
diff --git a/test/Object/yaml2obj-elf-rel-noref.yaml b/test/Object/yaml2obj-elf-rel-noref.yaml
index 69fcf0854434..4a13acd1fd38 100644
--- a/test/Object/yaml2obj-elf-rel-noref.yaml
+++ b/test/Object/yaml2obj-elf-rel-noref.yaml
@@ -32,7 +32,7 @@ Sections:
Type: SHT_NOBITS
Flags: [ SHF_WRITE, SHF_ALLOC ]
AddressAlign: 0x0000000000000001
- Content: ''
+ Size: 0
- Name: .ARM.attributes
Type: SHT_ARM_ATTRIBUTES
AddressAlign: 0x0000000000000001
diff --git a/test/Object/yaml2obj-elf-rel.yaml b/test/Object/yaml2obj-elf-rel.yaml
index 6a7ed459eff2..ba3640cfefad 100644
--- a/test/Object/yaml2obj-elf-rel.yaml
+++ b/test/Object/yaml2obj-elf-rel.yaml
@@ -75,7 +75,7 @@ Symbols:
# CHECK-NEXT: Flags [ (0x0)
# CHECK-NEXT: ]
# CHECK-NEXT: Address: 0x0
-# CHECK-NEXT: Offset: 0x160
+# CHECK-NEXT: Offset: 0x{{[0-9A-F]+}}
# CHECK-NEXT: Size: 24
# CHECK-NEXT: Link: 4
# CHECK-NEXT: Info: 1
@@ -89,7 +89,7 @@ Symbols:
# CHECK-NEXT: Flags [ (0x0)
# CHECK-NEXT: ]
# CHECK-NEXT: Address: 0x0
-# CHECK-NEXT: Offset: 0x180
+# CHECK-NEXT: Offset: 0x{{[0-9A-F]+}}
# CHECK-NEXT: Size: 36
# CHECK-NEXT: Link: 4
# CHECK-NEXT: Info: 1
diff --git a/test/Object/yaml2obj-elf-section-basic.yaml b/test/Object/yaml2obj-elf-section-basic.yaml
index 56a3fd6e5f18..69d3ae91b717 100644
--- a/test/Object/yaml2obj-elf-section-basic.yaml
+++ b/test/Object/yaml2obj-elf-section-basic.yaml
@@ -52,7 +52,7 @@ Sections:
# CHECK-NEXT: SHF_ALLOC (0x2)
# CHECK-NEXT: ]
# CHECK-NEXT: Address: 0xCAFECAFE
-# CHECK-NEXT: Offset: 0x1D0
+# CHECK-NEXT: Offset: 0x{{[0-9A-F]+}}
# CHECK-NEXT: Size: 8
# CHECK-NEXT: Link: 0
# CHECK-NEXT: Info: 0
@@ -65,12 +65,15 @@ Sections:
# CHECK: Section {
# CHECK: Name: .symtab (25)
# CHECK: Type: SHT_SYMTAB (0x2)
+# CHECK: AddressAlignment: 8
# CHECK: }
# CHECK: Section {
# CHECK: Name: .strtab (17)
# CHECK: Type: SHT_STRTAB (0x3)
+# CHECK: AddressAlignment: 1
# CHECK: }
# CHECK: Section {
# CHECK: Name: .shstrtab (7)
# CHECK: Type: SHT_STRTAB (0x3)
+# CHECK: AddressAlignment: 1
# CHECK: }
diff --git a/test/Object/yaml2obj-elf-symbol-basic.yaml b/test/Object/yaml2obj-elf-symbol-basic.yaml
index 6d49ddd1c422..b17c0429bd47 100644
--- a/test/Object/yaml2obj-elf-symbol-basic.yaml
+++ b/test/Object/yaml2obj-elf-symbol-basic.yaml
@@ -9,6 +9,7 @@ Sections:
- Name: .text
Type: SHT_PROGBITS
Flags: [ SHF_ALLOC, SHF_EXECINSTR ]
+ AddressAlign: 0x4
Content: "90EBFE" # x86 machine code
# NOP ; To make main's `Value` non-zero (for testing).
# main:
diff --git a/test/Other/extract.ll b/test/Other/extract.ll
index 8b0c835d5746..08675d8bff31 100644
--- a/test/Other/extract.ll
+++ b/test/Other/extract.ll
@@ -7,19 +7,22 @@
; llvm-extract uses lazy bitcode loading, so make sure it correctly reads
; from bitcode files in addition to assembly files.
-; CHECK: define hidden void @foo() {
+; CHECK: define hidden void @foo() comdat($x) {
; CHECK: ret void
; CHECK: }
; The private linkage for foo() should be changed to external linkage and
; hidden visibility added.
; DELETE: declare hidden void @foo()
+; DELETE-NOT: comdat
; DELETE: define void @bar() {
; DELETE: call void @foo()
; DELETE: ret void
; DELETE: }
-define private void @foo() {
+$x = comdat any
+
+define private void @foo() comdat($x) {
ret void
}
define void @bar() {
diff --git a/test/Transforms/EliminateAvailableExternally/visibility.ll b/test/Transforms/EliminateAvailableExternally/visibility.ll
new file mode 100644
index 000000000000..9966fcf30e85
--- /dev/null
+++ b/test/Transforms/EliminateAvailableExternally/visibility.ll
@@ -0,0 +1,11 @@
+; RUN: opt -elim-avail-extern -S < %s | FileCheck %s
+
+; CHECK: declare hidden void @f()
+define available_externally hidden void @f() {
+ ret void
+}
+
+define void @g() {
+ call void @f()
+ ret void
+}
diff --git a/test/Transforms/GVN/pre-new-inst.ll b/test/Transforms/GVN/pre-new-inst.ll
new file mode 100644
index 000000000000..238b8a687ccc
--- /dev/null
+++ b/test/Transforms/GVN/pre-new-inst.ll
@@ -0,0 +1,29 @@
+; RUN: opt -basicaa -gvn -S %s | FileCheck %s
+
+%MyStruct = type { i32, i32 }
+define i8 @foo(i64 %in, i8* %arr) {
+ %addr = alloca %MyStruct
+ %dead = trunc i64 %in to i32
+ br i1 undef, label %next, label %tmp
+
+tmp:
+ call void @bar()
+ br label %next
+
+next:
+ %addr64 = bitcast %MyStruct* %addr to i64*
+ store i64 %in, i64* %addr64
+ br label %final
+
+final:
+ %addr32 = getelementptr %MyStruct, %MyStruct* %addr, i32 0, i32 0
+ %idx32 = load i32, i32* %addr32
+
+; CHECK: %resptr = getelementptr i8, i8* %arr, i32 %dead
+ %resptr = getelementptr i8, i8* %arr, i32 %idx32
+ %res = load i8, i8* %resptr
+
+ ret i8 %res
+}
+
+declare void @bar()
diff --git a/test/Transforms/IndVarSimplify/lrev-existing-umin.ll b/test/Transforms/IndVarSimplify/lrev-existing-umin.ll
new file mode 100644
index 000000000000..961c9fd944d9
--- /dev/null
+++ b/test/Transforms/IndVarSimplify/lrev-existing-umin.ll
@@ -0,0 +1,36 @@
+; RUN: opt -S -indvars < %s | FileCheck %s
+
+define void @f(i32 %length.i.88, i32 %length.i, i8* %tmp12, i32 %tmp10, i8* %tmp8) {
+; CHECK-LABEL: @f(
+not_zero11.preheader:
+ %tmp13 = icmp ugt i32 %length.i, %length.i.88
+ %tmp14 = select i1 %tmp13, i32 %length.i.88, i32 %length.i
+ %tmp15 = icmp sgt i32 %tmp14, 0
+ br i1 %tmp15, label %not_zero11, label %not_zero11.postloop
+
+not_zero11:
+ %v_1 = phi i32 [ %tmp22, %not_zero11 ], [ 0, %not_zero11.preheader ]
+ %tmp16 = zext i32 %v_1 to i64
+ %tmp17 = getelementptr inbounds i8, i8* %tmp8, i64 %tmp16
+ %tmp18 = load i8, i8* %tmp17, align 1
+ %tmp19 = zext i8 %tmp18 to i32
+ %tmp20 = or i32 %tmp19, %tmp10
+ %tmp21 = trunc i32 %tmp20 to i8
+ %addr22 = getelementptr inbounds i8, i8* %tmp12, i64 %tmp16
+ store i8 %tmp21, i8* %addr22, align 1
+ %tmp22 = add nuw nsw i32 %v_1, 1
+ %tmp23 = icmp slt i32 %tmp22, %tmp14
+ br i1 %tmp23, label %not_zero11, label %main.exit.selector
+
+main.exit.selector:
+; CHECK-LABEL: main.exit.selector:
+; CHECK: %tmp24 = icmp slt i32 %tmp14, %length.i
+ %tmp24 = icmp slt i32 %tmp22, %length.i
+ br i1 %tmp24, label %not_zero11.postloop, label %leave
+
+leave:
+ ret void
+
+not_zero11.postloop:
+ ret void
+}
diff --git a/test/Transforms/Inline/frameescape.ll b/test/Transforms/Inline/frameescape.ll
index fb336024f937..6615fe9a76e4 100644
--- a/test/Transforms/Inline/frameescape.ll
+++ b/test/Transforms/Inline/frameescape.ll
@@ -1,13 +1,13 @@
; RUN: opt -inline -S < %s | FileCheck %s
-; PR23216: We can't inline functions using llvm.frameescape.
+; PR23216: We can't inline functions using llvm.localescape.
-declare void @llvm.frameescape(...)
+declare void @llvm.localescape(...)
declare i8* @llvm.frameaddress(i32)
-declare i8* @llvm.framerecover(i8*, i8*, i32)
+declare i8* @llvm.localrecover(i8*, i8*, i32)
define internal void @foo(i8* %fp) {
- %a.i8 = call i8* @llvm.framerecover(i8* bitcast (i32 ()* @bar to i8*), i8* %fp, i32 0)
+ %a.i8 = call i8* @llvm.localrecover(i8* bitcast (i32 ()* @bar to i8*), i8* %fp, i32 0)
%a = bitcast i8* %a.i8 to i32*
store i32 42, i32* %a
ret void
@@ -16,7 +16,7 @@ define internal void @foo(i8* %fp) {
define internal i32 @bar() {
entry:
%a = alloca i32
- call void (...) @llvm.frameescape(i32* %a)
+ call void (...) @llvm.localescape(i32* %a)
%fp = call i8* @llvm.frameaddress(i32 0)
tail call void @foo(i8* %fp)
%r = load i32, i32* %a
@@ -27,7 +27,7 @@ entry:
define internal i32 @bar_alwaysinline() alwaysinline {
entry:
%a = alloca i32
- call void (...) @llvm.frameescape(i32* %a)
+ call void (...) @llvm.localescape(i32* %a)
tail call void @foo(i8* null)
ret i32 0
}
diff --git a/test/Transforms/InstCombine/align-external.ll b/test/Transforms/InstCombine/align-external.ll
index ee98a0120179..15f3096105bb 100644
--- a/test/Transforms/InstCombine/align-external.ll
+++ b/test/Transforms/InstCombine/align-external.ll
@@ -3,16 +3,14 @@
; Don't assume that external global variables or those with weak linkage have
; their preferred alignment. They may only have the ABI minimum alignment.
-; CHECK: %s = shl i64 %a, 3
-; CHECK: %r = or i64 %s, ptrtoint (i32* @A to i64)
-; CHECK: %q = add i64 %r, 1
-; CHECK: ret i64 %q
-
target datalayout = "i32:8:32"
@A = external global i32
@B = weak_odr global i32 0
+@C = available_externally global <4 x i32> zeroinitializer, align 4
+; CHECK: @C = available_externally global <4 x i32> zeroinitializer, align 4
+
define i64 @foo(i64 %a) {
%t = ptrtoint i32* @A to i64
%s = shl i64 %a, 3
@@ -21,9 +19,23 @@ define i64 @foo(i64 %a) {
ret i64 %q
}
+; CHECK-LABEL: define i64 @foo(i64 %a)
+; CHECK: %s = shl i64 %a, 3
+; CHECK: %r = or i64 %s, ptrtoint (i32* @A to i64)
+; CHECK: %q = add i64 %r, 1
+; CHECK: ret i64 %q
+
define i32 @bar() {
-; CHECK-LABEL: @bar(
%r = load i32, i32* @B, align 1
-; CHECK: align 1
ret i32 %r
}
+
+; CHECK-LABEL: @bar()
+; CHECK: align 1
+
+define void @vec_store() {
+ store <4 x i32> <i32 0, i32 1, i32 2, i32 3>, <4 x i32>* @C, align 4
+ ret void
+}
+; CHECK: define void @vec_store()
+; CHECK: store <4 x i32> <i32 0, i32 1, i32 2, i32 3>, <4 x i32>* @C, align 4
diff --git a/test/Transforms/InstCombine/intrinsics.ll b/test/Transforms/InstCombine/intrinsics.ll
index 9767704c85cf..bea063787a75 100644
--- a/test/Transforms/InstCombine/intrinsics.ll
+++ b/test/Transforms/InstCombine/intrinsics.ll
@@ -17,6 +17,8 @@ declare i32 @llvm.cttz.i32(i32, i1) nounwind readnone
declare i32 @llvm.ctlz.i32(i32, i1) nounwind readnone
declare i32 @llvm.ctpop.i32(i32) nounwind readnone
declare i8 @llvm.ctlz.i8(i8, i1) nounwind readnone
+declare double @llvm.cos.f64(double %Val) nounwind readonly
+declare double @llvm.sin.f64(double %Val) nounwind readonly
define i8 @uaddtest1(i8 %A, i8 %B) {
%x = call %overflow.result @llvm.uadd.with.overflow.i8(i8 %A, i8 %B)
@@ -425,3 +427,23 @@ define %ov.result.32 @never_overflows_ssub_test0(i32 %a) {
; CHECK-NEXT: %[[x:.*]] = insertvalue %ov.result.32 { i32 undef, i1 false }, i32 %a, 0
; CHECK-NEXT: ret %ov.result.32 %[[x]]
}
+
+define void @cos(double *%P) {
+entry:
+ %B = tail call double @llvm.cos.f64(double 0.0) nounwind
+ store volatile double %B, double* %P
+
+ ret void
+; CHECK-LABEL: @cos(
+; CHECK: store volatile double 1.000000e+00, double* %P
+}
+
+define void @sin(double *%P) {
+entry:
+ %B = tail call double @llvm.sin.f64(double 0.0) nounwind
+ store volatile double %B, double* %P
+
+ ret void
+; CHECK-LABEL: @sin(
+; CHECK: store volatile double 0.000000e+00, double* %P
+}
diff --git a/test/Transforms/InstCombine/load-combine-metadata.ll b/test/Transforms/InstCombine/load-combine-metadata.ll
new file mode 100644
index 000000000000..9b9c1fe607b9
--- /dev/null
+++ b/test/Transforms/InstCombine/load-combine-metadata.ll
@@ -0,0 +1,29 @@
+; RUN: opt -instcombine -S < %s | FileCheck %s
+
+target datalayout = "e-m:e-p:64:64:64-i64:64-f80:128-n8:16:32:64-S128"
+
+; CHECK-LABEL: @test_load_load_combine_metadata(
+; Check that range and AA metadata is combined
+; CHECK: %[[V:.*]] = load i32, i32* %0
+; CHECK-SAME: !tbaa !{{[0-9]+}}
+; CHECK-SAME: !range ![[RANGE:[0-9]+]]
+; CHECK: store i32 %[[V]], i32* %1
+; CHECK: store i32 %[[V]], i32* %2
+define void @test_load_load_combine_metadata(i32*, i32*, i32*) {
+ %a = load i32, i32* %0, !tbaa !8, !range !0, !alias.scope !5, !noalias !6
+ %b = load i32, i32* %0, !tbaa !8, !range !1
+ store i32 %a, i32* %1
+ store i32 %b, i32* %2
+ ret void
+}
+
+; CHECK: ![[RANGE]] = !{i32 0, i32 1, i32 8, i32 9}
+!0 = !{ i32 0, i32 1 }
+!1 = !{ i32 8, i32 9 }
+!2 = !{!2}
+!3 = !{!3, !2}
+!4 = !{!4, !2}
+!5 = !{!3}
+!6 = !{!4}
+!7 = !{ !"tbaa root" }
+!8 = !{ !7, !7, i64 0 }
diff --git a/test/Transforms/InstCombine/load_combine_aa.ll b/test/Transforms/InstCombine/load_combine_aa.ll
new file mode 100644
index 000000000000..b84b81ddd5d9
--- /dev/null
+++ b/test/Transforms/InstCombine/load_combine_aa.ll
@@ -0,0 +1,15 @@
+; RUN: opt -basicaa -instcombine -S < %s | FileCheck %s
+
+; CHECK-LABEL: @test_load_combine_aa(
+; CHECK: %[[V:.*]] = load i32, i32* %0
+; CHECK: store i32 0, i32* %3
+; CHECK: store i32 %[[V]], i32* %1
+; CHECK: store i32 %[[V]], i32* %2
+define void @test_load_combine_aa(i32*, i32*, i32*, i32* noalias) {
+ %a = load i32, i32* %0
+ store i32 0, i32* %3
+ %b = load i32, i32* %0
+ store i32 %a, i32* %1
+ store i32 %b, i32* %2
+ ret void
+}
diff --git a/test/Transforms/InstSimplify/2011-09-05-InsertExtractValue.ll b/test/Transforms/InstSimplify/2011-09-05-InsertExtractValue.ll
index 885cb70007e6..7e391aba3045 100644
--- a/test/Transforms/InstSimplify/2011-09-05-InsertExtractValue.ll
+++ b/test/Transforms/InstSimplify/2011-09-05-InsertExtractValue.ll
@@ -27,3 +27,12 @@ define { i8, i32 } @test2({ i8*, i32 } %x) {
ret { i8, i32 } %ins
; CHECK-LABEL: @test2(
}
+
+define i32 @test3(i32 %a, float %b) {
+ %agg1 = insertvalue {i32, float} undef, i32 %a, 0
+ %agg2 = insertvalue {i32, float} %agg1, float %b, 1
+ %ev = extractvalue {i32, float} %agg2, 0
+ ret i32 %ev
+; CHECK-LABEL: @test3(
+; CHECK: ret i32 %a
+}
diff --git a/test/Transforms/InstSimplify/floating-point-compare.ll b/test/Transforms/InstSimplify/floating-point-compare.ll
index af48d062b4f6..8174f5834533 100644
--- a/test/Transforms/InstSimplify/floating-point-compare.ll
+++ b/test/Transforms/InstSimplify/floating-point-compare.ll
@@ -58,3 +58,18 @@ define i1 @orderedLessZeroPowi(double,double) {
ret i1 %olt
}
+define i1 @nonans1(double %in1, double %in2) {
+ %cmp = fcmp nnan uno double %in1, %in2
+ ret i1 %cmp
+
+; CHECK-LABEL: @nonans1
+; CHECK-NEXT: ret i1 false
+}
+
+define i1 @nonans2(double %in1, double %in2) {
+ %cmp = fcmp nnan ord double %in1, %in2
+ ret i1 %cmp
+
+; CHECK-LABEL: @nonans2
+; CHECK-NEXT: ret i1 true
+}
diff --git a/test/Transforms/InstSimplify/undef.ll b/test/Transforms/InstSimplify/undef.ll
index f1f0b037fdbd..d75dc364243c 100644
--- a/test/Transforms/InstSimplify/undef.ll
+++ b/test/Transforms/InstSimplify/undef.ll
@@ -265,3 +265,17 @@ define i32 @test34(i32 %a) {
%b = lshr i32 undef, 0
ret i32 %b
}
+
+; CHECK-LABEL: @test35
+; CHECK: ret i32 undef
+define i32 @test35(<4 x i32> %V) {
+ %b = extractelement <4 x i32> %V, i32 4
+ ret i32 %b
+}
+
+; CHECK-LABEL: @test36
+; CHECK: ret i32 undef
+define i32 @test36(i32 %V) {
+ %b = extractelement <4 x i32> undef, i32 %V
+ ret i32 %b
+}
diff --git a/test/Transforms/LICM/PR24013.ll b/test/Transforms/LICM/PR24013.ll
new file mode 100644
index 000000000000..4557bfcfd122
--- /dev/null
+++ b/test/Transforms/LICM/PR24013.ll
@@ -0,0 +1,19 @@
+; RUN: opt -licm -S < %s | FileCheck %s
+
+define void @f(i1 zeroext %p1) {
+; CHECK-LABEL: @f(
+entry:
+ br label %lbl
+
+lbl.loopexit: ; No predecessors!
+ br label %lbl
+
+lbl: ; preds = %lbl.loopexit, %entry
+ %phi = phi i32 [ %conv, %lbl.loopexit ], [ undef, %entry ]
+; CHECK: phi i32 [ undef, {{.*}} ], [ undef
+ br label %if.then.5
+
+if.then.5: ; preds = %if.then.5, %lbl
+ %conv = zext i1 undef to i32
+ br label %if.then.5
+}
diff --git a/test/Transforms/LoopDistribute/basic-with-memchecks.ll b/test/Transforms/LoopDistribute/basic-with-memchecks.ll
index fde06d33c5a5..3aced4850411 100644
--- a/test/Transforms/LoopDistribute/basic-with-memchecks.ll
+++ b/test/Transforms/LoopDistribute/basic-with-memchecks.ll
@@ -32,8 +32,9 @@ entry:
%e = load i32*, i32** @E, align 8
br label %for.body
-; We have two compares for each array overlap check which is a total of 10
-; compares.
+; We have two compares for each array overlap check.
+; Since the checks against A and A + 4 get merged, this gives us a
+; total of 8 compares.
;
; CHECK: for.body.lver.memcheck:
; CHECK: = icmp
@@ -48,9 +49,6 @@ entry:
; CHECK: = icmp
; CHECK: = icmp
-; CHECK: = icmp
-; CHECK: = icmp
-
; CHECK-NOT: = icmp
; CHECK: br i1 %memcheck.conflict, label %for.body.ph.lver.orig, label %for.body.ph.ldist1
diff --git a/test/Transforms/LoopIdiom/ctpop-multiple-users-crash.ll b/test/Transforms/LoopIdiom/ctpop-multiple-users-crash.ll
new file mode 100644
index 000000000000..ddb7bdbe7d19
--- /dev/null
+++ b/test/Transforms/LoopIdiom/ctpop-multiple-users-crash.ll
@@ -0,0 +1,34 @@
+; RUN: opt -loop-idiom -S < %s | FileCheck %s
+
+target datalayout = "e-m:o-i64:64-i128:128-n32:64-S128"
+target triple = "arm64-apple-ios8.0.0"
+
+; When we replace the precondition with a ctpop, we need to ensure
+; that only the first branch reads the ctpop. The store prior
+; to that should continue to read from the original compare.
+
+; CHECK: %tobool.5 = icmp ne i32 %num, 0
+; CHECK: store i1 %tobool.5, i1* %ptr
+
+define internal fastcc i32 @num_bits_set(i32 %num, i1* %ptr) #1 {
+entry:
+ %tobool.5 = icmp ne i32 %num, 0
+ store i1 %tobool.5, i1* %ptr
+ br i1 %tobool.5, label %for.body.lr.ph, label %for.end
+
+for.body.lr.ph: ; preds = %entry
+ br label %for.body
+
+for.body: ; preds = %for.body.lr.ph, %for.body
+ %count.07 = phi i32 [ 0, %for.body.lr.ph ], [ %inc, %for.body ]
+ %num.addr.06 = phi i32 [ %num, %for.body.lr.ph ], [ %and, %for.body ]
+ %sub = add i32 %num.addr.06, -1
+ %and = and i32 %sub, %num.addr.06
+ %inc = add nsw i32 %count.07, 1
+ %tobool = icmp ne i32 %and, 0
+ br i1 %tobool, label %for.body, label %for.end
+
+for.end: ; preds = %for.cond.for.end_crit_edge, %entry
+ %count.0.lcssa = phi i32 [ %inc, %for.body ], [ 0, %entry ]
+ ret i32 %count.0.lcssa
+} \ No newline at end of file
diff --git a/test/Transforms/LoopRotate/oz-disable.ll b/test/Transforms/LoopRotate/oz-disable.ll
new file mode 100644
index 000000000000..7a6a9bf33a18
--- /dev/null
+++ b/test/Transforms/LoopRotate/oz-disable.ll
@@ -0,0 +1,30 @@
+; REQUIRES: asserts
+; RUN: opt < %s -S -Os -debug -debug-only=loop-rotate 2>&1 | FileCheck %s -check-prefix=OS
+; RUN: opt < %s -S -Oz -debug -debug-only=loop-rotate 2>&1 | FileCheck %s -check-prefix=OZ
+
+; Loop should be rotated for -Os but not for -Oz.
+; OS: rotating Loop at depth 1
+; OZ-NOT: rotating Loop at depth 1
+
+@e = global i32 10
+
+declare void @use(i32)
+
+define void @test() {
+entry:
+ %end = load i32, i32* @e
+ br label %loop
+
+loop:
+ %n.phi = phi i32 [ %n, %loop.fin ], [ 0, %entry ]
+ %cond = icmp eq i32 %n.phi, %end
+ br i1 %cond, label %exit, label %loop.fin
+
+loop.fin:
+ %n = add i32 %n.phi, 1
+ call void @use(i32 %n)
+ br label %loop
+
+exit:
+ ret void
+}
diff --git a/test/Transforms/LoopStrengthReduce/ephemeral.ll b/test/Transforms/LoopStrengthReduce/ephemeral.ll
new file mode 100644
index 000000000000..a0d1d44b1bac
--- /dev/null
+++ b/test/Transforms/LoopStrengthReduce/ephemeral.ll
@@ -0,0 +1,41 @@
+; RUN: opt < %s -loop-reduce -S | FileCheck %s
+
+target datalayout = "e-i64:64-v16:16-v32:32-n16:32:64"
+
+; for (int i = 0; i < n; ++i) {
+; use(i * 5 + 3);
+; // i * a + b is ephemeral and shouldn't be promoted by LSR
+; __builtin_assume(i * a + b >= 0);
+; }
+define void @ephemeral(i32 %a, i32 %b, i32 %n) {
+; CHECK-LABEL: @ephemeral(
+entry:
+ br label %loop
+
+loop:
+ %i = phi i32 [ 0, %entry ], [ %inc, %loop ]
+ ; Only i and i * 5 + 3 should be indvars, not i * a + b.
+; CHECK: phi i32
+; CHECK: phi i32
+; CHECK-NOT: phi i32
+ %inc = add nsw i32 %i, 1
+ %exitcond = icmp eq i32 %inc, %n
+
+ %0 = mul nsw i32 %i, 5
+ %1 = add nsw i32 %0, 3
+ call void @use(i32 %1)
+
+ %2 = mul nsw i32 %i, %a
+ %3 = add nsw i32 %2, %b
+ %4 = icmp sgt i32 %3, -1
+ call void @llvm.assume(i1 %4)
+
+ br i1 %exitcond, label %exit, label %loop
+
+exit:
+ ret void
+}
+
+declare void @use(i32)
+
+declare void @llvm.assume(i1)
diff --git a/test/Transforms/LoopUnroll/unroll-pragmas.ll b/test/Transforms/LoopUnroll/unroll-pragmas.ll
index 1354181becd0..8e0d77513cc1 100644
--- a/test/Transforms/LoopUnroll/unroll-pragmas.ll
+++ b/test/Transforms/LoopUnroll/unroll-pragmas.ll
@@ -86,9 +86,9 @@ for.end: ; preds = %for.body
; #pragma clang loop unroll(full)
; Loop should be fully unrolled.
;
-; CHECK-LABEL: @loop64_with_enable(
+; CHECK-LABEL: @loop64_with_full(
; CHECK-NOT: br i1
-define void @loop64_with_enable(i32* nocapture %a) {
+define void @loop64_with_full(i32* nocapture %a) {
entry:
br label %for.body
@@ -139,14 +139,13 @@ for.end: ; preds = %for.body
!6 = !{!"llvm.loop.unroll.count", i32 4}
; #pragma clang loop unroll(full)
-; Full unrolling is requested, but loop has a dynamic trip count so
+; Full unrolling is requested, but loop has a runtime trip count so
; no unrolling should occur.
;
-; CHECK-LABEL: @dynamic_loop_with_enable(
+; CHECK-LABEL: @runtime_loop_with_full(
; CHECK: store i32
; CHECK-NOT: store i32
-; CHECK: br i1
-define void @dynamic_loop_with_enable(i32* nocapture %a, i32 %b) {
+define void @runtime_loop_with_full(i32* nocapture %a, i32 %b) {
entry:
%cmp3 = icmp sgt i32 %b, 0
br i1 %cmp3, label %for.body, label %for.end, !llvm.loop !8
@@ -168,22 +167,22 @@ for.end: ; preds = %for.body, %entry
!8 = !{!8, !4}
; #pragma clang loop unroll_count(4)
-; Loop has a dynamic trip count. Unrolling should occur, but no
-; conditional branches can be removed.
+; Loop has a runtime trip count. Runtime unrolling should occur and the loop
+; should be duplicated (original and 4x unrolled).
;
-; CHECK-LABEL: @dynamic_loop_with_count4(
+; CHECK-LABEL: @runtime_loop_with_count4(
+; CHECK: for.body.prol:
+; CHECK: store
; CHECK-NOT: store
; CHECK: br i1
+; CHECK: for.body
; CHECK: store
-; CHECK: br i1
; CHECK: store
-; CHECK: br i1
; CHECK: store
-; CHECK: br i1
; CHECK: store
+; CHECK-NOT: store
; CHECK: br i1
-; CHECK-NOT: br i1
-define void @dynamic_loop_with_count4(i32* nocapture %a, i32 %b) {
+define void @runtime_loop_with_count4(i32* nocapture %a, i32 %b) {
entry:
%cmp3 = icmp sgt i32 %b, 0
br i1 %cmp3, label %for.body, label %for.end, !llvm.loop !9
diff --git a/test/Transforms/LoopVectorize/X86/vectorization-remarks.ll b/test/Transforms/LoopVectorize/X86/vectorization-remarks.ll
index 1c21748d8bdd..8640950be32e 100644
--- a/test/Transforms/LoopVectorize/X86/vectorization-remarks.ll
+++ b/test/Transforms/LoopVectorize/X86/vectorization-remarks.ll
@@ -9,9 +9,9 @@
; DEBUG-OUTPUT-NOT: .loc
; DEBUG-OUTPUT-NOT: {{.*}}.debug_info
-; VECTORIZED: remark: vectorization-remarks.c:17:8: vectorized loop (vectorization factor: 4, unrolling interleave factor: 1)
-; UNROLLED: remark: vectorization-remarks.c:17:8: unrolled with interleaving factor 4 (vectorization not beneficial)
; NONE: remark: vectorization-remarks.c:17:8: loop not vectorized: vector width and interleave count are explicitly set to 1
+; VECTORIZED: remark: vectorization-remarks.c:17:8: vectorized loop (vectorization width: 4, interleaved count: 1)
+; UNROLLED: remark: vectorization-remarks.c:17:8: interleaved by 4 (vectorization not beneficial)
target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
diff --git a/test/Transforms/PlaceSafepoints/statepoint-frameescape.ll b/test/Transforms/PlaceSafepoints/statepoint-frameescape.ll
index a6ee5ee078fb..c4e250957a8f 100644
--- a/test/Transforms/PlaceSafepoints/statepoint-frameescape.ll
+++ b/test/Transforms/PlaceSafepoints/statepoint-frameescape.ll
@@ -1,17 +1,17 @@
; RUN: opt %s -S -place-safepoints | FileCheck %s
-declare void @llvm.frameescape(...)
+declare void @llvm.localescape(...)
-; Do we insert the entry safepoint after the frameescape intrinsic?
+; Do we insert the entry safepoint after the localescape intrinsic?
define void @parent() gc "statepoint-example" {
; CHECK-LABEL: @parent
entry:
; CHECK-LABEL: entry
; CHECK-NEXT: alloca
-; CHECK-NEXT: frameescape
+; CHECK-NEXT: localescape
; CHECK-NEXT: statepoint
%ptr = alloca i32
- call void (...) @llvm.frameescape(i32* %ptr)
+ call void (...) @llvm.localescape(i32* %ptr)
ret void
}
diff --git a/test/Transforms/SLPVectorizer/AMDGPU/simplebb.ll b/test/Transforms/SLPVectorizer/AMDGPU/simplebb.ll
index 9ed86f881473..35763953911b 100644
--- a/test/Transforms/SLPVectorizer/AMDGPU/simplebb.ll
+++ b/test/Transforms/SLPVectorizer/AMDGPU/simplebb.ll
@@ -1,4 +1,9 @@
; RUN: opt -S -march=r600 -mcpu=cayman -basicaa -slp-vectorizer -dce < %s | FileCheck %s
+; XFAIL: *
+;
+; FIXME: If this test expects to be vectorized, the TTI must indicate that the target
+; has vector registers of the expected width.
+; Currently, it says there are 8 vector registers that are 32 bits wide.
target datalayout = "e-p:32:32:32-p3:16:16:16-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-v16:16:16-v24:32:32-v32:32:32-v48:64:64-v64:64:64-v96:128:128-v128:128:128-v192:256:256-v256:256:256-v512:512:512-v1024:1024:1024-v2048:2048:2048-n32:64"
diff --git a/test/Transforms/SLPVectorizer/X86/cse.ll b/test/Transforms/SLPVectorizer/X86/cse.ll
index 9f56e2195991..8d25b3661dc3 100644
--- a/test/Transforms/SLPVectorizer/X86/cse.ll
+++ b/test/Transforms/SLPVectorizer/X86/cse.ll
@@ -12,11 +12,8 @@ target triple = "i386-apple-macosx10.8.0"
;CHECK-LABEL: @test(
;CHECK: load <2 x double>
-;CHECK: fadd <2 x double>
-;CHECK: store <2 x double>
-;CHECK: insertelement <2 x double>
-;CHECK: fadd <2 x double>
-;CHECK: store <2 x double>
+;CHECK: fadd <4 x double>
+;CHECK: store <4 x double>
;CHECK: ret i32
define i32 @test(double* nocapture %G) {
@@ -48,11 +45,12 @@ entry:
; A[2] = A[2] * 7.6 * n + 3.0;
; A[3] = A[3] * 7.4 * n + 4.0;
;}
-;CHECK-LABEL: @foo(
-;CHECK: insertelement <2 x double>
-;CHECK: insertelement <2 x double>
-;CHECK-NOT: insertelement <2 x double>
-;CHECK: ret
+; CHECK-LABEL: @foo(
+; CHECK: load <4 x double>
+; CHECK: fmul <4 x double>
+; CHECK: fmul <4 x double>
+; CHECK: fadd <4 x double>
+; CHECK: store <4 x double>
define i32 @foo(double* nocapture %A, i32 %n) {
entry:
%0 = load double, double* %A, align 8
@@ -93,7 +91,7 @@ entry:
; }
; We can't merge the gather sequences because one does not dominate the other.
-; CHECK: test2
+; CHECK-LABEL: @test2(
; CHECK: insertelement
; CHECK: insertelement
; CHECK: insertelement
@@ -140,11 +138,12 @@ define i32 @test2(double* nocapture %G, i32 %k) {
; A[2] = A[2] * 7.9 * n + 6.0;
; A[3] = A[3] * 7.9 * n + 6.0;
;}
-;CHECK-LABEL: @foo4(
-;CHECK: insertelement <2 x double>
-;CHECK: insertelement <2 x double>
-;CHECK-NOT: insertelement <2 x double>
-;CHECK: ret
+; CHECK-LABEL: @foo4(
+; CHECK: load <4 x double>
+; CHECK: fmul <4 x double>
+; CHECK: fmul <4 x double>
+; CHECK: fadd <4 x double>
+; CHECK: store <4 x double>
define i32 @foo4(double* nocapture %A, i32 %n) {
entry:
%0 = load double, double* %A, align 8
diff --git a/test/Transforms/SLPVectorizer/X86/gep.ll b/test/Transforms/SLPVectorizer/X86/gep.ll
index 3f952d7b242b..d10f2b6015d4 100644
--- a/test/Transforms/SLPVectorizer/X86/gep.ll
+++ b/test/Transforms/SLPVectorizer/X86/gep.ll
@@ -1,5 +1,6 @@
; RUN: opt < %s -basicaa -slp-vectorizer -S |FileCheck %s
target datalayout = "e-m:o-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-unknown-unknown"
; Test if SLP can handle GEP expressions.
; The test perform the following action:
diff --git a/test/Transforms/SLPVectorizer/X86/loopinvariant.ll b/test/Transforms/SLPVectorizer/X86/loopinvariant.ll
index 0c16c34a1888..dace4b35b871 100644
--- a/test/Transforms/SLPVectorizer/X86/loopinvariant.ll
+++ b/test/Transforms/SLPVectorizer/X86/loopinvariant.ll
@@ -1,22 +1,19 @@
-; RUN: opt < %s -basicaa -slp-vectorizer -dce -S -mtriple=x86_64-apple-macosx10.8.0 -mcpu=corei7-avx | FileCheck %s
+; RUN: opt < %s -basicaa -slp-vectorizer -S -mcpu=corei7-avx | FileCheck %s
target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
target triple = "x86_64-apple-macosx10.8.0"
;CHECK-LABEL: @foo(
-;CHECK: load <4 x i32>
-;CHECK: add nsw <4 x i32>
-;CHECK: store <4 x i32>
-;CHECK: load <4 x i32>
-;CHECK: add nsw <4 x i32>
-;CHECK: store <4 x i32>
+;CHECK: load <8 x i32>
+;CHECK: add nsw <8 x i32>
+;CHECK: store <8 x i32>
;CHECK: ret
-define i32 @foo(i32* nocapture %A, i32 %n) #0 {
+define i32 @foo(i32* nocapture %A, i32 %n) {
entry:
%cmp62 = icmp sgt i32 %n, 0
br i1 %cmp62, label %for.body, label %for.end
-for.body: ; preds = %entry, %for.body
+for.body:
%indvars.iv = phi i64 [ %indvars.iv.next, %for.body ], [ 0, %entry ]
%arrayidx = getelementptr inbounds i32, i32* %A, i64 %indvars.iv
%0 = load i32, i32* %arrayidx, align 4
@@ -62,8 +59,7 @@ for.body: ; preds = %entry, %for.body
%cmp = icmp slt i32 %15, %n
br i1 %cmp, label %for.body, label %for.end
-for.end: ; preds = %for.body, %entry
+for.end:
ret i32 undef
}
-attributes #0 = { nounwind ssp uwtable "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf"="true" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "unsafe-fp-math"="false" "use-soft-float"="false" }
diff --git a/test/Transforms/SLPVectorizer/X86/pr19657.ll b/test/Transforms/SLPVectorizer/X86/pr19657.ll
index a687aec76103..32f8da4c7ee0 100644
--- a/test/Transforms/SLPVectorizer/X86/pr19657.ll
+++ b/test/Transforms/SLPVectorizer/X86/pr19657.ll
@@ -1,73 +1,45 @@
-; RUN: opt < %s -O1 -basicaa -slp-vectorizer -dce -S -mtriple=x86_64-unknown-linux-gnu -mcpu=corei7-avx | FileCheck %s
+; RUN: opt < %s -basicaa -slp-vectorizer -S -mcpu=corei7-avx | FileCheck %s
+; RUN: opt < %s -basicaa -slp-vectorizer -slp-max-reg-size=128 -S -mcpu=corei7-avx | FileCheck %s --check-prefix=V128
target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
target triple = "x86_64-unknown-linux-gnu"
-;CHECK: load <2 x double>, <2 x double>*
-;CHECK: fadd <2 x double>
-;CHECK: store <2 x double>
+; CHECK-LABEL: @foo(
+; CHECK: load <4 x double>
+; CHECK: fadd <4 x double>
+; CHECK: fadd <4 x double>
+; CHECK: store <4 x double>
-; Function Attrs: nounwind uwtable
-define void @foo(double* %x) #0 {
- %1 = alloca double*, align 8
- store double* %x, double** %1, align 8
- %2 = load double*, double** %1, align 8
- %3 = getelementptr inbounds double, double* %2, i64 0
- %4 = load double, double* %3, align 8
- %5 = load double*, double** %1, align 8
- %6 = getelementptr inbounds double, double* %5, i64 0
- %7 = load double, double* %6, align 8
- %8 = fadd double %4, %7
- %9 = load double*, double** %1, align 8
- %10 = getelementptr inbounds double, double* %9, i64 0
- %11 = load double, double* %10, align 8
- %12 = fadd double %8, %11
- %13 = load double*, double** %1, align 8
- %14 = getelementptr inbounds double, double* %13, i64 0
- store double %12, double* %14, align 8
- %15 = load double*, double** %1, align 8
- %16 = getelementptr inbounds double, double* %15, i64 1
- %17 = load double, double* %16, align 8
- %18 = load double*, double** %1, align 8
- %19 = getelementptr inbounds double, double* %18, i64 1
- %20 = load double, double* %19, align 8
- %21 = fadd double %17, %20
- %22 = load double*, double** %1, align 8
- %23 = getelementptr inbounds double, double* %22, i64 1
- %24 = load double, double* %23, align 8
- %25 = fadd double %21, %24
- %26 = load double*, double** %1, align 8
- %27 = getelementptr inbounds double, double* %26, i64 1
- store double %25, double* %27, align 8
- %28 = load double*, double** %1, align 8
- %29 = getelementptr inbounds double, double* %28, i64 2
- %30 = load double, double* %29, align 8
- %31 = load double*, double** %1, align 8
- %32 = getelementptr inbounds double, double* %31, i64 2
- %33 = load double, double* %32, align 8
- %34 = fadd double %30, %33
- %35 = load double*, double** %1, align 8
- %36 = getelementptr inbounds double, double* %35, i64 2
- %37 = load double, double* %36, align 8
- %38 = fadd double %34, %37
- %39 = load double*, double** %1, align 8
- %40 = getelementptr inbounds double, double* %39, i64 2
- store double %38, double* %40, align 8
- %41 = load double*, double** %1, align 8
- %42 = getelementptr inbounds double, double* %41, i64 3
- %43 = load double, double* %42, align 8
- %44 = load double*, double** %1, align 8
- %45 = getelementptr inbounds double, double* %44, i64 3
- %46 = load double, double* %45, align 8
- %47 = fadd double %43, %46
- %48 = load double*, double** %1, align 8
- %49 = getelementptr inbounds double, double* %48, i64 3
- %50 = load double, double* %49, align 8
- %51 = fadd double %47, %50
- %52 = load double*, double** %1, align 8
- %53 = getelementptr inbounds double, double* %52, i64 3
- store double %51, double* %53, align 8
+; V128-LABEL: @foo(
+; V128: load <2 x double>
+; V128: fadd <2 x double>
+; V128: fadd <2 x double>
+; V128: store <2 x double>
+; V128: load <2 x double>
+; V128: fadd <2 x double>
+; V128: fadd <2 x double>
+; V128: store <2 x double>
+
+define void @foo(double* %x) {
+ %1 = load double, double* %x, align 8
+ %2 = fadd double %1, %1
+ %3 = fadd double %2, %1
+ store double %3, double* %x, align 8
+ %4 = getelementptr inbounds double, double* %x, i64 1
+ %5 = load double, double* %4, align 8
+ %6 = fadd double %5, %5
+ %7 = fadd double %6, %5
+ store double %7, double* %4, align 8
+ %8 = getelementptr inbounds double, double* %x, i64 2
+ %9 = load double, double* %8, align 8
+ %10 = fadd double %9, %9
+ %11 = fadd double %10, %9
+ store double %11, double* %8, align 8
+ %12 = getelementptr inbounds double, double* %x, i64 3
+ %13 = load double, double* %12, align 8
+ %14 = fadd double %13, %13
+ %15 = fadd double %14, %13
+ store double %15, double* %12, align 8
ret void
}
-attributes #0 = { nounwind }
diff --git a/test/Transforms/SROA/basictest.ll b/test/Transforms/SROA/basictest.ll
index a59192d718cf..7c8955b28fa2 100644
--- a/test/Transforms/SROA/basictest.ll
+++ b/test/Transforms/SROA/basictest.ll
@@ -1595,3 +1595,14 @@ entry:
store i32 %load, i32* %a.gep1
ret void
}
+
+define void @PR23737() {
+; CHECK-LABEL: @PR23737(
+; CHECK: store atomic volatile {{.*}} seq_cst
+; CHECK: load atomic volatile {{.*}} seq_cst
+entry:
+ %ptr = alloca i64, align 8
+ store atomic volatile i64 0, i64* %ptr seq_cst, align 8
+ %load = load atomic volatile i64, i64* %ptr seq_cst, align 8
+ ret void
+}
diff --git a/test/Verifier/comdat-decl1.ll b/test/Verifier/comdat-decl1.ll
new file mode 100644
index 000000000000..aee56b636270
--- /dev/null
+++ b/test/Verifier/comdat-decl1.ll
@@ -0,0 +1,5 @@
+; RUN: not llvm-as %s -o /dev/null 2>&1 | FileCheck %s
+
+$v = comdat any
+@v = available_externally global i32 0, comdat
+; CHECK: Declaration may not be in a Comdat!
diff --git a/test/Verifier/comdat-decl2.ll b/test/Verifier/comdat-decl2.ll
new file mode 100644
index 000000000000..fcd3d5d0eeea
--- /dev/null
+++ b/test/Verifier/comdat-decl2.ll
@@ -0,0 +1,5 @@
+; RUN: not llvm-as %s -o /dev/null 2>&1 | FileCheck %s
+
+$v = comdat any
+@v = external global i32, comdat
+; CHECK: Declaration may not be in a Comdat!
diff --git a/test/Verifier/frameescape.ll b/test/Verifier/frameescape.ll
index 1fb9387eb2ff..074098b990dc 100644
--- a/test/Verifier/frameescape.ll
+++ b/test/Verifier/frameescape.ll
@@ -1,69 +1,69 @@
; RUN: not llvm-as %s -o /dev/null 2>&1 | FileCheck %s
-declare void @llvm.frameescape(...)
-declare i8* @llvm.framerecover(i8*, i8*, i32)
+declare void @llvm.localescape(...)
+declare i8* @llvm.localrecover(i8*, i8*, i32)
define internal void @f() {
%a = alloca i8
- call void (...) @llvm.frameescape(i8* %a)
- call void (...) @llvm.frameescape(i8* %a)
+ call void (...) @llvm.localescape(i8* %a)
+ call void (...) @llvm.localescape(i8* %a)
ret void
}
-; CHECK: multiple calls to llvm.frameescape in one function
+; CHECK: multiple calls to llvm.localescape in one function
define internal void @g() {
entry:
%a = alloca i8
br label %not_entry
not_entry:
- call void (...) @llvm.frameescape(i8* %a)
+ call void (...) @llvm.localescape(i8* %a)
ret void
}
-; CHECK: llvm.frameescape used outside of entry block
+; CHECK: llvm.localescape used outside of entry block
define internal void @h() {
- call i8* @llvm.framerecover(i8* null, i8* null, i32 0)
+ call i8* @llvm.localrecover(i8* null, i8* null, i32 0)
ret void
}
-; CHECK: llvm.framerecover first argument must be function defined in this module
+; CHECK: llvm.localrecover first argument must be function defined in this module
@global = constant i8 0
declare void @declaration()
define internal void @i() {
- call i8* @llvm.framerecover(i8* @global, i8* null, i32 0)
+ call i8* @llvm.localrecover(i8* @global, i8* null, i32 0)
ret void
}
-; CHECK: llvm.framerecover first argument must be function defined in this module
+; CHECK: llvm.localrecover first argument must be function defined in this module
define internal void @j() {
- call i8* @llvm.framerecover(i8* bitcast(void()* @declaration to i8*), i8* null, i32 0)
+ call i8* @llvm.localrecover(i8* bitcast(void()* @declaration to i8*), i8* null, i32 0)
ret void
}
-; CHECK: llvm.framerecover first argument must be function defined in this module
+; CHECK: llvm.localrecover first argument must be function defined in this module
define internal void @k(i32 %n) {
- call i8* @llvm.framerecover(i8* bitcast(void()* @f to i8*), i8* null, i32 %n)
+ call i8* @llvm.localrecover(i8* bitcast(void()* @f to i8*), i8* null, i32 %n)
ret void
}
-; CHECK: idx argument of llvm.framerecover must be a constant int
+; CHECK: idx argument of llvm.localrecover must be a constant int
define internal void @l(i8* %b) {
%a = alloca i8
- call void (...) @llvm.frameescape(i8* %a, i8* %b)
+ call void (...) @llvm.localescape(i8* %a, i8* %b)
ret void
}
-; CHECK: llvm.frameescape only accepts static allocas
+; CHECK: llvm.localescape only accepts static allocas
define internal void @m() {
%a = alloca i8
- call void (...) @llvm.frameescape(i8* %a)
+ call void (...) @llvm.localescape(i8* %a)
ret void
}
define internal void @n(i8* %fp) {
- call i8* @llvm.framerecover(i8* bitcast(void ()* @m to i8*), i8* %fp, i32 1)
+ call i8* @llvm.localrecover(i8* bitcast(void ()* @m to i8*), i8* %fp, i32 1)
ret void
}
-; CHECK: all indices passed to llvm.framerecover must be less than the number of arguments passed ot llvm.frameescape in the parent function
+; CHECK: all indices passed to llvm.localrecover must be less than the number of arguments passed ot llvm.localescape in the parent function
diff --git a/test/tools/llvm-objdump/macho-sections.test b/test/tools/llvm-objdump/macho-sections.test
index 31efd110a5c8..07c2b52ebfc8 100644
--- a/test/tools/llvm-objdump/macho-sections.test
+++ b/test/tools/llvm-objdump/macho-sections.test
@@ -1,5 +1,3 @@
# RUN: llvm-objdump -macho -section=__data %p/Inputs/bind2.macho-x86_64 | FileCheck %s
-# RUN: llvm-objdump -macho -section=__data -raw %p/Inputs/bind2.macho-x86_64 | FileCheck --check-prefix=RAW %s
# CHECK: bind2.macho-x86_64:
-# RAW-NOT: bind2.macho-x86_64:
diff --git a/test/tools/llvm-readobj/Inputs/got-plt.exe.elf-mipsel b/test/tools/llvm-readobj/Inputs/got-plt.exe.elf-mipsel
new file mode 100755
index 000000000000..8cdc69ffa562
--- /dev/null
+++ b/test/tools/llvm-readobj/Inputs/got-plt.exe.elf-mipsel
Binary files differ
diff --git a/test/tools/llvm-readobj/codeview-linetables.test b/test/tools/llvm-readobj/codeview-linetables.test
index b2acee1200b9..d124e6e2d454 100644
--- a/test/tools/llvm-readobj/codeview-linetables.test
+++ b/test/tools/llvm-readobj/codeview-linetables.test
@@ -104,6 +104,7 @@ MFUN32-NEXT: PayloadSize: 0x8
MFUN32: ]
MFUN32-NEXT: FunctionLineTable [
MFUN32-NEXT: FunctionName: _x
+MFUN32-NEXT: Flags: 0x0
MFUN32-NEXT: CodeSize: 0xA
MFUN32-NEXT: FilenameSegment [
MFUN32-NEXT: Filename: d:\source.c
@@ -114,6 +115,7 @@ MFUN32-NEXT: ]
MFUN32-NEXT: ]
MFUN32-NEXT: FunctionLineTable [
MFUN32-NEXT: FunctionName: _y
+MFUN32-NEXT: Flags: 0x0
MFUN32-NEXT: CodeSize: 0xA
MFUN32-NEXT: FilenameSegment [
MFUN32-NEXT: Filename: d:\source.c
@@ -124,6 +126,7 @@ MFUN32-NEXT: ]
MFUN32-NEXT: ]
MFUN32-NEXT: FunctionLineTable [
MFUN32-NEXT: FunctionName: _f
+MFUN32-NEXT: Flags: 0x0
MFUN32-NEXT: CodeSize: 0x14
MFUN32-NEXT: FilenameSegment [
MFUN32-NEXT: Filename: d:\source.c
@@ -201,6 +204,7 @@ MFUN64-NEXT: PayloadSize: 0x8
MFUN64: ]
MFUN64-NEXT: FunctionLineTable [
MFUN64-NEXT: FunctionName: x
+MFUN64-NEXT: Flags: 0x0
MFUN64-NEXT: CodeSize: 0xE
MFUN64-NEXT: FilenameSegment [
MFUN64-NEXT: Filename: d:\source.c
@@ -211,6 +215,7 @@ MFUN64-NEXT: ]
MFUN64-NEXT: ]
MFUN64-NEXT: FunctionLineTable [
MFUN64-NEXT: FunctionName: y
+MFUN64-NEXT: Flags: 0x0
MFUN64-NEXT: CodeSize: 0xE
MFUN64-NEXT: FilenameSegment [
MFUN64-NEXT: Filename: d:\source.c
@@ -221,6 +226,7 @@ MFUN64-NEXT: ]
MFUN64-NEXT: ]
MFUN64-NEXT: FunctionLineTable [
MFUN64-NEXT: FunctionName: f
+MFUN64-NEXT: Flags: 0x0
MFUN64-NEXT: CodeSize: 0x18
MFUN64-NEXT: FilenameSegment [
MFUN64-NEXT: Filename: d:\source.c
@@ -296,6 +302,7 @@ MFILE32-NEXT: PayloadSize: 0x8
MFILE32: ]
MFILE32-NEXT: FunctionLineTable [
MFILE32-NEXT: FunctionName: _f
+MFILE32-NEXT: Flags: 0x0
MFILE32-NEXT: CodeSize: 0x14
MFILE32-NEXT: FilenameSegment [
MFILE32-NEXT: Filename: d:\input.c
@@ -352,6 +359,7 @@ MFILE64-NEXT: PayloadSize: 0x8
MFILE64: ]
MFILE64-NEXT: FunctionLineTable [
MFILE64-NEXT: FunctionName: f
+MFILE64-NEXT: Flags: 0x0
MFILE64-NEXT: CodeSize: 0x18
MFILE64-NEXT: FilenameSegment [
MFILE64-NEXT: Filename: d:\input.c
@@ -399,6 +407,7 @@ MCOMDAT-NEXT: CodeSize: 0x7
MCOMDAT-NEXT: }
MCOMDAT: FunctionLineTable [
MCOMDAT-NEXT: FunctionName: ?f@@YAHXZ
+MCOMDAT-NEXT: Flags: 0x0
MCOMDAT-NEXT: CodeSize: 0x7
MCOMDAT-NEXT: FilenameSegment [
MCOMDAT-NEXT: Filename: c:\src\test.cc
@@ -414,6 +423,7 @@ MCOMDAT-NEXT: CodeSize: 0x7
MCOMDAT-NEXT: }
MCOMDAT: FunctionLineTable [
MCOMDAT-NEXT: FunctionName: ?g@@YAHXZ
+MCOMDAT-NEXT: Flags: 0x0
MCOMDAT-NEXT: CodeSize: 0x7
MCOMDAT-NEXT: FilenameSegment [
MCOMDAT-NEXT: Filename: c:\src\test.cc
diff --git a/test/tools/llvm-readobj/mips-plt.test b/test/tools/llvm-readobj/mips-plt.test
new file mode 100644
index 000000000000..ab0824b0be68
--- /dev/null
+++ b/test/tools/llvm-readobj/mips-plt.test
@@ -0,0 +1,34 @@
+RUN: llvm-readobj -mips-plt-got %p/Inputs/got-plt.exe.elf-mipsel | FileCheck %s
+
+CHECK: PLT GOT {
+CHECK-NEXT: Reserved entries [
+CHECK-NEXT: Entry {
+CHECK-NEXT: Address: 0x410814
+CHECK-NEXT: Initial: 0x0
+CHECK-NEXT: Purpose: PLT lazy resolver
+CHECK-NEXT: }
+CHECK-NEXT: Entry {
+CHECK-NEXT: Address: 0x410818
+CHECK-NEXT: Initial: 0x0
+CHECK-NEXT: Purpose: Module pointer
+CHECK-NEXT: }
+CHECK-NEXT: ]
+CHECK-NEXT: Entries [
+CHECK-NEXT: Entry {
+CHECK-NEXT: Address: 0x41081C
+CHECK-NEXT: Initial: 0x4007C0
+CHECK-NEXT: Value: 0x0
+CHECK-NEXT: Type: Function (0x2)
+CHECK-NEXT: Section: Undefined (0x0)
+CHECK-NEXT: Name: puts@GLIBC_2.0 (71)
+CHECK-NEXT: }
+CHECK-NEXT: Entry {
+CHECK-NEXT: Address: 0x410820
+CHECK-NEXT: Initial: 0x4007C0
+CHECK-NEXT: Value: 0x0
+CHECK-NEXT: Type: Function (0x2)
+CHECK-NEXT: Section: Undefined (0x0)
+CHECK-NEXT: Name: __libc_start_main@GLIBC_2.0 (53)
+CHECK-NEXT: }
+CHECK-NEXT: ]
+CHECK-NEXT: }
diff --git a/tools/dsymutil/DebugMap.cpp b/tools/dsymutil/DebugMap.cpp
index cc7c0dc778b3..e5cc87b3f318 100644
--- a/tools/dsymutil/DebugMap.cpp
+++ b/tools/dsymutil/DebugMap.cpp
@@ -216,9 +216,7 @@ MappingTraits<dsymutil::DebugMapObject>::YamlDMO::denormalize(IO &IO) {
// during the test, we can't hardcode the symbols addresses, so
// look them up here and rewrite them.
for (const auto &Sym : ErrOrObjectFile->symbols()) {
- uint64_t Address;
- if (Sym.getAddress(Address))
- continue;
+ uint64_t Address = Sym.getValue();
ErrorOr<StringRef> Name = Sym.getName();
if (!Name)
continue;
diff --git a/tools/dsymutil/MachODebugMapParser.cpp b/tools/dsymutil/MachODebugMapParser.cpp
index c58545aec999..6c9fa9b51325 100644
--- a/tools/dsymutil/MachODebugMapParser.cpp
+++ b/tools/dsymutil/MachODebugMapParser.cpp
@@ -160,8 +160,6 @@ void MachODebugMapParser::handleStabSymbolTableEntry(uint32_t StringIndex,
// symbol table to find its address as it might not be in the
// debug map (for common symbols).
Value = getMainBinarySymbolAddress(Name);
- if (Value == UnknownAddress)
- return;
break;
case MachO::N_FUN:
// Functions are scopes in STABS. They have an end marker that
@@ -197,10 +195,7 @@ void MachODebugMapParser::loadCurrentObjectFileSymbols() {
CurrentObjectAddresses.clear();
for (auto Sym : CurrentObjectHolder.Get().symbols()) {
-
- uint64_t Addr;
- if (Sym.getAddress(Addr) || Addr == UnknownAddress)
- continue;
+ uint64_t Addr = Sym.getValue();
ErrorOr<StringRef> Name = Sym.getName();
if (!Name)
continue;
@@ -214,7 +209,7 @@ void MachODebugMapParser::loadCurrentObjectFileSymbols() {
uint64_t MachODebugMapParser::getMainBinarySymbolAddress(StringRef Name) {
auto Sym = MainBinarySymbolAddresses.find(Name);
if (Sym == MainBinarySymbolAddresses.end())
- return UnknownAddress;
+ return 0;
return Sym->second;
}
@@ -228,15 +223,14 @@ void MachODebugMapParser::loadMainBinarySymbols() {
// Skip undefined and STAB entries.
if ((Type & SymbolRef::ST_Debug) || (Type & SymbolRef::ST_Unknown))
continue;
- uint64_t Addr;
// The only symbols of interest are the global variables. These
// are the only ones that need to be queried because the address
// of common data won't be described in the debug map. All other
// addresses should be fetched for the debug map.
- if (Sym.getAddress(Addr) || Addr == UnknownAddress ||
- !(Sym.getFlags() & SymbolRef::SF_Global) || Sym.getSection(Section) ||
- Section->isText())
+ if (!(Sym.getFlags() & SymbolRef::SF_Global) || Sym.getSection(Section) ||
+ Section == MainBinary.section_end() || Section->isText())
continue;
+ uint64_t Addr = Sym.getValue();
ErrorOr<StringRef> NameOrErr = Sym.getName();
if (!NameOrErr)
continue;
diff --git a/tools/llc/llc.cpp b/tools/llc/llc.cpp
index 88e737160992..e33cd795d3ae 100644
--- a/tools/llc/llc.cpp
+++ b/tools/llc/llc.cpp
@@ -333,29 +333,44 @@ static int compileModule(char **argv, LLVMContext &Context) {
OS = BOS.get();
}
+ AnalysisID StartBeforeID = nullptr;
AnalysisID StartAfterID = nullptr;
AnalysisID StopAfterID = nullptr;
const PassRegistry *PR = PassRegistry::getPassRegistry();
- if (!StartAfter.empty()) {
- const PassInfo *PI = PR->getPassInfo(StartAfter);
- if (!PI) {
- errs() << argv[0] << ": start-after pass is not registered.\n";
+ if (!RunPass.empty()) {
+ if (!StartAfter.empty() || !StopAfter.empty()) {
+ errs() << argv[0] << ": start-after and/or stop-after passes are "
+ "redundant when run-pass is specified.\n";
return 1;
}
- StartAfterID = PI->getTypeInfo();
- }
- if (!StopAfter.empty()) {
- const PassInfo *PI = PR->getPassInfo(StopAfter);
+ const PassInfo *PI = PR->getPassInfo(RunPass);
if (!PI) {
- errs() << argv[0] << ": stop-after pass is not registered.\n";
+ errs() << argv[0] << ": run-pass pass is not registered.\n";
return 1;
}
- StopAfterID = PI->getTypeInfo();
+ StopAfterID = StartBeforeID = PI->getTypeInfo();
+ } else {
+ if (!StartAfter.empty()) {
+ const PassInfo *PI = PR->getPassInfo(StartAfter);
+ if (!PI) {
+ errs() << argv[0] << ": start-after pass is not registered.\n";
+ return 1;
+ }
+ StartAfterID = PI->getTypeInfo();
+ }
+ if (!StopAfter.empty()) {
+ const PassInfo *PI = PR->getPassInfo(StopAfter);
+ if (!PI) {
+ errs() << argv[0] << ": stop-after pass is not registered.\n";
+ return 1;
+ }
+ StopAfterID = PI->getTypeInfo();
+ }
}
// Ask the target to add backend passes as necessary.
- if (Target->addPassesToEmitFile(PM, *OS, FileType, NoVerify, StartAfterID,
- StopAfterID, MIR.get())) {
+ if (Target->addPassesToEmitFile(PM, *OS, FileType, NoVerify, StartBeforeID,
+ StartAfterID, StopAfterID, MIR.get())) {
errs() << argv[0] << ": target does not support generation of this"
<< " file type!\n";
return 1;
diff --git a/tools/llvm-ar/llvm-ar.cpp b/tools/llvm-ar/llvm-ar.cpp
index 0fd2df4f5aa9..2c9668c63b8c 100644
--- a/tools/llvm-ar/llvm-ar.cpp
+++ b/tools/llvm-ar/llvm-ar.cpp
@@ -13,6 +13,7 @@
//===----------------------------------------------------------------------===//
#include "llvm/ADT/StringSwitch.h"
+#include "llvm/ADT/Triple.h"
#include "llvm/IR/LLVMContext.h"
#include "llvm/IR/Module.h"
#include "llvm/LibDriver/LibDriver.h"
@@ -70,6 +71,16 @@ static cl::list<std::string>
static cl::opt<bool> MRI("M", cl::desc(""));
+namespace {
+enum Format { Default, GNU, BSD };
+}
+
+static cl::opt<Format>
+ FormatOpt("format", cl::desc("Archive format to create"),
+              cl::values(clEnumValN(Default, "default", "default"),
+ clEnumValN(GNU, "gnu", "gnu"),
+ clEnumValN(BSD, "bsd", "bsd"), clEnumValEnd));
+
std::string Options;
// Provide additional help output explaining the operations and modifiers of
@@ -118,6 +129,7 @@ static bool OriginalDates = false; ///< 'o' modifier
static bool OnlyUpdate = false; ///< 'u' modifier
static bool Verbose = false; ///< 'v' modifier
static bool Symtab = true; ///< 's' modifier
+static bool Deterministic = true; ///< 'D' and 'U' modifiers
// Relative Positional Argument (for insert/move). This variable holds
// the name of the archive member to which the 'a', 'b' or 'i' modifier
@@ -234,6 +246,12 @@ static ArchiveOperation parseCommandLine() {
AddBefore = true;
NumPositional++;
break;
+ case 'D':
+ Deterministic = true;
+ break;
+ case 'U':
+ Deterministic = false;
+ break;
default:
cl::PrintHelpMessage();
}
@@ -277,11 +295,13 @@ static ArchiveOperation parseCommandLine() {
// Implements the 'p' operation. This function traverses the archive
// looking for members that match the path list.
-static void doPrint(StringRef Name, object::Archive::child_iterator I) {
+static void doPrint(StringRef Name, const object::Archive::Child &C) {
if (Verbose)
outs() << "Printing " << Name << "\n";
- StringRef Data = I->getBuffer();
+ ErrorOr<StringRef> DataOrErr = C.getBuffer();
+ failIfError(DataOrErr.getError());
+ StringRef Data = *DataOrErr;
outs().write(Data.data(), Data.size());
}
@@ -306,16 +326,16 @@ static void printMode(unsigned mode) {
// the file names of each of the members. However, if verbose mode is requested
// ('v' modifier) then the file type, permission mode, user, group, size, and
// modification time are also printed.
-static void doDisplayTable(StringRef Name, object::Archive::child_iterator I) {
+static void doDisplayTable(StringRef Name, const object::Archive::Child &C) {
if (Verbose) {
- sys::fs::perms Mode = I->getAccessMode();
+ sys::fs::perms Mode = C.getAccessMode();
printMode((Mode >> 6) & 007);
printMode((Mode >> 3) & 007);
printMode(Mode & 007);
- outs() << ' ' << I->getUID();
- outs() << '/' << I->getGID();
- outs() << ' ' << format("%6llu", I->getSize());
- outs() << ' ' << I->getLastModified().str();
+ outs() << ' ' << C.getUID();
+ outs() << '/' << C.getGID();
+ outs() << ' ' << format("%6llu", C.getSize());
+ outs() << ' ' << C.getLastModified().str();
outs() << ' ';
}
outs() << Name << "\n";
@@ -323,9 +343,9 @@ static void doDisplayTable(StringRef Name, object::Archive::child_iterator I) {
// Implement the 'x' operation. This function extracts files back to the file
// system.
-static void doExtract(StringRef Name, object::Archive::child_iterator I) {
+static void doExtract(StringRef Name, const object::Archive::Child &C) {
// Retain the original mode.
- sys::fs::perms Mode = I->getAccessMode();
+ sys::fs::perms Mode = C.getAccessMode();
SmallString<128> Storage = Name;
int FD;
@@ -337,7 +357,7 @@ static void doExtract(StringRef Name, object::Archive::child_iterator I) {
raw_fd_ostream file(FD, false);
// Get the data and its length
- StringRef Data = I->getBuffer();
+ StringRef Data = *C.getBuffer();
// Write the data.
file.write(Data.data(), Data.size());
@@ -347,7 +367,7 @@ static void doExtract(StringRef Name, object::Archive::child_iterator I) {
// now.
if (OriginalDates)
failIfError(
- sys::fs::setLastModificationAndAccessTime(FD, I->getLastModified()));
+ sys::fs::setLastModificationAndAccessTime(FD, C.getLastModified()));
if (close(FD))
fail("Could not close the file");
@@ -373,31 +393,43 @@ static bool shouldCreateArchive(ArchiveOperation Op) {
static void performReadOperation(ArchiveOperation Operation,
object::Archive *OldArchive) {
- for (object::Archive::child_iterator I = OldArchive->child_begin(),
- E = OldArchive->child_end();
- I != E; ++I) {
- ErrorOr<StringRef> NameOrErr = I->getName();
+ if (Operation == Extract && OldArchive->isThin()) {
+ errs() << "extracting from a thin archive is not supported\n";
+ std::exit(1);
+ }
+
+ bool Filter = !Members.empty();
+ for (const object::Archive::Child &C : OldArchive->children()) {
+ ErrorOr<StringRef> NameOrErr = C.getName();
failIfError(NameOrErr.getError());
StringRef Name = NameOrErr.get();
- if (!Members.empty() &&
- std::find(Members.begin(), Members.end(), Name) == Members.end())
- continue;
+ if (Filter) {
+ auto I = std::find(Members.begin(), Members.end(), Name);
+ if (I == Members.end())
+ continue;
+ Members.erase(I);
+ }
switch (Operation) {
default:
llvm_unreachable("Not a read operation");
case Print:
- doPrint(Name, I);
+ doPrint(Name, C);
break;
case DisplayTable:
- doDisplayTable(Name, I);
+ doDisplayTable(Name, C);
break;
case Extract:
- doExtract(Name, I);
+ doExtract(Name, C);
break;
}
}
+ if (Members.empty())
+ return;
+ for (StringRef Name : Members)
+ errs() << Name << " was not found\n";
+ std::exit(1);
}
template <typename T>
@@ -525,7 +557,8 @@ computeNewArchiveMembers(ArchiveOperation Operation,
assert(unsigned(InsertPos) <= Ret.size());
Ret.insert(Ret.begin() + InsertPos, Moved.begin(), Moved.end());
- Ret.insert(Ret.begin() + InsertPos, Members.size(), NewArchiveIterator());
+ Ret.insert(Ret.begin() + InsertPos, Members.size(),
+ NewArchiveIterator("", ""));
int Pos = InsertPos;
for (auto &Member : Members) {
StringRef Name = sys::path::filename(Member);
@@ -539,15 +572,33 @@ computeNewArchiveMembers(ArchiveOperation Operation,
static void
performWriteOperation(ArchiveOperation Operation, object::Archive *OldArchive,
std::vector<NewArchiveIterator> *NewMembersP) {
+ object::Archive::Kind Kind;
+ switch (FormatOpt) {
+ case Default: {
+ Triple T(sys::getProcessTriple());
+ if (T.isOSDarwin())
+ Kind = object::Archive::K_BSD;
+ else
+ Kind = object::Archive::K_GNU;
+ break;
+ }
+ case GNU:
+ Kind = object::Archive::K_GNU;
+ break;
+ case BSD:
+ Kind = object::Archive::K_BSD;
+ break;
+ }
if (NewMembersP) {
std::pair<StringRef, std::error_code> Result =
- writeArchive(ArchiveName, *NewMembersP, Symtab);
+ writeArchive(ArchiveName, *NewMembersP, Symtab, Kind, Deterministic);
failIfError(Result.second, Result.first);
return;
}
std::vector<NewArchiveIterator> NewMembers =
computeNewArchiveMembers(Operation, OldArchive);
- auto Result = writeArchive(ArchiveName, NewMembers, Symtab);
+ auto Result =
+ writeArchive(ArchiveName, NewMembers, Symtab, Kind, Deterministic);
failIfError(Result.second, Result.first);
}
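
The performReadOperation rewrite above iterates the archive's children with a range-based for, erases each requested member from the filter list as soon as it is seen, and then reports anything still left as missing. The same bookkeeping, reduced to a self-contained example with illustrative data:

    #include <algorithm>
    #include <iostream>
    #include <string>
    #include <vector>

    int main() {
      std::vector<std::string> Children  = {"a.o", "b.o", "c.o"};
      std::vector<std::string> Requested = {"b.o", "missing.o"};

      for (const std::string &Name : Children) {
        auto I = std::find(Requested.begin(), Requested.end(), Name);
        if (I == Requested.end())
          continue;                 // not asked for: skip this member
        Requested.erase(I);         // asked for: consume the request
        std::cout << "extracting " << Name << "\n";
      }

      // Whatever is still in Requested never matched a member.
      for (const std::string &Name : Requested)
        std::cerr << Name << " was not found\n";
      return Requested.empty() ? 0 : 1;
    }
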
diff --git a/tools/llvm-cxxdump/llvm-cxxdump.cpp b/tools/llvm-cxxdump/llvm-cxxdump.cpp
index c627a662a962..4e06be9e78b9 100644
--- a/tools/llvm-cxxdump/llvm-cxxdump.cpp
+++ b/tools/llvm-cxxdump/llvm-cxxdump.cpp
@@ -207,9 +207,10 @@ static void dumpCXXData(const ObjectFile *Obj) {
StringRef SecContents;
if (error(Sec.getContents(SecContents)))
return;
- uint64_t SymAddress;
- if (error(Sym.getAddress(SymAddress)))
+ ErrorOr<uint64_t> SymAddressOrErr = Sym.getAddress();
+ if (error(SymAddressOrErr.getError()))
return;
+ uint64_t SymAddress = *SymAddressOrErr;
uint64_t SecAddress = Sec.getAddress();
uint64_t SecSize = Sec.getSize();
uint64_t SymOffset = SymAddress - SecAddress;
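
Several hunks in this patch follow the same mechanical change: SymbolRef::getAddress now hands back an ErrorOr<uint64_t> instead of filling an out-parameter, so callers check the error first and then dereference. A minimal stand-in for that calling pattern, using a simplified ErrorOr-like type rather than the real llvm::ErrorOr:

    #include <cstdint>
    #include <system_error>
    #include <variant>

    // Simplified stand-in for an ErrorOr<T>: holds either a value or an error.
    template <typename T> class ErrorOrLike {
      std::variant<T, std::error_code> Storage;
    public:
      ErrorOrLike(T Val) : Storage(std::move(Val)) {}
      ErrorOrLike(std::error_code EC) : Storage(EC) {}
      std::error_code getError() const {
        if (const auto *EC = std::get_if<std::error_code>(&Storage))
          return *EC;
        return {};                        // success: default error code
      }
      const T &operator*() const { return std::get<T>(Storage); }
    };

    // Caller-side shape used throughout the patch: bail out on error,
    // otherwise unwrap the value and keep going.
    uint64_t addressOrZero(const ErrorOrLike<uint64_t> &AddrOrErr) {
      if (AddrOrErr.getError())
        return 0;
      return *AddrOrErr;
    }
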
diff --git a/tools/llvm-jitlistener/CMakeLists.txt b/tools/llvm-jitlistener/CMakeLists.txt
index 68a4303acef0..61f8420a7ae5 100644
--- a/tools/llvm-jitlistener/CMakeLists.txt
+++ b/tools/llvm-jitlistener/CMakeLists.txt
@@ -6,7 +6,7 @@ include_directories( ${LLVM_INTEL_JITEVENTS_INCDIR} )
set(LLVM_LINK_COMPONENTS
asmparser
bitreader
- debuginfo
+ DebugInfoDWARF
inteljitevents
interpreter
irreader
@@ -14,6 +14,10 @@ set(LLVM_LINK_COMPONENTS
nativecodegen
object
selectiondag
+ Support
+ ExecutionEngine
+ RuntimeDyld
+ Core
)
add_llvm_tool(llvm-jitlistener
diff --git a/tools/llvm-nm/llvm-nm.cpp b/tools/llvm-nm/llvm-nm.cpp
index c88f37334564..e7ee3124ed73 100644
--- a/tools/llvm-nm/llvm-nm.cpp
+++ b/tools/llvm-nm/llvm-nm.cpp
@@ -180,67 +180,25 @@ struct NMSymbol {
uint64_t Size;
char TypeChar;
StringRef Name;
- DataRefImpl Symb;
+ BasicSymbolRef Sym;
};
}
static bool compareSymbolAddress(const NMSymbol &A, const NMSymbol &B) {
- if (!ReverseSort) {
- if (A.Address < B.Address)
- return true;
- if (A.Address == B.Address && A.Name < B.Name)
- return true;
- if (A.Address == B.Address && A.Name == B.Name && A.Size < B.Size)
- return true;
- return false;
- }
-
- if (A.Address > B.Address)
- return true;
- if (A.Address == B.Address && A.Name > B.Name)
- return true;
- if (A.Address == B.Address && A.Name == B.Name && A.Size > B.Size)
- return true;
- return false;
+ bool ADefined = !(A.Sym.getFlags() & SymbolRef::SF_Undefined);
+ bool BDefined = !(B.Sym.getFlags() & SymbolRef::SF_Undefined);
+ return std::make_tuple(ADefined, A.Address, A.Name, A.Size) <
+ std::make_tuple(BDefined, B.Address, B.Name, B.Size);
}
static bool compareSymbolSize(const NMSymbol &A, const NMSymbol &B) {
- if (!ReverseSort) {
- if (A.Size < B.Size)
- return true;
- if (A.Size == B.Size && A.Name < B.Name)
- return true;
- if (A.Size == B.Size && A.Name == B.Name && A.Address < B.Address)
- return true;
- return false;
- }
-
- if (A.Size > B.Size)
- return true;
- if (A.Size == B.Size && A.Name > B.Name)
- return true;
- if (A.Size == B.Size && A.Name == B.Name && A.Address > B.Address)
- return true;
- return false;
+ return std::make_tuple(A.Size, A.Name, A.Address) <
+ std::make_tuple(B.Size, B.Name, B.Address);
}
static bool compareSymbolName(const NMSymbol &A, const NMSymbol &B) {
- if (!ReverseSort) {
- if (A.Name < B.Name)
- return true;
- if (A.Name == B.Name && A.Size < B.Size)
- return true;
- if (A.Name == B.Name && A.Size == B.Size && A.Address < B.Address)
- return true;
- return false;
- }
- if (A.Name > B.Name)
- return true;
- if (A.Name == B.Name && A.Size > B.Size)
- return true;
- if (A.Name == B.Name && A.Size == B.Size && A.Address > B.Address)
- return true;
- return false;
+ return std::make_tuple(A.Name, A.Size, A.Address) <
+ std::make_tuple(B.Name, B.Size, B.Address);
}
static char isSymbolList64Bit(SymbolicFile &Obj) {
@@ -274,11 +232,12 @@ static void darwinPrintSymbol(MachOObjectFile *MachO, SymbolListT::iterator I,
uint16_t NDesc;
uint32_t NStrx;
uint64_t NValue;
+ DataRefImpl SymDRI = I->Sym.getRawDataRefImpl();
if (MachO->is64Bit()) {
H_64 = MachO->MachOObjectFile::getHeader64();
Filetype = H_64.filetype;
Flags = H_64.flags;
- STE_64 = MachO->getSymbol64TableEntry(I->Symb);
+ STE_64 = MachO->getSymbol64TableEntry(SymDRI);
NType = STE_64.n_type;
NSect = STE_64.n_sect;
NDesc = STE_64.n_desc;
@@ -288,7 +247,7 @@ static void darwinPrintSymbol(MachOObjectFile *MachO, SymbolListT::iterator I,
H = MachO->MachOObjectFile::getHeader();
Filetype = H.filetype;
Flags = H.flags;
- STE = MachO->getSymbolTableEntry(I->Symb);
+ STE = MachO->getSymbolTableEntry(SymDRI);
NType = STE.n_type;
NSect = STE.n_sect;
NDesc = STE.n_desc;
@@ -356,7 +315,7 @@ static void darwinPrintSymbol(MachOObjectFile *MachO, SymbolListT::iterator I,
break;
case MachO::N_SECT: {
section_iterator Sec = MachO->section_end();
- MachO->getSymbolSection(I->Symb, Sec);
+ MachO->getSymbolSection(I->Sym.getRawDataRefImpl(), Sec);
DataRefImpl Ref = Sec->getRawDataRefImpl();
StringRef SectionName;
MachO->getSectionName(Ref, SectionName);
@@ -415,7 +374,7 @@ static void darwinPrintSymbol(MachOObjectFile *MachO, SymbolListT::iterator I,
if ((NType & MachO::N_TYPE) == MachO::N_INDR) {
outs() << I->Name << " (for ";
StringRef IndirectName;
- if (MachO->getIndirectName(I->Symb, IndirectName))
+ if (MachO->getIndirectName(I->Sym.getRawDataRefImpl(), IndirectName))
outs() << "?)";
else
outs() << IndirectName << ")";
@@ -498,13 +457,14 @@ static void darwinPrintStab(MachOObjectFile *MachO, SymbolListT::iterator I) {
uint8_t NType;
uint8_t NSect;
uint16_t NDesc;
+ DataRefImpl SymDRI = I->Sym.getRawDataRefImpl();
if (MachO->is64Bit()) {
- STE_64 = MachO->getSymbol64TableEntry(I->Symb);
+ STE_64 = MachO->getSymbol64TableEntry(SymDRI);
NType = STE_64.n_type;
NSect = STE_64.n_sect;
NDesc = STE_64.n_desc;
} else {
- STE = MachO->getSymbolTableEntry(I->Symb);
+ STE = MachO->getSymbolTableEntry(SymDRI);
NType = STE.n_type;
NSect = STE.n_sect;
NDesc = STE.n_desc;
@@ -526,12 +486,17 @@ static void sortAndPrintSymbolList(SymbolicFile &Obj, bool printName,
std::string ArchiveName,
std::string ArchitectureName) {
if (!NoSort) {
+ std::function<bool(const NMSymbol &, const NMSymbol &)> Cmp;
if (NumericSort)
- std::sort(SymbolList.begin(), SymbolList.end(), compareSymbolAddress);
+ Cmp = compareSymbolAddress;
else if (SizeSort)
- std::sort(SymbolList.begin(), SymbolList.end(), compareSymbolSize);
+ Cmp = compareSymbolSize;
else
- std::sort(SymbolList.begin(), SymbolList.end(), compareSymbolName);
+ Cmp = compareSymbolName;
+
+ if (ReverseSort)
+ Cmp = [=](const NMSymbol &A, const NMSymbol &B) { return Cmp(B, A); };
+ std::sort(SymbolList.begin(), SymbolList.end(), Cmp);
}
if (!PrintFileName) {
@@ -557,9 +522,11 @@ static void sortAndPrintSymbolList(SymbolicFile &Obj, bool printName,
for (SymbolListT::iterator I = SymbolList.begin(), E = SymbolList.end();
I != E; ++I) {
- if ((I->TypeChar != 'U') && UndefinedOnly)
+ uint32_t SymFlags = I->Sym.getFlags();
+ bool Undefined = SymFlags & SymbolRef::SF_Undefined;
+ if (!Undefined && UndefinedOnly)
continue;
- if ((I->TypeChar == 'U') && DefinedOnly)
+ if (Undefined && DefinedOnly)
continue;
if (SizeSort && !PrintAddress)
continue;
@@ -578,12 +545,12 @@ static void sortAndPrintSymbolList(SymbolicFile &Obj, bool printName,
char SymbolAddrStr[18] = "";
char SymbolSizeStr[18] = "";
- if (OutputFormat == sysv || I->Address == UnknownAddress)
+ if (OutputFormat == sysv || I->TypeChar == 'U')
strcpy(SymbolAddrStr, printBlanks);
if (OutputFormat == sysv)
strcpy(SymbolSizeStr, printBlanks);
- if (I->Address != UnknownAddress)
+ if (I->TypeChar != 'U')
format(printFormat, I->Address)
.print(SymbolAddrStr, sizeof(SymbolAddrStr));
format(printFormat, I->Size).print(SymbolSizeStr, sizeof(SymbolSizeStr));
@@ -895,20 +862,23 @@ static void dumpSymbolNamesFromObject(SymbolicFile &Obj, bool printName,
continue;
NMSymbol S;
S.Size = 0;
- S.Address = UnknownAddress;
+ S.Address = 0;
if (PrintSize) {
if (isa<ELFObjectFileBase>(&Obj))
S.Size = ELFSymbolRef(Sym).getSize();
}
if (PrintAddress && isa<ObjectFile>(Obj)) {
- if (error(SymbolRef(Sym).getAddress(S.Address)))
+ SymbolRef SymRef(Sym);
+ ErrorOr<uint64_t> AddressOrErr = SymRef.getAddress();
+ if (error(AddressOrErr.getError()))
break;
+ S.Address = *AddressOrErr;
}
S.TypeChar = getNMTypeChar(Obj, Sym);
if (error(Sym.printName(OS)))
break;
OS << '\0';
- S.Symb = Sym.getRawDataRefImpl();
+ S.Sym = Sym;
SymbolList.push_back(S);
}
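
The comparator changes above collapse the hand-written chains of address/name/size comparisons into single std::make_tuple expressions, and handle reverse sorting by wrapping whichever comparator was selected in a lambda that swaps its arguments. The same idiom in isolation, with illustrative data:

    #include <algorithm>
    #include <cstdint>
    #include <functional>
    #include <string>
    #include <tuple>
    #include <vector>

    struct Sym {
      uint64_t Address;
      uint64_t Size;
      std::string Name;
    };

    int main() {
      std::vector<Sym> Syms = {{0x20, 4, "b"}, {0x10, 8, "a"}, {0x10, 4, "c"}};

      // Lexicographic "address, then name, then size" in one expression.
      std::function<bool(const Sym &, const Sym &)> Cmp =
          [](const Sym &A, const Sym &B) {
            return std::make_tuple(A.Address, A.Name, A.Size) <
                   std::make_tuple(B.Address, B.Name, B.Size);
          };

      bool ReverseSort = true;
      if (ReverseSort) {
        auto Forward = Cmp;         // copy before rebinding Cmp
        Cmp = [Forward](const Sym &A, const Sym &B) { return Forward(B, A); };
      }

      std::sort(Syms.begin(), Syms.end(), Cmp);
      return 0;
    }
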
diff --git a/tools/llvm-objdump/COFFDump.cpp b/tools/llvm-objdump/COFFDump.cpp
index 58bdddfa9918..8b94a50ea28b 100644
--- a/tools/llvm-objdump/COFFDump.cpp
+++ b/tools/llvm-objdump/COFFDump.cpp
@@ -161,8 +161,10 @@ static std::error_code
resolveSectionAndAddress(const COFFObjectFile *Obj, const SymbolRef &Sym,
const coff_section *&ResolvedSection,
uint64_t &ResolvedAddr) {
- if (std::error_code EC = Sym.getAddress(ResolvedAddr))
+ ErrorOr<uint64_t> ResolvedAddrOrErr = Sym.getAddress();
+ if (std::error_code EC = ResolvedAddrOrErr.getError())
return EC;
+ ResolvedAddr = *ResolvedAddrOrErr;
section_iterator iter(Obj->section_begin());
if (std::error_code EC = Sym.getSection(iter))
return EC;
diff --git a/tools/llvm-objdump/MachODump.cpp b/tools/llvm-objdump/MachODump.cpp
index 5263c33bf2dc..04c72f4856c8 100644
--- a/tools/llvm-objdump/MachODump.cpp
+++ b/tools/llvm-objdump/MachODump.cpp
@@ -102,9 +102,6 @@ cl::list<std::string>
cl::desc("Prints the specified segment,section for "
"Mach-O objects (requires -macho)"));
-cl::opt<bool> llvm::Raw("raw",
- cl::desc("Have -section dump the raw binary contents"));
-
cl::opt<bool>
llvm::InfoPlist("info-plist",
cl::desc("Print the info plist section as strings for "
@@ -178,18 +175,8 @@ static const Target *GetTarget(const MachOObjectFile *MachOObj,
struct SymbolSorter {
bool operator()(const SymbolRef &A, const SymbolRef &B) {
- SymbolRef::Type AType = A.getType();
- SymbolRef::Type BType = B.getType();
-
- uint64_t AAddr, BAddr;
- if (AType != SymbolRef::ST_Function)
- AAddr = 0;
- else
- A.getAddress(AAddr);
- if (BType != SymbolRef::ST_Function)
- BAddr = 0;
- else
- B.getAddress(BAddr);
+ uint64_t AAddr = (A.getType() != SymbolRef::ST_Function) ? 0 : A.getValue();
+ uint64_t BAddr = (B.getType() != SymbolRef::ST_Function) ? 0 : B.getValue();
return AAddr < BAddr;
}
};
@@ -592,8 +579,7 @@ static void CreateSymbolAddressMap(MachOObjectFile *O,
SymbolRef::Type ST = Symbol.getType();
if (ST == SymbolRef::ST_Function || ST == SymbolRef::ST_Data ||
ST == SymbolRef::ST_Other) {
- uint64_t Address;
- Symbol.getAddress(Address);
+ uint64_t Address = Symbol.getValue();
ErrorOr<StringRef> SymNameOrErr = Symbol.getName();
if (std::error_code EC = SymNameOrErr.getError())
report_fatal_error(EC.message());
@@ -1057,11 +1043,6 @@ static void DumpSectionContents(StringRef Filename, MachOObjectFile *O,
uint32_t sect_size = BytesStr.size();
uint64_t sect_addr = Section.getAddress();
- if (Raw) {
- outs().write(BytesStr.data(), BytesStr.size());
- continue;
- }
-
outs() << "Contents of (" << SegName << "," << SectName
<< ") section\n";
@@ -1190,8 +1171,7 @@ static void ProcessMachO(StringRef Filename, MachOObjectFile *MachOOF,
// UniversalHeaders or ArchiveHeaders.
if (Disassemble || PrivateHeaders || ExportsTrie || Rebase || Bind ||
LazyBind || WeakBind || IndirectSymbols || DataInCode || LinkOptHints ||
- DylibsUsed || DylibId || ObjcMetaData ||
- (DumpSections.size() != 0 && !Raw)) {
+ DylibsUsed || DylibId || ObjcMetaData || (DumpSections.size() != 0)) {
outs() << Filename;
if (!ArchiveMemberName.empty())
outs() << '(' << ArchiveMemberName << ')';
@@ -2424,7 +2404,7 @@ static const char *get_pointer_32(uint32_t Address, uint32_t &offset,
// symbol is passed, look up that address in the info's AddrMap.
static const char *get_symbol_64(uint32_t sect_offset, SectionRef S,
DisassembleInfo *info, uint64_t &n_value,
- uint64_t ReferenceValue = UnknownAddress) {
+ uint64_t ReferenceValue = 0) {
n_value = 0;
if (!info->verbose)
return nullptr;
@@ -2456,9 +2436,7 @@ static const char *get_symbol_64(uint32_t sect_offset, SectionRef S,
// and return its name.
const char *SymbolName = nullptr;
if (reloc_found && isExtern) {
- Symbol.getAddress(n_value);
- if (n_value == UnknownAddress)
- n_value = 0;
+ n_value = Symbol.getValue();
ErrorOr<StringRef> NameOrError = Symbol.getName();
if (std::error_code EC = NameOrError.getError())
report_fatal_error(EC.message());
@@ -2480,8 +2458,7 @@ static const char *get_symbol_64(uint32_t sect_offset, SectionRef S,
// We did not find an external relocation entry so look up the ReferenceValue
// as an address of a symbol and if found return that symbol's name.
- if (ReferenceValue != UnknownAddress)
- SymbolName = GuessSymbolName(ReferenceValue, info->AddrMap);
+ SymbolName = GuessSymbolName(ReferenceValue, info->AddrMap);
return SymbolName;
}
@@ -5640,7 +5617,7 @@ static const char *GuessLiteralPointer(uint64_t ReferenceValue,
if (info->O->getAnyRelocationPCRel(RE)) {
unsigned Type = info->O->getAnyRelocationType(RE);
if (Type == MachO::X86_64_RELOC_SIGNED) {
- Symbol.getAddress(ReferenceValue);
+ ReferenceValue = Symbol.getValue();
}
}
}
@@ -6131,8 +6108,7 @@ static void DisassembleMachO(StringRef Filename, MachOObjectFile *MachOOF,
SymbolRef::Type ST = Symbol.getType();
if (ST == SymbolRef::ST_Function || ST == SymbolRef::ST_Data ||
ST == SymbolRef::ST_Other) {
- uint64_t Address;
- Symbol.getAddress(Address);
+ uint64_t Address = Symbol.getValue();
ErrorOr<StringRef> SymNameOrErr = Symbol.getName();
if (std::error_code EC = SymNameOrErr.getError())
report_fatal_error(EC.message());
@@ -6194,9 +6170,8 @@ static void DisassembleMachO(StringRef Filename, MachOObjectFile *MachOOF,
continue;
// Start at the address of the symbol relative to the section's address.
- uint64_t Start = 0;
+ uint64_t Start = Symbols[SymIdx].getValue();
uint64_t SectionAddress = Sections[SectIdx].getAddress();
- Symbols[SymIdx].getAddress(Start);
Start -= SectionAddress;
// Stop disassembling either at the beginning of the next symbol or at
@@ -6209,7 +6184,7 @@ static void DisassembleMachO(StringRef Filename, MachOObjectFile *MachOOF,
if (NextSymType == SymbolRef::ST_Function) {
containsNextSym =
Sections[SectIdx].containsSymbol(Symbols[NextSymIdx]);
- Symbols[NextSymIdx].getAddress(NextSym);
+ NextSym = Symbols[NextSymIdx].getValue();
NextSym -= SectionAddress;
break;
}
@@ -6815,8 +6790,7 @@ void llvm::printMachOUnwindInfo(const MachOObjectFile *Obj) {
if (Section == Obj->section_end())
continue;
- uint64_t Addr;
- SymRef.getAddress(Addr);
+ uint64_t Addr = SymRef.getValue();
Symbols.insert(std::make_pair(Addr, SymRef));
}
diff --git a/tools/llvm-objdump/llvm-objdump.cpp b/tools/llvm-objdump/llvm-objdump.cpp
index 786981854609..275eb9c6a454 100644
--- a/tools/llvm-objdump/llvm-objdump.cpp
+++ b/tools/llvm-objdump/llvm-objdump.cpp
@@ -96,6 +96,10 @@ llvm::LazyBind("lazy-bind", cl::desc("Display mach-o lazy binding info"));
cl::opt<bool>
llvm::WeakBind("weak-bind", cl::desc("Display mach-o weak binding info"));
+cl::opt<bool>
+llvm::RawClangAST("raw-clang-ast",
+ cl::desc("Dump the raw binary contents of the clang AST section"));
+
static cl::opt<bool>
MachOOpt("macho", cl::desc("Use MachO specific object file parser"));
static cl::alias
@@ -212,9 +216,7 @@ static const Target *getTarget(const ObjectFile *Obj = nullptr) {
}
bool llvm::RelocAddressLess(RelocationRef a, RelocationRef b) {
- uint64_t a_addr = a.getOffset();
- uint64_t b_addr = b.getOffset();
- return a_addr < b_addr;
+ return a.getOffset() < b.getOffset();
}
namespace {
@@ -455,13 +457,12 @@ static void printRelocationTargetName(const MachOObjectFile *O,
for (const SymbolRef &Symbol : O->symbols()) {
std::error_code ec;
- uint64_t Addr;
- ErrorOr<StringRef> Name = Symbol.getName();
-
- if ((ec = Symbol.getAddress(Addr)))
+ ErrorOr<uint64_t> Addr = Symbol.getAddress();
+ if ((ec = Addr.getError()))
report_fatal_error(ec.message());
- if (Addr != Val)
+ if (*Addr != Val)
continue;
+ ErrorOr<StringRef> Name = Symbol.getName();
if (std::error_code EC = Name.getError())
report_fatal_error(EC.message());
fmt << *Name;
@@ -811,6 +812,30 @@ static void DisassembleObject(const ObjectFile *Obj, bool InlineRelocs) {
SectionRelocMap[*Sec2].push_back(Section);
}
+ // Create a mapping from virtual address to symbol name. This is used to
+ // pretty print the target of a call.
+ std::vector<std::pair<uint64_t, StringRef>> AllSymbols;
+ if (MIA) {
+ for (const SymbolRef &Symbol : Obj->symbols()) {
+ if (Symbol.getType() != SymbolRef::ST_Function)
+ continue;
+
+ ErrorOr<uint64_t> AddressOrErr = Symbol.getAddress();
+ if (error(AddressOrErr.getError()))
+ break;
+ uint64_t Address = *AddressOrErr;
+
+ ErrorOr<StringRef> Name = Symbol.getName();
+ if (error(Name.getError()))
+ break;
+ if (Name->empty())
+ continue;
+ AllSymbols.push_back(std::make_pair(Address, *Name));
+ }
+
+ array_pod_sort(AllSymbols.begin(), AllSymbols.end());
+ }
+
for (const SectionRef &Section : Obj->sections()) {
if (!Section.isText() || Section.isVirtual())
continue;
@@ -824,11 +849,10 @@ static void DisassembleObject(const ObjectFile *Obj, bool InlineRelocs) {
std::vector<std::pair<uint64_t, StringRef>> Symbols;
for (const SymbolRef &Symbol : Obj->symbols()) {
if (Section.containsSymbol(Symbol)) {
- uint64_t Address;
- if (error(Symbol.getAddress(Address)))
+ ErrorOr<uint64_t> AddressOrErr = Symbol.getAddress();
+ if (error(AddressOrErr.getError()))
break;
- if (Address == UnknownAddress)
- continue;
+ uint64_t Address = *AddressOrErr;
Address -= SectionAddr;
if (Address >= SectSize)
continue;
@@ -916,6 +940,29 @@ static void DisassembleObject(const ObjectFile *Obj, bool InlineRelocs) {
SectionAddr + Index, outs(), "", *STI);
outs() << CommentStream.str();
Comments.clear();
+ if (MIA && (MIA->isCall(Inst) || MIA->isUnconditionalBranch(Inst) ||
+ MIA->isConditionalBranch(Inst))) {
+ uint64_t Target;
+ if (MIA->evaluateBranch(Inst, SectionAddr + Index, Size, Target)) {
+ auto TargetSym = std::upper_bound(
+ AllSymbols.begin(), AllSymbols.end(), Target,
+ [](uint64_t LHS, const std::pair<uint64_t, StringRef> &RHS) {
+ return LHS < RHS.first;
+ });
+ if (TargetSym != AllSymbols.begin())
+ --TargetSym;
+ else
+ TargetSym = AllSymbols.end();
+
+ if (TargetSym != AllSymbols.end()) {
+ outs() << " <" << TargetSym->second;
+ uint64_t Disp = Target - TargetSym->first;
+ if (Disp)
+ outs() << '+' << utohexstr(Disp);
+ outs() << '>';
+ }
+ }
+ }
outs() << "\n";
} else {
errs() << ToolName << ": warning: invalid instruction encoding\n";
@@ -1113,12 +1160,13 @@ void llvm::PrintSymbolTable(const ObjectFile *o) {
return;
}
for (const SymbolRef &Symbol : o->symbols()) {
- uint64_t Address;
+ ErrorOr<uint64_t> AddressOrError = Symbol.getAddress();
+ if (error(AddressOrError.getError()))
+ continue;
+ uint64_t Address = *AddressOrError;
SymbolRef::Type Type = Symbol.getType();
uint32_t Flags = Symbol.getFlags();
section_iterator Section = o->section_end();
- if (error(Symbol.getAddress(Address)))
- continue;
if (error(Symbol.getSection(Section)))
continue;
StringRef Name;
@@ -1137,11 +1185,6 @@ void llvm::PrintSymbolTable(const ObjectFile *o) {
bool Common = Flags & SymbolRef::SF_Common;
bool Hidden = Flags & SymbolRef::SF_Hidden;
- if (Common)
- Address = Symbol.getCommonSize();
-
- if (Address == UnknownAddress)
- Address = 0;
char GlobLoc = ' ';
if (Type != SymbolRef::ST_Unknown)
GlobLoc = Global ? 'g' : 'l';
@@ -1269,6 +1312,43 @@ void llvm::printWeakBindTable(const ObjectFile *o) {
}
}
+/// Dump the raw contents of the __clangast section so the output can be piped
+/// into llvm-bcanalyzer.
+void llvm::printRawClangAST(const ObjectFile *Obj) {
+ if (outs().is_displayed()) {
+ errs() << "The -raw-clang-ast option will dump the raw binary contents of "
+ "the clang ast section.\n"
+ "Please redirect the output to a file or another program such as "
+ "llvm-bcanalyzer.\n";
+ return;
+ }
+
+ StringRef ClangASTSectionName("__clangast");
+ if (isa<COFFObjectFile>(Obj)) {
+ ClangASTSectionName = "clangast";
+ }
+
+ Optional<object::SectionRef> ClangASTSection;
+ for (auto Sec : Obj->sections()) {
+ StringRef Name;
+ Sec.getName(Name);
+ if (Name == ClangASTSectionName) {
+ ClangASTSection = Sec;
+ break;
+ }
+ }
+ if (!ClangASTSection)
+ return;
+
+ StringRef ClangASTContents;
+ if (error(ClangASTSection.getValue().getContents(ClangASTContents))) {
+ errs() << "Could not read the " << ClangASTSectionName << " section!\n";
+ return;
+ }
+
+ outs().write(ClangASTContents.data(), ClangASTContents.size());
+}
+
static void printFaultMaps(const ObjectFile *Obj) {
const char *FaultMapSectionName = nullptr;
@@ -1323,9 +1403,12 @@ static void printPrivateFileHeader(const ObjectFile *o) {
}
static void DumpObject(const ObjectFile *o) {
- outs() << '\n';
- outs() << o->getFileName()
- << ":\tfile format " << o->getFileFormatName() << "\n\n";
+ // Avoid other output when using a raw option.
+ if (!RawClangAST) {
+ outs() << '\n';
+ outs() << o->getFileName()
+ << ":\tfile format " << o->getFileFormatName() << "\n\n";
+ }
if (Disassemble)
DisassembleObject(o, Relocations);
@@ -1351,6 +1434,8 @@ static void DumpObject(const ObjectFile *o) {
printLazyBindTable(o);
if (WeakBind)
printWeakBindTable(o);
+ if (RawClangAST)
+ printRawClangAST(o);
if (PrintFaultMaps)
printFaultMaps(o);
}
@@ -1441,6 +1526,7 @@ int main(int argc, char **argv) {
&& !Bind
&& !LazyBind
&& !WeakBind
+ && !RawClangAST
&& !(UniversalHeaders && MachOOpt)
&& !(ArchiveHeaders && MachOOpt)
&& !(IndirectSymbols && MachOOpt)
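Editorial note, not part of the patch: the branch-target annotation added to DisassembleObject above keeps AllSymbols sorted by address and finds the nearest symbol at or below the branch target with std::upper_bound. A self-contained sketch of that lookup idiom (names here are illustrative, not the tool's API):

// Sketch only: nearest-preceding-symbol lookup over a vector sorted by address.
#include <algorithm>
#include <cstdint>
#include <string>
#include <utility>
#include <vector>

static const std::pair<uint64_t, std::string> *
findContainingSymbol(const std::vector<std::pair<uint64_t, std::string>> &Syms,
                     uint64_t Target) {
  // First element whose address is strictly greater than Target...
  auto It = std::upper_bound(
      Syms.begin(), Syms.end(), Target,
      [](uint64_t LHS, const std::pair<uint64_t, std::string> &RHS) {
        return LHS < RHS.first;
      });
  if (It == Syms.begin())
    return nullptr;   // every symbol starts after Target
  return &*--It;      // ...so the previous one starts at or before Target
}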
diff --git a/tools/llvm-objdump/llvm-objdump.h b/tools/llvm-objdump/llvm-objdump.h
index b4d34f4033bc..eb10d8344f71 100644
--- a/tools/llvm-objdump/llvm-objdump.h
+++ b/tools/llvm-objdump/llvm-objdump.h
@@ -26,7 +26,6 @@ extern cl::opt<std::string> ArchName;
extern cl::opt<std::string> MCPU;
extern cl::list<std::string> MAttrs;
extern cl::list<std::string> DumpSections;
-extern cl::opt<bool> Raw;
extern cl::opt<bool> Disassemble;
extern cl::opt<bool> NoShowRawInsn;
extern cl::opt<bool> PrivateHeaders;
@@ -35,6 +34,7 @@ extern cl::opt<bool> Rebase;
extern cl::opt<bool> Bind;
extern cl::opt<bool> LazyBind;
extern cl::opt<bool> WeakBind;
+extern cl::opt<bool> RawClangAST;
extern cl::opt<bool> UniversalHeaders;
extern cl::opt<bool> ArchiveHeaders;
extern cl::opt<bool> IndirectSymbols;
@@ -72,6 +72,7 @@ void printRebaseTable(const object::ObjectFile *o);
void printBindTable(const object::ObjectFile *o);
void printLazyBindTable(const object::ObjectFile *o);
void printWeakBindTable(const object::ObjectFile *o);
+void printRawClangAST(const object::ObjectFile *o);
void PrintRelocations(const object::ObjectFile *o);
void PrintSectionHeaders(const object::ObjectFile *o);
void PrintSectionContents(const object::ObjectFile *o);
diff --git a/tools/llvm-readobj/ARMWinEHPrinter.cpp b/tools/llvm-readobj/ARMWinEHPrinter.cpp
index a1ea79f3688e..bf5ff8e1d031 100644
--- a/tools/llvm-readobj/ARMWinEHPrinter.cpp
+++ b/tools/llvm-readobj/ARMWinEHPrinter.cpp
@@ -201,10 +201,10 @@ ErrorOr<object::SymbolRef> Decoder::getSymbol(const COFFObjectFile &COFF,
if (FunctionOnly && Symbol.getType() != SymbolRef::ST_Function)
continue;
- uint64_t Address;
- if (std::error_code EC = Symbol.getAddress(Address))
+ ErrorOr<uint64_t> Address = Symbol.getAddress();
+ if (std::error_code EC = Address.getError())
return EC;
- if (Address == VA)
+ if (*Address == VA)
return Symbol;
}
return readobj_error::unknown_symbol;
@@ -605,7 +605,10 @@ bool Decoder::dumpUnpackedEntry(const COFFObjectFile &COFF,
if (std::error_code EC = FunctionNameOrErr.getError())
report_fatal_error(EC.message());
FunctionName = *FunctionNameOrErr;
- Function->getAddress(FunctionAddress);
+ ErrorOr<uint64_t> FunctionAddressOrErr = Function->getAddress();
+ if (std::error_code EC = FunctionAddressOrErr.getError())
+ report_fatal_error(EC.message());
+ FunctionAddress = *FunctionAddressOrErr;
} else {
const pe32_header *PEHeader;
if (COFF.getPE32Header(PEHeader))
@@ -620,8 +623,10 @@ bool Decoder::dumpUnpackedEntry(const COFFObjectFile &COFF,
if (std::error_code EC = Name.getError())
report_fatal_error(EC.message());
- uint64_t Address;
- XDataRecord->getAddress(Address);
+ ErrorOr<uint64_t> AddressOrErr = XDataRecord->getAddress();
+ if (std::error_code EC = AddressOrErr.getError())
+ report_fatal_error(EC.message());
+ uint64_t Address = *AddressOrErr;
SW.printString("ExceptionRecord", formatSymbol(*Name, Address));
@@ -666,7 +671,8 @@ bool Decoder::dumpPackedEntry(const object::COFFObjectFile &COFF,
if (std::error_code EC = FunctionNameOrErr.getError())
report_fatal_error(EC.message());
FunctionName = *FunctionNameOrErr;
- Function->getAddress(FunctionAddress);
+ ErrorOr<uint64_t> FunctionAddressOrErr = Function->getAddress();
+ FunctionAddress = *FunctionAddressOrErr;
} else {
const pe32_header *PEHeader;
if (COFF.getPE32Header(PEHeader))
diff --git a/tools/llvm-readobj/COFFDumper.cpp b/tools/llvm-readobj/COFFDumper.cpp
index f5effe292441..cf897d7cb484 100644
--- a/tools/llvm-readobj/COFFDumper.cpp
+++ b/tools/llvm-readobj/COFFDumper.cpp
@@ -48,7 +48,6 @@ public:
COFFDumper(const llvm::object::COFFObjectFile *Obj, StreamWriter& Writer)
: ObjDumper(Writer)
, Obj(Obj) {
- cacheRelocations();
}
void printFileHeaders() override;
@@ -92,6 +91,7 @@ private:
typedef DenseMap<const coff_section*, std::vector<RelocationRef> > RelocMapTy;
const llvm::object::COFFObjectFile *Obj;
+ bool RelocCached = false;
RelocMapTy RelocMap;
StringRef CVFileIndexToStringOffsetTable;
StringRef CVStringTable;
@@ -119,6 +119,7 @@ std::error_code createCOFFDumper(const object::ObjectFile *Obj,
// symbol used for the relocation at the offset.
std::error_code COFFDumper::resolveSymbol(const coff_section *Section,
uint64_t Offset, SymbolRef &Sym) {
+ cacheRelocations();
const auto &Relocations = RelocMap[Section];
for (const auto &Relocation : Relocations) {
uint64_t RelocationOffset = Relocation.getOffset();
@@ -339,6 +340,10 @@ static std::error_code getSymbolAuxData(const COFFObjectFile *Obj,
}
void COFFDumper::cacheRelocations() {
+ if (RelocCached)
+ return;
+ RelocCached = true;
+
for (const SectionRef &S : Obj->sections()) {
const coff_section *Section = Obj->getCOFFSection(S);
@@ -580,7 +585,11 @@ void COFFDumper::printCodeViewDebugInfo(const SectionRef &Section) {
W.printString("FunctionName", Name);
DataExtractor DE(FunctionLineTables[Name], true, 4);
- uint32_t Offset = 8; // Skip relocations.
+ uint32_t Offset = 6; // Skip relocations.
+ uint16_t Flags = DE.getU16(&Offset);
+ W.printHex("Flags", Flags);
+ bool HasColumnInformation =
+ Flags & COFF::DEBUG_LINE_TABLES_HAVE_COLUMN_RECORDS;
uint32_t FunctionSize = DE.getU32(&Offset);
W.printHex("CodeSize", FunctionSize);
while (DE.isValidOffset(Offset)) {
@@ -588,9 +597,12 @@ void COFFDumper::printCodeViewDebugInfo(const SectionRef &Section) {
// in the line table. The filename string is accessed using double
// indirection to the string table subsection using the index subsection.
uint32_t OffsetInIndex = DE.getU32(&Offset),
- SegmentLength = DE.getU32(&Offset),
+ SegmentLength = DE.getU32(&Offset),
FullSegmentSize = DE.getU32(&Offset);
- if (FullSegmentSize != 12 + 8 * SegmentLength) {
+
+ if (FullSegmentSize !=
+ 12 + 8 * SegmentLength +
+ (HasColumnInformation ? 4 * SegmentLength : 0)) {
error(object_error::parse_failed);
return;
}
@@ -631,6 +643,15 @@ void COFFDumper::printCodeViewDebugInfo(const SectionRef &Section) {
format("+0x%X", PC).snprint(Buffer, 32);
W.printNumber(Buffer, LineNumber);
}
+ if (HasColumnInformation) {
+ for (unsigned J = 0; J != SegmentLength && DE.isValidOffset(Offset);
+ ++J) {
+ uint16_t ColStart = DE.getU16(&Offset);
+ W.printNumber("ColStart", ColStart);
+ uint16_t ColEnd = DE.getU16(&Offset);
+ W.printNumber("ColEnd", ColEnd);
+ }
+ }
}
}
}
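Editorial note, not part of the patch: the size check added above follows from the record layout the dumper walks — a 12-byte segment header (OffsetInIndex, SegmentLength, FullSegmentSize), one 8-byte (PC offset, line number) pair per entry, and, when the flags request column records, one extra 4-byte (ColStart, ColEnd) pair per entry. The arithmetic, as a tiny sketch:

// Sketch only: expected size of one CodeView line-table segment.
#include <cstdint>

static uint32_t expectedSegmentSize(uint32_t SegmentLength,
                                    bool HasColumnInformation) {
  uint32_t Size = 12;        // OffsetInIndex + SegmentLength + FullSegmentSize
  Size += 8 * SegmentLength; // (PC offset, LineNumber) per entry
  if (HasColumnInformation)
    Size += 4 * SegmentLength; // (ColStart, ColEnd) per entry
  return Size;               // must equal FullSegmentSize, or the table is malformed
}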
diff --git a/tools/llvm-readobj/ELFDumper.cpp b/tools/llvm-readobj/ELFDumper.cpp
index a4b25efeb9ba..1cdf5529c080 100644
--- a/tools/llvm-readobj/ELFDumper.cpp
+++ b/tools/llvm-readobj/ELFDumper.cpp
@@ -56,6 +56,7 @@ public:
void printDynamicTable() override;
void printNeededLibraries() override;
void printProgramHeaders() override;
+ void printHashTable() override;
void printAttributes() override;
void printMipsPLTGOT() override;
@@ -1119,6 +1120,18 @@ void ELFDumper<ELFT>::printProgramHeaders() {
}
}
+template <typename ELFT>
+void ELFDumper<ELFT>::printHashTable() {
+ DictScope D(W, "HashTable");
+ auto HT = Obj->getHashTable();
+ if (!HT)
+ return;
+ W.printNumber("Num Buckets", HT->nbucket);
+ W.printNumber("Num Chains", HT->nchain);
+ W.printList("Buckets", HT->buckets());
+ W.printList("Chains", HT->chains());
+}
+
template <class ELFT>
void ELFDumper<ELFT>::printAttributes() {
W.startLine() << "Attributes not implemented.\n";
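Editorial note, not part of the patch: printHashTable() above only dumps the raw nbucket/nchain/buckets/chains arrays of the SysV ELF hash section. For context, a minimal illustration of the classic lookup those arrays support (placeholder names, not the dumper's API; a real implementation would also bound-check the chain index against nchain):

// Sketch only: SysV ELF hash lookup over bucket/chain arrays.
#include <cstdint>
#include <cstring>

static uint32_t elfHash(const char *Name) { // standard SysV ELF hash
  uint32_t H = 0;
  while (*Name) {
    H = (H << 4) + (uint8_t)*Name++;
    uint32_t G = H & 0xf0000000;
    H ^= G >> 24;
    H &= ~G;
  }
  return H;
}

// Buckets has NBucket entries; Chains has one entry per dynamic symbol.
// Returns the matching dynamic-symbol index, or 0 (STN_UNDEF) if not found.
static uint32_t lookup(const char *Name, const uint32_t *Buckets,
                       uint32_t NBucket, const uint32_t *Chains,
                       const char *const *SymNames) {
  for (uint32_t I = Buckets[elfHash(Name) % NBucket]; I != 0; I = Chains[I])
    if (std::strcmp(SymNames[I], Name) == 0)
      return I;
  return 0;
}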
@@ -1162,9 +1175,10 @@ public:
typedef typename ObjectFile::Elf_Shdr Elf_Shdr;
typedef typename ObjectFile::Elf_Sym Elf_Sym;
- MipsGOTParser(const ObjectFile *Obj, StreamWriter &W) : Obj(Obj), W(W) {}
+ MipsGOTParser(const ObjectFile *Obj, StreamWriter &W);
- void parseGOT(const Elf_Shdr &GOTShdr);
+ void parseGOT();
+ void parsePLT();
private:
typedef typename ObjectFile::Elf_Addr GOTEntry;
@@ -1173,35 +1187,79 @@ private:
const ObjectFile *Obj;
StreamWriter &W;
+ llvm::Optional<uint64_t> DtPltGot;
+ llvm::Optional<uint64_t> DtLocalGotNum;
+ llvm::Optional<uint64_t> DtGotSym;
+ llvm::Optional<uint64_t> DtMipsPltGot;
+ llvm::Optional<uint64_t> DtJmpRel;
std::size_t getGOTTotal(ArrayRef<uint8_t> GOT) const;
GOTIter makeGOTIter(ArrayRef<uint8_t> GOT, std::size_t EntryNum);
- bool getGOTTags(uint64_t &LocalGotNum, uint64_t &GotSym);
void printGotEntry(uint64_t GotAddr, GOTIter BeginIt, GOTIter It);
void printGlobalGotEntry(uint64_t GotAddr, GOTIter BeginIt, GOTIter It,
const Elf_Sym *Sym, bool IsDynamic);
+ void printPLTEntry(uint64_t PLTAddr, GOTIter BeginIt, GOTIter It,
+ StringRef Purpose);
+ void printPLTEntry(uint64_t PLTAddr, GOTIter BeginIt, GOTIter It,
+ const Elf_Sym *Sym);
};
}
template <class ELFT>
-void MipsGOTParser<ELFT>::parseGOT(const Elf_Shdr &GOTShdr) {
+MipsGOTParser<ELFT>::MipsGOTParser(const ObjectFile *Obj, StreamWriter &W)
+ : Obj(Obj), W(W) {
+ for (const auto &Entry : Obj->dynamic_table()) {
+ switch (Entry.getTag()) {
+ case ELF::DT_PLTGOT:
+ DtPltGot = Entry.getVal();
+ break;
+ case ELF::DT_MIPS_LOCAL_GOTNO:
+ DtLocalGotNum = Entry.getVal();
+ break;
+ case ELF::DT_MIPS_GOTSYM:
+ DtGotSym = Entry.getVal();
+ break;
+ case ELF::DT_MIPS_PLTGOT:
+ DtMipsPltGot = Entry.getVal();
+ break;
+ case ELF::DT_JMPREL:
+ DtJmpRel = Entry.getVal();
+ break;
+ }
+ }
+}
+
+template <class ELFT> void MipsGOTParser<ELFT>::parseGOT() {
// See "Global Offset Table" in Chapter 5 in the following document
// for detailed GOT description.
// ftp://www.linux-mips.org/pub/linux/mips/doc/ABI/mipsabi.pdf
+ if (!DtPltGot) {
+ W.startLine() << "Cannot find PLTGOT dynamic table tag.\n";
+ return;
+ }
+ if (!DtLocalGotNum) {
+ W.startLine() << "Cannot find MIPS_LOCAL_GOTNO dynamic table tag.\n";
+ return;
+ }
+ if (!DtGotSym) {
+ W.startLine() << "Cannot find MIPS_GOTSYM dynamic table tag.\n";
+ return;
+ }
- ErrorOr<ArrayRef<uint8_t>> GOT = Obj->getSectionContents(&GOTShdr);
- if (!GOT) {
- W.startLine() << "The .got section is empty.\n";
+ const Elf_Shdr *GOTShdr = findSectionByAddress(Obj, *DtPltGot);
+ if (!GOTShdr) {
+ W.startLine() << "There is no .got section in the file.\n";
return;
}
- uint64_t DtLocalGotNum;
- uint64_t DtGotSym;
- if (!getGOTTags(DtLocalGotNum, DtGotSym))
+ ErrorOr<ArrayRef<uint8_t>> GOT = Obj->getSectionContents(GOTShdr);
+ if (!GOT) {
+ W.startLine() << "The .got section is empty.\n";
return;
+ }
- if (DtLocalGotNum > getGOTTotal(*GOT)) {
+ if (*DtLocalGotNum > getGOTTotal(*GOT)) {
W.startLine() << "MIPS_LOCAL_GOTNO exceeds a number of GOT entries.\n";
return;
}
@@ -1210,37 +1268,37 @@ void MipsGOTParser<ELFT>::parseGOT(const Elf_Shdr &GOTShdr) {
const Elf_Sym *DynSymEnd = Obj->dynamic_symbol_end();
std::size_t DynSymTotal = std::size_t(std::distance(DynSymBegin, DynSymEnd));
- if (DtGotSym > DynSymTotal) {
+ if (*DtGotSym > DynSymTotal) {
W.startLine() << "MIPS_GOTSYM exceeds a number of dynamic symbols.\n";
return;
}
- std::size_t GlobalGotNum = DynSymTotal - DtGotSym;
+ std::size_t GlobalGotNum = DynSymTotal - *DtGotSym;
- if (DtLocalGotNum + GlobalGotNum > getGOTTotal(*GOT)) {
+ if (*DtLocalGotNum + GlobalGotNum > getGOTTotal(*GOT)) {
W.startLine() << "Number of global GOT entries exceeds the size of GOT.\n";
return;
}
GOTIter GotBegin = makeGOTIter(*GOT, 0);
- GOTIter GotLocalEnd = makeGOTIter(*GOT, DtLocalGotNum);
+ GOTIter GotLocalEnd = makeGOTIter(*GOT, *DtLocalGotNum);
GOTIter It = GotBegin;
DictScope GS(W, "Primary GOT");
- W.printHex("Canonical gp value", GOTShdr.sh_addr + 0x7ff0);
+ W.printHex("Canonical gp value", GOTShdr->sh_addr + 0x7ff0);
{
ListScope RS(W, "Reserved entries");
{
DictScope D(W, "Entry");
- printGotEntry(GOTShdr.sh_addr, GotBegin, It++);
+ printGotEntry(GOTShdr->sh_addr, GotBegin, It++);
W.printString("Purpose", StringRef("Lazy resolver"));
}
if (It != GotLocalEnd && (*It >> (sizeof(GOTEntry) * 8 - 1)) != 0) {
DictScope D(W, "Entry");
- printGotEntry(GOTShdr.sh_addr, GotBegin, It++);
+ printGotEntry(GOTShdr->sh_addr, GotBegin, It++);
W.printString("Purpose", StringRef("Module pointer (GNU extension)"));
}
}
@@ -1248,24 +1306,88 @@ void MipsGOTParser<ELFT>::parseGOT(const Elf_Shdr &GOTShdr) {
ListScope LS(W, "Local entries");
for (; It != GotLocalEnd; ++It) {
DictScope D(W, "Entry");
- printGotEntry(GOTShdr.sh_addr, GotBegin, It);
+ printGotEntry(GOTShdr->sh_addr, GotBegin, It);
}
}
{
ListScope GS(W, "Global entries");
- GOTIter GotGlobalEnd = makeGOTIter(*GOT, DtLocalGotNum + GlobalGotNum);
- const Elf_Sym *GotDynSym = DynSymBegin + DtGotSym;
+ GOTIter GotGlobalEnd = makeGOTIter(*GOT, *DtLocalGotNum + GlobalGotNum);
+ const Elf_Sym *GotDynSym = DynSymBegin + *DtGotSym;
for (; It != GotGlobalEnd; ++It) {
DictScope D(W, "Entry");
- printGlobalGotEntry(GOTShdr.sh_addr, GotBegin, It, GotDynSym++, true);
+ printGlobalGotEntry(GOTShdr->sh_addr, GotBegin, It, GotDynSym++, true);
}
}
- std::size_t SpecGotNum = getGOTTotal(*GOT) - DtLocalGotNum - GlobalGotNum;
+ std::size_t SpecGotNum = getGOTTotal(*GOT) - *DtLocalGotNum - GlobalGotNum;
W.printNumber("Number of TLS and multi-GOT entries", uint64_t(SpecGotNum));
}
+template <class ELFT> void MipsGOTParser<ELFT>::parsePLT() {
+ if (!DtMipsPltGot) {
+ W.startLine() << "Cannot find MIPS_PLTGOT dynamic table tag.\n";
+ return;
+ }
+ if (!DtJmpRel) {
+ W.startLine() << "Cannot find JMPREL dynamic table tag.\n";
+ return;
+ }
+
+ const Elf_Shdr *PLTShdr = findSectionByAddress(Obj, *DtMipsPltGot);
+ if (!PLTShdr) {
+ W.startLine() << "There is no .got.plt section in the file.\n";
+ return;
+ }
+ ErrorOr<ArrayRef<uint8_t>> PLT = Obj->getSectionContents(PLTShdr);
+ if (!PLT) {
+ W.startLine() << "The .got.plt section is empty.\n";
+ return;
+ }
+
+ const Elf_Shdr *PLTRelShdr = findSectionByAddress(Obj, *DtJmpRel);
+  if (!PLTRelShdr) {

+ W.startLine() << "There is no .rel.plt section in the file.\n";
+ return;
+ }
+
+ GOTIter PLTBegin = makeGOTIter(*PLT, 0);
+ GOTIter PLTEnd = makeGOTIter(*PLT, getGOTTotal(*PLT));
+ GOTIter It = PLTBegin;
+
+ DictScope GS(W, "PLT GOT");
+ {
+ ListScope RS(W, "Reserved entries");
+ printPLTEntry(PLTShdr->sh_addr, PLTBegin, It++, "PLT lazy resolver");
+ if (It != PLTEnd)
+ printPLTEntry(PLTShdr->sh_addr, PLTBegin, It++, "Module pointer");
+ }
+ {
+ ListScope GS(W, "Entries");
+
+ switch (PLTRelShdr->sh_type) {
+ case ELF::SHT_REL:
+ for (typename ObjectFile::Elf_Rel_Iter RI = Obj->rel_begin(PLTRelShdr),
+ RE = Obj->rel_end(PLTRelShdr);
+ RI != RE && It != PLTEnd; ++RI, ++It) {
+ const Elf_Sym *Sym =
+ Obj->getRelocationSymbol(&*PLTRelShdr, &*RI).second;
+ printPLTEntry(PLTShdr->sh_addr, PLTBegin, It, Sym);
+ }
+ break;
+ case ELF::SHT_RELA:
+ for (typename ObjectFile::Elf_Rela_Iter RI = Obj->rela_begin(PLTRelShdr),
+ RE = Obj->rela_end(PLTRelShdr);
+ RI != RE && It != PLTEnd; ++RI, ++It) {
+ const Elf_Sym *Sym =
+ Obj->getRelocationSymbol(&*PLTRelShdr, &*RI).second;
+ printPLTEntry(PLTShdr->sh_addr, PLTBegin, It, Sym);
+ }
+ break;
+ }
+ }
+}
+
template <class ELFT>
std::size_t MipsGOTParser<ELFT>::getGOTTotal(ArrayRef<uint8_t> GOT) const {
return GOT.size() / sizeof(GOTEntry);
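Editorial note, not part of the patch: the bounds checks in parseGOT() above partition the primary GOT into reserved-plus-local entries (DT_MIPS_LOCAL_GOTNO), one global entry per dynamic symbol with index >= DT_MIPS_GOTSYM, and whatever remains (TLS and multi-GOT entries). A worked example with invented numbers:

// Sketch only: GOT partitioning as checked in parseGOT(), with made-up values.
#include <cstddef>
#include <cstdio>

int main() {
  const std::size_t GotTotal      = 40; // .got size / GOT entry size
  const std::size_t DtLocalGotNum = 10; // DT_MIPS_LOCAL_GOTNO (reserved + local)
  const std::size_t DynSymTotal   = 50; // dynamic symbol count
  const std::size_t DtGotSym      = 30; // DT_MIPS_GOTSYM (first symbol with a GOT entry)

  const std::size_t GlobalGotNum = DynSymTotal - DtGotSym;                  // 20
  const std::size_t SpecGotNum   = GotTotal - DtLocalGotNum - GlobalGotNum; // 10
  // parseGOT() rejects the file unless DtGotSym <= DynSymTotal and
  // DtLocalGotNum + GlobalGotNum <= GotTotal.
  std::printf("local %zu, global %zu, TLS/multi-GOT %zu\n",
              DtLocalGotNum, GlobalGotNum, SpecGotNum);
}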
@@ -1279,36 +1401,6 @@ MipsGOTParser<ELFT>::makeGOTIter(ArrayRef<uint8_t> GOT, std::size_t EntryNum) {
}
template <class ELFT>
-bool MipsGOTParser<ELFT>::getGOTTags(uint64_t &LocalGotNum, uint64_t &GotSym) {
- bool FoundLocalGotNum = false;
- bool FoundGotSym = false;
- for (const auto &Entry : Obj->dynamic_table()) {
- switch (Entry.getTag()) {
- case ELF::DT_MIPS_LOCAL_GOTNO:
- LocalGotNum = Entry.getVal();
- FoundLocalGotNum = true;
- break;
- case ELF::DT_MIPS_GOTSYM:
- GotSym = Entry.getVal();
- FoundGotSym = true;
- break;
- }
- }
-
- if (!FoundLocalGotNum) {
- W.startLine() << "Cannot find MIPS_LOCAL_GOTNO dynamic table tag.\n";
- return false;
- }
-
- if (!FoundGotSym) {
- W.startLine() << "Cannot find MIPS_GOTSYM dynamic table tag.\n";
- return false;
- }
-
- return true;
-}
-
-template <class ELFT>
void MipsGOTParser<ELFT>::printGotEntry(uint64_t GotAddr, GOTIter BeginIt,
GOTIter It) {
int64_t Offset = std::distance(BeginIt, It) * sizeof(GOTEntry);
@@ -1335,32 +1427,44 @@ void MipsGOTParser<ELFT>::printGlobalGotEntry(uint64_t GotAddr, GOTIter BeginIt,
W.printNumber("Name", FullSymbolName, Sym->st_name);
}
-template <class ELFT> void ELFDumper<ELFT>::printMipsPLTGOT() {
- if (Obj->getHeader()->e_machine != EM_MIPS) {
- W.startLine() << "MIPS PLT GOT is available for MIPS targets only.\n";
- return;
- }
+template <class ELFT>
+void MipsGOTParser<ELFT>::printPLTEntry(uint64_t PLTAddr, GOTIter BeginIt,
+ GOTIter It, StringRef Purpose) {
+ DictScope D(W, "Entry");
+ int64_t Offset = std::distance(BeginIt, It) * sizeof(GOTEntry);
+ W.printHex("Address", PLTAddr + Offset);
+ W.printHex("Initial", *It);
+ W.printString("Purpose", Purpose);
+}
- llvm::Optional<uint64_t> DtPltGot;
- for (const auto &Entry : Obj->dynamic_table()) {
- if (Entry.getTag() == ELF::DT_PLTGOT) {
- DtPltGot = Entry.getVal();
- break;
- }
- }
+template <class ELFT>
+void MipsGOTParser<ELFT>::printPLTEntry(uint64_t PLTAddr, GOTIter BeginIt,
+ GOTIter It, const Elf_Sym *Sym) {
+ DictScope D(W, "Entry");
+ int64_t Offset = std::distance(BeginIt, It) * sizeof(GOTEntry);
+ W.printHex("Address", PLTAddr + Offset);
+ W.printHex("Initial", *It);
+ W.printHex("Value", Sym->st_value);
+ W.printEnum("Type", Sym->getType(), makeArrayRef(ElfSymbolTypes));
- if (!DtPltGot) {
- W.startLine() << "Cannot find PLTGOT dynamic table tag.\n";
- return;
- }
+ unsigned SectionIndex = 0;
+ StringRef SectionName;
+ getSectionNameIndex(*Obj, Sym, SectionName, SectionIndex);
+ W.printHex("Section", SectionName, SectionIndex);
- const Elf_Shdr *GotShdr = findSectionByAddress(Obj, *DtPltGot);
- if (!GotShdr) {
- W.startLine() << "There is no .got section in the file.\n";
+ std::string FullSymbolName = getFullSymbolName(*Obj, Sym, true);
+ W.printNumber("Name", FullSymbolName, Sym->st_name);
+}
+
+template <class ELFT> void ELFDumper<ELFT>::printMipsPLTGOT() {
+ if (Obj->getHeader()->e_machine != EM_MIPS) {
+ W.startLine() << "MIPS PLT GOT is available for MIPS targets only.\n";
return;
}
- MipsGOTParser<ELFT>(Obj, W).parseGOT(*GotShdr);
+ MipsGOTParser<ELFT> GOTParser(Obj, W);
+ GOTParser.parseGOT();
+ GOTParser.parsePLT();
}
static const EnumEntry<unsigned> ElfMipsISAExtType[] = {
diff --git a/tools/llvm-readobj/ObjDumper.h b/tools/llvm-readobj/ObjDumper.h
index 27e15b256cc5..5ecf0ec3d6fa 100644
--- a/tools/llvm-readobj/ObjDumper.h
+++ b/tools/llvm-readobj/ObjDumper.h
@@ -37,6 +37,7 @@ public:
virtual void printDynamicTable() { }
virtual void printNeededLibraries() { }
virtual void printProgramHeaders() { }
+ virtual void printHashTable() { }
// Only implemented for ARM ELF at this time.
virtual void printAttributes() { }
diff --git a/tools/llvm-readobj/StreamWriter.h b/tools/llvm-readobj/StreamWriter.h
index 245588ba0600..f3cc57ef940e 100644
--- a/tools/llvm-readobj/StreamWriter.h
+++ b/tools/llvm-readobj/StreamWriter.h
@@ -181,8 +181,8 @@ public:
startLine() << Label << ": " << (Value ? "Yes" : "No") << '\n';
}
- template <typename T_>
- void printList(StringRef Label, const SmallVectorImpl<T_> &List) {
+ template <typename T>
+ void printList(StringRef Label, const T &List) {
startLine() << Label << ": [";
bool Comma = false;
for (const auto &Item : List) {
diff --git a/tools/llvm-readobj/Win64EHDumper.cpp b/tools/llvm-readobj/Win64EHDumper.cpp
index 5a8af4135bd7..f57eea20e2d9 100644
--- a/tools/llvm-readobj/Win64EHDumper.cpp
+++ b/tools/llvm-readobj/Win64EHDumper.cpp
@@ -144,8 +144,10 @@ static std::error_code resolveRelocation(const Dumper::Context &Ctx,
Ctx.ResolveSymbol(Section, Offset, Symbol, Ctx.UserData))
return EC;
- if (std::error_code EC = Symbol.getAddress(ResolvedAddress))
+ ErrorOr<uint64_t> ResolvedAddressOrErr = Symbol.getAddress();
+ if (std::error_code EC = ResolvedAddressOrErr.getError())
return EC;
+ ResolvedAddress = *ResolvedAddressOrErr;
section_iterator SI = Ctx.COFF.section_begin();
if (std::error_code EC = Symbol.getSection(SI))
diff --git a/tools/llvm-readobj/llvm-readobj.cpp b/tools/llvm-readobj/llvm-readobj.cpp
index c5bccf979609..12afacb0a858 100644
--- a/tools/llvm-readobj/llvm-readobj.cpp
+++ b/tools/llvm-readobj/llvm-readobj.cpp
@@ -127,6 +127,10 @@ namespace opts {
cl::opt<bool> ProgramHeaders("program-headers",
cl::desc("Display ELF program headers"));
+ // -hash-table
+ cl::opt<bool> HashTable("hash-table",
+ cl::desc("Display ELF hash table"));
+
// -expand-relocs
cl::opt<bool> ExpandRelocs("expand-relocs",
cl::desc("Expand each shown relocation to multiple lines"));
@@ -199,9 +203,7 @@ bool error(std::error_code EC) {
}
bool relocAddressLess(RelocationRef a, RelocationRef b) {
- uint64_t a_addr = a.getOffset();
- uint64_t b_addr = b.getOffset();
- return a_addr < b_addr;
+ return a.getOffset() < b.getOffset();
}
} // namespace llvm
@@ -302,6 +304,8 @@ static void dumpObject(const ObjectFile *Obj) {
Dumper->printNeededLibraries();
if (opts::ProgramHeaders)
Dumper->printProgramHeaders();
+ if (opts::HashTable)
+ Dumper->printHashTable();
if (Obj->getArch() == llvm::Triple::arm && Obj->isELF())
if (opts::ARMAttributes)
Dumper->printAttributes();
diff --git a/tools/llvm-rtdyld/llvm-rtdyld.cpp b/tools/llvm-rtdyld/llvm-rtdyld.cpp
index 98c6f5c4399c..86f66f89b159 100644
--- a/tools/llvm-rtdyld/llvm-rtdyld.cpp
+++ b/tools/llvm-rtdyld/llvm-rtdyld.cpp
@@ -115,8 +115,17 @@ TargetSectionSep("target-section-sep",
static cl::list<std::string>
SpecificSectionMappings("map-section",
- cl::desc("Map a section to a specific address."),
- cl::ZeroOrMore);
+ cl::desc("For -verify only: Map a section to a "
+ "specific address."),
+ cl::ZeroOrMore,
+ cl::Hidden);
+
+static cl::list<std::string>
+DummySymbolMappings("dummy-extern",
+ cl::desc("For -verify only: Inject a symbol into the extern "
+ "symbol table."),
+ cl::ZeroOrMore,
+ cl::Hidden);
/* *** */
@@ -147,10 +156,25 @@ public:
// relocations) will get to the data cache but not to the instruction cache.
virtual void invalidateInstructionCache();
+ void addDummySymbol(const std::string &Name, uint64_t Addr) {
+ DummyExterns[Name] = Addr;
+ }
+
+ RuntimeDyld::SymbolInfo findSymbol(const std::string &Name) override {
+ auto I = DummyExterns.find(Name);
+
+ if (I != DummyExterns.end())
+ return RuntimeDyld::SymbolInfo(I->second, JITSymbolFlags::Exported);
+
+ return RTDyldMemoryManager::findSymbol(Name);
+ }
+
void registerEHFrames(uint8_t *Addr, uint64_t LoadAddr,
size_t Size) override {}
void deregisterEHFrames(uint8_t *Addr, uint64_t LoadAddr,
size_t Size) override {}
+private:
+ std::map<std::string, uint64_t> DummyExterns;
};
uint8_t *TrivialMemoryManager::allocateCodeSection(uintptr_t Size,
@@ -269,9 +293,10 @@ static int printLineInfoForInput(bool LoadObjects, bool UseDebugObj) {
ErrorOr<StringRef> Name = Sym.getName();
if (!Name)
continue;
- uint64_t Addr;
- if (Sym.getAddress(Addr))
+ ErrorOr<uint64_t> AddrOrErr = Sym.getAddress();
+ if (!AddrOrErr)
continue;
+ uint64_t Addr = *AddrOrErr;
uint64_t Size = P.second;
// If we're not using the debug object, compute the address of the
@@ -400,7 +425,7 @@ applySpecificSectionMappings(RuntimeDyldChecker &Checker) {
for (StringRef Mapping : SpecificSectionMappings) {
size_t EqualsIdx = Mapping.find_first_of("=");
- StringRef SectionIDStr = Mapping.substr(0, EqualsIdx);
+ std::string SectionIDStr = Mapping.substr(0, EqualsIdx);
size_t ComaIdx = Mapping.find_first_of(",");
if (ComaIdx == StringRef::npos) {
@@ -409,8 +434,8 @@ applySpecificSectionMappings(RuntimeDyldChecker &Checker) {
exit(1);
}
- StringRef FileName = SectionIDStr.substr(0, ComaIdx);
- StringRef SectionName = SectionIDStr.substr(ComaIdx + 1);
+ std::string FileName = SectionIDStr.substr(0, ComaIdx);
+ std::string SectionName = SectionIDStr.substr(ComaIdx + 1);
uint64_t OldAddrInt;
std::string ErrorMsg;
@@ -424,11 +449,11 @@ applySpecificSectionMappings(RuntimeDyldChecker &Checker) {
void* OldAddr = reinterpret_cast<void*>(static_cast<uintptr_t>(OldAddrInt));
- StringRef NewAddrStr = Mapping.substr(EqualsIdx + 1);
+ std::string NewAddrStr = Mapping.substr(EqualsIdx + 1);
uint64_t NewAddr;
- if (NewAddrStr.getAsInteger(0, NewAddr)) {
- errs() << "Invalid section address in mapping: " << Mapping << "\n";
+ if (StringRef(NewAddrStr).getAsInteger(0, NewAddr)) {
+ errs() << "Invalid section address in mapping '" << Mapping << "'.\n";
exit(1);
}
@@ -450,9 +475,9 @@ applySpecificSectionMappings(RuntimeDyldChecker &Checker) {
// Defaults to zero. Set to something big
// (e.g. 1 << 32) to stress-test stubs, GOTs, etc.
//
-static void remapSections(const llvm::Triple &TargetTriple,
- const TrivialMemoryManager &MemMgr,
- RuntimeDyldChecker &Checker) {
+static void remapSectionsAndSymbols(const llvm::Triple &TargetTriple,
+ TrivialMemoryManager &MemMgr,
+ RuntimeDyldChecker &Checker) {
// Set up a work list (section addr/size pairs).
typedef std::list<std::pair<void*, uint64_t>> WorklistT;
@@ -515,6 +540,27 @@ static void remapSections(const llvm::Triple &TargetTriple,
Checker.getRTDyld().mapSectionAddress(CurEntry.first, NextSectionAddr);
}
+ // Add dummy symbols to the memory manager.
+ for (const auto &Mapping : DummySymbolMappings) {
+ size_t EqualsIdx = Mapping.find_first_of("=");
+
+ if (EqualsIdx == StringRef::npos) {
+ errs() << "Invalid dummy symbol specification '" << Mapping
+ << "'. Should be '<symbol name>=<addr>'\n";
+ exit(1);
+ }
+
+ std::string Symbol = Mapping.substr(0, EqualsIdx);
+ std::string AddrStr = Mapping.substr(EqualsIdx + 1);
+
+ uint64_t Addr;
+ if (StringRef(AddrStr).getAsInteger(0, Addr)) {
+ errs() << "Invalid symbol mapping '" << Mapping << "'.\n";
+ exit(1);
+ }
+
+ MemMgr.addDummySymbol(Symbol, Addr);
+ }
}
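Editorial note, not part of the patch: the -dummy-extern loop above splits each mapping at the first '=' and parses the address with getAsInteger(0, ...), so hex, octal, and decimal forms are all accepted. An equivalent minimal sketch using StringRef::split (illustrative only; helper name is invented):

// Sketch only: parse "<symbol name>=<addr>" the same way the loop above does.
#include "llvm/ADT/StringRef.h"
#include <cstdint>
#include <string>

static bool parseDummyExtern(llvm::StringRef Mapping, std::string &Symbol,
                             uint64_t &Addr) {
  auto Split = Mapping.split('=');   // e.g. ("foo", "0x1234"); second is empty if no '='
  if (Split.second.empty())
    return false;                    // malformed: no '=' or empty address
  Symbol = Split.first.str();
  return !Split.second.getAsInteger(0, Addr); // radix 0: 0x.., 0.., or decimal
}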
// Load and link the objects specified on the command line, but do not execute
@@ -603,8 +649,9 @@ static int linkAndVerify() {
}
}
- // Re-map the section addresses into the phony target address space.
- remapSections(TheTriple, MemMgr, Checker);
+ // Re-map the section addresses into the phony target address space and add
+ // dummy symbols.
+ remapSectionsAndSymbols(TheTriple, MemMgr, Checker);
// Resolve all the relocations we can.
Dyld.resolveRelocations();
diff --git a/tools/llvm-shlib/CMakeLists.txt b/tools/llvm-shlib/CMakeLists.txt
index bc1b658ba726..54d71d3f6320 100644
--- a/tools/llvm-shlib/CMakeLists.txt
+++ b/tools/llvm-shlib/CMakeLists.txt
@@ -15,6 +15,8 @@ if(NOT DEFINED LLVM_DYLIB_COMPONENTS)
BitWriter
CodeGen
Core
+ DebugInfoDWARF
+ DebugInfoPDB
ExecutionEngine
IPA
IPO
diff --git a/tools/llvm-stress/llvm-stress.cpp b/tools/llvm-stress/llvm-stress.cpp
index 727d03f9d6ea..6a1a248a0572 100644
--- a/tools/llvm-stress/llvm-stress.cpp
+++ b/tools/llvm-stress/llvm-stress.cpp
@@ -31,7 +31,8 @@
#include <set>
#include <sstream>
#include <vector>
-using namespace llvm;
+
+namespace llvm {
static cl::opt<unsigned> SeedCL("seed",
cl::desc("Seed used for randomness"), cl::init(0));
@@ -42,16 +43,39 @@ static cl::opt<std::string>
OutputFilename("o", cl::desc("Override output filename"),
cl::value_desc("filename"));
-static cl::opt<bool> GenHalfFloat("generate-half-float",
- cl::desc("Generate half-length floating-point values"), cl::init(false));
-static cl::opt<bool> GenX86FP80("generate-x86-fp80",
- cl::desc("Generate 80-bit X86 floating-point values"), cl::init(false));
-static cl::opt<bool> GenFP128("generate-fp128",
- cl::desc("Generate 128-bit floating-point values"), cl::init(false));
-static cl::opt<bool> GenPPCFP128("generate-ppc-fp128",
- cl::desc("Generate 128-bit PPC floating-point values"), cl::init(false));
-static cl::opt<bool> GenX86MMX("generate-x86-mmx",
- cl::desc("Generate X86 MMX floating-point values"), cl::init(false));
+namespace cl {
+template <> class parser<Type*> final : public basic_parser<Type*> {
+public:
+ parser(Option &O) : basic_parser(O) {}
+
+ // Parse options as IR types. Return true on error.
+ bool parse(Option &O, StringRef, StringRef Arg, Type *&Value) {
+ auto &Context = getGlobalContext();
+ if (Arg == "half") Value = Type::getHalfTy(Context);
+ else if (Arg == "fp128") Value = Type::getFP128Ty(Context);
+ else if (Arg == "x86_fp80") Value = Type::getX86_FP80Ty(Context);
+ else if (Arg == "ppc_fp128") Value = Type::getPPC_FP128Ty(Context);
+ else if (Arg == "x86_mmx") Value = Type::getX86_MMXTy(Context);
+ else if (Arg.startswith("i")) {
+ unsigned N = 0;
+ Arg.drop_front().getAsInteger(10, N);
+ if (N > 0)
+ Value = Type::getIntNTy(Context, N);
+ }
+
+ if (!Value)
+ return O.error("Invalid IR scalar type: '" + Arg + "'!");
+ return false;
+ }
+
+ const char *getValueName() const override { return "IR scalar type"; }
+};
+}
+
+
+static cl::list<Type*> AdditionalScalarTypes("types", cl::CommaSeparated,
+ cl::desc("Additional IR scalar types "
+ "(always includes i1, i8, i16, i32, i64, float and double)"));
namespace {
/// A utility class to provide a pseudo-random number generator which is
@@ -243,35 +267,22 @@ protected:
/// Pick a random scalar type.
Type *pickScalarType() {
- Type *t = nullptr;
- do {
- switch (Ran->Rand() % 30) {
- case 0: t = Type::getInt1Ty(Context); break;
- case 1: t = Type::getInt8Ty(Context); break;
- case 2: t = Type::getInt16Ty(Context); break;
- case 3: case 4:
- case 5: t = Type::getFloatTy(Context); break;
- case 6: case 7:
- case 8: t = Type::getDoubleTy(Context); break;
- case 9: case 10:
- case 11: t = Type::getInt32Ty(Context); break;
- case 12: case 13:
- case 14: t = Type::getInt64Ty(Context); break;
- case 15: case 16:
- case 17: if (GenHalfFloat) t = Type::getHalfTy(Context); break;
- case 18: case 19:
- case 20: if (GenX86FP80) t = Type::getX86_FP80Ty(Context); break;
- case 21: case 22:
- case 23: if (GenFP128) t = Type::getFP128Ty(Context); break;
- case 24: case 25:
- case 26: if (GenPPCFP128) t = Type::getPPC_FP128Ty(Context); break;
- case 27: case 28:
- case 29: if (GenX86MMX) t = Type::getX86_MMXTy(Context); break;
- default: llvm_unreachable("Invalid scalar value");
- }
- } while (t == nullptr);
+ static std::vector<Type*> ScalarTypes;
+ if (ScalarTypes.empty()) {
+ ScalarTypes.assign({
+ Type::getInt1Ty(Context),
+ Type::getInt8Ty(Context),
+ Type::getInt16Ty(Context),
+ Type::getInt32Ty(Context),
+ Type::getInt64Ty(Context),
+ Type::getFloatTy(Context),
+ Type::getDoubleTy(Context)
+ });
+ ScalarTypes.insert(ScalarTypes.end(),
+ AdditionalScalarTypes.begin(), AdditionalScalarTypes.end());
+ }
- return t;
+ return ScalarTypes[Ran->Rand() % ScalarTypes.size()];
}
/// Basic block to populate
@@ -665,9 +676,13 @@ static void IntroduceControlFlow(Function *F, Random &R) {
}
}
+}
+
int main(int argc, char **argv) {
+ using namespace llvm;
+
// Init LLVM, call llvm_shutdown() on exit, parse args, etc.
- llvm::PrettyStackTraceProgram X(argc, argv);
+ PrettyStackTraceProgram X(argc, argv);
cl::ParseCommandLineOptions(argc, argv, "llvm codegen stress-tester\n");
llvm_shutdown_obj Y;
diff --git a/tools/llvm-symbolizer/LLVMSymbolize.cpp b/tools/llvm-symbolizer/LLVMSymbolize.cpp
index ec3fe4868db3..c57c219b11d2 100644
--- a/tools/llvm-symbolizer/LLVMSymbolize.cpp
+++ b/tools/llvm-symbolizer/LLVMSymbolize.cpp
@@ -84,10 +84,10 @@ void ModuleInfo::addSymbol(const SymbolRef &Symbol, uint64_t SymbolSize,
SymbolRef::Type SymbolType = Symbol.getType();
if (SymbolType != SymbolRef::ST_Function && SymbolType != SymbolRef::ST_Data)
return;
- uint64_t SymbolAddress;
- if (error(Symbol.getAddress(SymbolAddress)) ||
- SymbolAddress == UnknownAddress)
+ ErrorOr<uint64_t> SymbolAddressOrErr = Symbol.getAddress();
+ if (error(SymbolAddressOrErr.getError()))
return;
+ uint64_t SymbolAddress = *SymbolAddressOrErr;
if (OpdExtractor) {
// For big-endian PowerPC64 ELF, symbols in the .opd section refer to
// function descriptors. The first word of the descriptor is a pointer to
diff --git a/tools/obj2yaml/elf2yaml.cpp b/tools/obj2yaml/elf2yaml.cpp
index 9afcedef6398..f117a10d3822 100644
--- a/tools/obj2yaml/elf2yaml.cpp
+++ b/tools/obj2yaml/elf2yaml.cpp
@@ -40,6 +40,7 @@ class ELFDumper {
ErrorOr<ELFYAML::RelocationSection *> dumpRelaSection(const Elf_Shdr *Shdr);
ErrorOr<ELFYAML::RawContentSection *>
dumpContentSection(const Elf_Shdr *Shdr);
+ ErrorOr<ELFYAML::NoBitsSection *> dumpNoBitsSection(const Elf_Shdr *Shdr);
ErrorOr<ELFYAML::Group *> dumpGroup(const Elf_Shdr *Shdr);
ErrorOr<ELFYAML::MipsABIFlags *> dumpMipsABIFlags(const Elf_Shdr *Shdr);
@@ -104,6 +105,13 @@ ErrorOr<ELFYAML::Object *> ELFDumper<ELFT>::dump() {
Y->Sections.push_back(std::unique_ptr<ELFYAML::Section>(G.get()));
break;
}
+ case ELF::SHT_NOBITS: {
+ ErrorOr<ELFYAML::NoBitsSection *> S = dumpNoBitsSection(&Sec);
+ if (std::error_code EC = S.getError())
+ return EC;
+ Y->Sections.push_back(std::unique_ptr<ELFYAML::Section>(S.get()));
+ break;
+ }
default: {
ErrorOr<ELFYAML::RawContentSection *> S = dumpContentSection(&Sec);
if (std::error_code EC = S.getError())
@@ -305,6 +313,18 @@ ELFDumper<ELFT>::dumpContentSection(const Elf_Shdr *Shdr) {
}
template <class ELFT>
+ErrorOr<ELFYAML::NoBitsSection *>
+ELFDumper<ELFT>::dumpNoBitsSection(const Elf_Shdr *Shdr) {
+ auto S = make_unique<ELFYAML::NoBitsSection>();
+
+ if (std::error_code EC = dumpCommonSection(Shdr, *S))
+ return EC;
+ S->Size = Shdr->sh_size;
+
+ return S.release();
+}
+
+template <class ELFT>
ErrorOr<ELFYAML::Group *> ELFDumper<ELFT>::dumpGroup(const Elf_Shdr *Shdr) {
auto S = make_unique<ELFYAML::Group>();
diff --git a/tools/opt/opt.cpp b/tools/opt/opt.cpp
index 55426e7b2743..0db60d144409 100644
--- a/tools/opt/opt.cpp
+++ b/tools/opt/opt.cpp
@@ -324,6 +324,7 @@ int main(int argc, char **argv) {
initializeRewriteSymbolsPass(Registry);
initializeWinEHPreparePass(Registry);
initializeDwarfEHPreparePass(Registry);
+ initializeSjLjEHPreparePass(Registry);
#ifdef LINK_POLLY_INTO_TOOLS
polly::initializePollyPasses(Registry);
diff --git a/tools/yaml2obj/yaml2elf.cpp b/tools/yaml2obj/yaml2elf.cpp
index 772b5b918eca..a247f48c053d 100644
--- a/tools/yaml2obj/yaml2elf.cpp
+++ b/tools/yaml2obj/yaml2elf.cpp
@@ -35,6 +35,8 @@ class ContiguousBlobAccumulator {
/// \returns The new offset.
uint64_t padToAlignment(unsigned Align) {
+ if (Align == 0)
+ Align = 1;
uint64_t CurrentOffset = InitialOffset + OS.tell();
uint64_t AlignedOffset = RoundUpToAlignment(CurrentOffset, Align);
for (; CurrentOffset != AlignedOffset; ++CurrentOffset)
@@ -46,7 +48,7 @@ public:
ContiguousBlobAccumulator(uint64_t InitialOffset_)
: InitialOffset(InitialOffset_), Buf(), OS(Buf) {}
template <class Integer>
- raw_ostream &getOSAndAlignedOffset(Integer &Offset, unsigned Align = 16) {
+ raw_ostream &getOSAndAlignedOffset(Integer &Offset, unsigned Align) {
Offset = padToAlignment(Align);
return OS;
}
@@ -241,6 +243,12 @@ bool ELFState<ELFT>::initSectionHeaders(std::vector<Elf_Shdr> &SHeaders,
} else if (auto S = dyn_cast<ELFYAML::MipsABIFlags>(Sec.get())) {
if (!writeSectionContent(SHeader, *S, CBA))
return false;
+ } else if (auto S = dyn_cast<ELFYAML::NoBitsSection>(Sec.get())) {
+ SHeader.sh_entsize = 0;
+ SHeader.sh_size = S->Size;
+      // An SHT_NOBITS section has no content,
+      // so just set up the section offset.
+ CBA.getOSAndAlignedOffset(SHeader.sh_offset, SHeader.sh_addralign);
} else
llvm_unreachable("Unknown section type");
@@ -259,6 +267,7 @@ void ELFState<ELFT>::initSymtabSectionHeader(Elf_Shdr &SHeader,
// One greater than symbol table index of the last local symbol.
SHeader.sh_info = Doc.Symbols.Local.size() + 1;
SHeader.sh_entsize = sizeof(Elf_Sym);
+ SHeader.sh_addralign = 8;
std::vector<Elf_Sym> Syms;
{
@@ -281,8 +290,9 @@ void ELFState<ELFT>::initSymtabSectionHeader(Elf_Shdr &SHeader,
addSymbols(Doc.Symbols.Global, Syms, ELF::STB_GLOBAL);
addSymbols(Doc.Symbols.Weak, Syms, ELF::STB_WEAK);
- writeArrayData(CBA.getOSAndAlignedOffset(SHeader.sh_offset),
- makeArrayRef(Syms));
+ writeArrayData(
+ CBA.getOSAndAlignedOffset(SHeader.sh_offset, SHeader.sh_addralign),
+ makeArrayRef(Syms));
SHeader.sh_size = arrayDataSize(makeArrayRef(Syms));
}
@@ -293,7 +303,8 @@ void ELFState<ELFT>::initStrtabSectionHeader(Elf_Shdr &SHeader, StringRef Name,
zero(SHeader);
SHeader.sh_name = DotShStrtab.getOffset(Name);
SHeader.sh_type = ELF::SHT_STRTAB;
- CBA.getOSAndAlignedOffset(SHeader.sh_offset) << STB.data();
+ CBA.getOSAndAlignedOffset(SHeader.sh_offset, SHeader.sh_addralign)
+ << STB.data();
SHeader.sh_size = STB.data().size();
SHeader.sh_addralign = 1;
}
@@ -331,7 +342,8 @@ ELFState<ELFT>::writeSectionContent(Elf_Shdr &SHeader,
ContiguousBlobAccumulator &CBA) {
assert(Section.Size >= Section.Content.binary_size() &&
"Section size and section content are inconsistent");
- raw_ostream &OS = CBA.getOSAndAlignedOffset(SHeader.sh_offset);
+ raw_ostream &OS =
+ CBA.getOSAndAlignedOffset(SHeader.sh_offset, SHeader.sh_addralign);
Section.Content.writeAsBinary(OS);
for (auto i = Section.Content.binary_size(); i < Section.Size; ++i)
OS.write(0);
@@ -358,7 +370,7 @@ ELFState<ELFT>::writeSectionContent(Elf_Shdr &SHeader,
SHeader.sh_entsize = IsRela ? sizeof(Elf_Rela) : sizeof(Elf_Rel);
SHeader.sh_size = SHeader.sh_entsize * Section.Relocations.size();
- auto &OS = CBA.getOSAndAlignedOffset(SHeader.sh_offset);
+ auto &OS = CBA.getOSAndAlignedOffset(SHeader.sh_offset, SHeader.sh_addralign);
for (const auto &Rel : Section.Relocations) {
unsigned SymIdx = 0;
@@ -396,7 +408,7 @@ bool ELFState<ELFT>::writeSectionContent(Elf_Shdr &SHeader,
SHeader.sh_entsize = sizeof(Elf_Word);
SHeader.sh_size = SHeader.sh_entsize * Section.Members.size();
- auto &OS = CBA.getOSAndAlignedOffset(SHeader.sh_offset);
+ auto &OS = CBA.getOSAndAlignedOffset(SHeader.sh_offset, SHeader.sh_addralign);
for (auto member : Section.Members) {
Elf_Word SIdx;
@@ -427,7 +439,7 @@ bool ELFState<ELFT>::writeSectionContent(Elf_Shdr &SHeader,
SHeader.sh_entsize = sizeof(Flags);
SHeader.sh_size = SHeader.sh_entsize;
- auto &OS = CBA.getOSAndAlignedOffset(SHeader.sh_offset);
+ auto &OS = CBA.getOSAndAlignedOffset(SHeader.sh_offset, SHeader.sh_addralign);
Flags.version = Section.Version;
Flags.isa_level = Section.ISALevel;
Flags.isa_rev = Section.ISARevision;
diff --git a/unittests/ADT/TripleTest.cpp b/unittests/ADT/TripleTest.cpp
index d50768412d74..2b1e871b94cb 100644
--- a/unittests/ADT/TripleTest.cpp
+++ b/unittests/ADT/TripleTest.cpp
@@ -553,6 +553,84 @@ TEST(TripleTest, BitWidthArchVariants) {
EXPECT_EQ(Triple::wasm64, T.get64BitArchVariant().getArch());
}
+TEST(TripleTest, EndianArchVariants) {
+ Triple T;
+ EXPECT_EQ(Triple::UnknownArch, T.getBigEndianArchVariant().getArch());
+ EXPECT_EQ(Triple::UnknownArch, T.getLittleEndianArchVariant().getArch());
+
+ T.setArch(Triple::UnknownArch);
+ EXPECT_EQ(Triple::UnknownArch, T.getBigEndianArchVariant().getArch());
+ EXPECT_EQ(Triple::UnknownArch, T.getLittleEndianArchVariant().getArch());
+
+ T.setArch(Triple::aarch64_be);
+ EXPECT_EQ(Triple::aarch64_be, T.getBigEndianArchVariant().getArch());
+ EXPECT_EQ(Triple::aarch64, T.getLittleEndianArchVariant().getArch());
+
+ T.setArch(Triple::aarch64);
+ EXPECT_EQ(Triple::aarch64_be, T.getBigEndianArchVariant().getArch());
+ EXPECT_EQ(Triple::aarch64, T.getLittleEndianArchVariant().getArch());
+
+ T.setArch(Triple::armeb);
+ EXPECT_EQ(Triple::armeb, T.getBigEndianArchVariant().getArch());
+ EXPECT_EQ(Triple::UnknownArch, T.getLittleEndianArchVariant().getArch());
+
+ T.setArch(Triple::arm);
+ EXPECT_EQ(Triple::UnknownArch, T.getBigEndianArchVariant().getArch());
+ EXPECT_EQ(Triple::arm, T.getLittleEndianArchVariant().getArch());
+
+ T.setArch(Triple::bpfeb);
+ EXPECT_EQ(Triple::bpfeb, T.getBigEndianArchVariant().getArch());
+ EXPECT_EQ(Triple::bpfel, T.getLittleEndianArchVariant().getArch());
+
+ T.setArch(Triple::bpfel);
+ EXPECT_EQ(Triple::bpfeb, T.getBigEndianArchVariant().getArch());
+ EXPECT_EQ(Triple::bpfel, T.getLittleEndianArchVariant().getArch());
+
+ T.setArch(Triple::mips64);
+ EXPECT_EQ(Triple::mips64, T.getBigEndianArchVariant().getArch());
+ EXPECT_EQ(Triple::mips64el, T.getLittleEndianArchVariant().getArch());
+
+ T.setArch(Triple::mips64el);
+ EXPECT_EQ(Triple::mips64, T.getBigEndianArchVariant().getArch());
+ EXPECT_EQ(Triple::mips64el, T.getLittleEndianArchVariant().getArch());
+
+ T.setArch(Triple::mips);
+ EXPECT_EQ(Triple::mips, T.getBigEndianArchVariant().getArch());
+ EXPECT_EQ(Triple::mipsel, T.getLittleEndianArchVariant().getArch());
+
+ T.setArch(Triple::mipsel);
+ EXPECT_EQ(Triple::mips, T.getBigEndianArchVariant().getArch());
+ EXPECT_EQ(Triple::mipsel, T.getLittleEndianArchVariant().getArch());
+
+ T.setArch(Triple::ppc);
+ EXPECT_EQ(Triple::ppc, T.getBigEndianArchVariant().getArch());
+ EXPECT_EQ(Triple::UnknownArch, T.getLittleEndianArchVariant().getArch());
+
+ T.setArch(Triple::ppc64);
+ EXPECT_EQ(Triple::ppc64, T.getBigEndianArchVariant().getArch());
+ EXPECT_EQ(Triple::ppc64le, T.getLittleEndianArchVariant().getArch());
+
+ T.setArch(Triple::ppc64le);
+ EXPECT_EQ(Triple::ppc64, T.getBigEndianArchVariant().getArch());
+ EXPECT_EQ(Triple::ppc64le, T.getLittleEndianArchVariant().getArch());
+
+ T.setArch(Triple::sparc);
+ EXPECT_EQ(Triple::sparc, T.getBigEndianArchVariant().getArch());
+ EXPECT_EQ(Triple::sparcel, T.getLittleEndianArchVariant().getArch());
+
+ T.setArch(Triple::sparcel);
+ EXPECT_EQ(Triple::sparc, T.getBigEndianArchVariant().getArch());
+ EXPECT_EQ(Triple::sparcel, T.getLittleEndianArchVariant().getArch());
+
+ T.setArch(Triple::thumb);
+ EXPECT_EQ(Triple::UnknownArch, T.getBigEndianArchVariant().getArch());
+ EXPECT_EQ(Triple::thumb, T.getLittleEndianArchVariant().getArch());
+
+ T.setArch(Triple::thumbeb);
+ EXPECT_EQ(Triple::thumbeb, T.getBigEndianArchVariant().getArch());
+ EXPECT_EQ(Triple::UnknownArch, T.getLittleEndianArchVariant().getArch());
+}
+
TEST(TripleTest, getOSVersion) {
Triple T;
unsigned Major, Minor, Micro;
diff --git a/unittests/ExecutionEngine/MCJIT/MCJITTest.cpp b/unittests/ExecutionEngine/MCJIT/MCJITTest.cpp
index f65ec96d944b..01e796d9a4ea 100644
--- a/unittests/ExecutionEngine/MCJIT/MCJITTest.cpp
+++ b/unittests/ExecutionEngine/MCJIT/MCJITTest.cpp
@@ -13,6 +13,7 @@
//===----------------------------------------------------------------------===//
#include "llvm/ExecutionEngine/MCJIT.h"
+#include "llvm/Support/DynamicLibrary.h"
#include "MCJITTestBase.h"
#include "gtest/gtest.h"
@@ -199,4 +200,85 @@ TEST_F(MCJITTest, multiple_decl_lookups) {
EXPECT_EQ(A, B) << "Repeat calls to getPointerToFunction fail.";
}
+typedef void * (*FunctionHandlerPtr)(const std::string &str);
+
+TEST_F(MCJITTest, lazy_function_creator_pointer) {
+ SKIP_UNSUPPORTED_PLATFORM;
+
+ Function *Foo = insertExternalReferenceToFunction<int32_t(void)>(M.get(),
+ "\1Foo");
+ startFunction<int32_t(void)>(M.get(), "Parent");
+ CallInst *Call = Builder.CreateCall(Foo, {});
+ Builder.CreateRet(Call);
+
+ createJIT(std::move(M));
+
+ // Set up the lazy function creator that records the name of the last
+ // unresolved external function found in the module. Using a function pointer
+ // prevents us from capturing local variables, which is why this is static.
+ static std::string UnresolvedExternal;
+ FunctionHandlerPtr UnresolvedHandler = [] (const std::string &str) {
+ // Try to resolve the function in the current process before marking it as
+ // unresolved. This solves an issue on ARM where '__aeabi_*' function names
+ // are passed to this handler.
+ void *symbol =
+ llvm::sys::DynamicLibrary::SearchForAddressOfSymbol(str.c_str());
+ if (symbol) {
+ return symbol;
+ }
+
+ UnresolvedExternal = str;
+ return (void *)(uintptr_t)-1;
+ };
+ TheJIT->InstallLazyFunctionCreator(UnresolvedHandler);
+
+ // JIT the module.
+ TheJIT->finalizeObject();
+
+ // Verify that our handler was called.
+ EXPECT_EQ(UnresolvedExternal, "Foo");
+}
+
+TEST_F(MCJITTest, lazy_function_creator_lambda) {
+ SKIP_UNSUPPORTED_PLATFORM;
+
+ Function *Foo1 = insertExternalReferenceToFunction<int32_t(void)>(M.get(),
+ "\1Foo1");
+ Function *Foo2 = insertExternalReferenceToFunction<int32_t(void)>(M.get(),
+ "\1Foo2");
+ startFunction<int32_t(void)>(M.get(), "Parent");
+ CallInst *Call1 = Builder.CreateCall(Foo1, {});
+ CallInst *Call2 = Builder.CreateCall(Foo2, {});
+ Value *Result = Builder.CreateAdd(Call1, Call2);
+ Builder.CreateRet(Result);
+
+ createJIT(std::move(M));
+
+ // Set up the lazy function creator that records the name of unresolved
+ // external functions in the module.
+ std::vector<std::string> UnresolvedExternals;
+ auto UnresolvedHandler = [&UnresolvedExternals] (const std::string &str) {
+ // Try to resolve the function in the current process before marking it as
+ // unresolved. This solves an issue on ARM where '__aeabi_*' function names
+ // are passed to this handler.
+ void *symbol =
+ llvm::sys::DynamicLibrary::SearchForAddressOfSymbol(str.c_str());
+ if (symbol) {
+ return symbol;
+ }
+ UnresolvedExternals.push_back(str);
+ return (void *)(uintptr_t)-1;
+ };
+ TheJIT->InstallLazyFunctionCreator(UnresolvedHandler);
+
+ // JIT the module.
+ TheJIT->finalizeObject();
+
+ // Verify that our handler was called for each unresolved function.
+ auto I = UnresolvedExternals.begin(), E = UnresolvedExternals.end();
+ EXPECT_EQ(UnresolvedExternals.size(), 2u);
+ EXPECT_FALSE(std::find(I, E, "Foo1") == E);
+ EXPECT_FALSE(std::find(I, E, "Foo2") == E);
+}
+
}
diff --git a/unittests/IR/IRBuilderTest.cpp b/unittests/IR/IRBuilderTest.cpp
index f3db68feacce..093cbbfc7790 100644
--- a/unittests/IR/IRBuilderTest.cpp
+++ b/unittests/IR/IRBuilderTest.cpp
@@ -130,8 +130,8 @@ TEST_F(IRBuilderTest, GetIntTy) {
TEST_F(IRBuilderTest, FastMathFlags) {
IRBuilder<> Builder(BB);
- Value *F;
- Instruction *FDiv, *FAdd;
+ Value *F, *FC;
+ Instruction *FDiv, *FAdd, *FCmp;
F = Builder.CreateLoad(GV);
F = Builder.CreateFAdd(F, F);
@@ -190,6 +190,24 @@ TEST_F(IRBuilderTest, FastMathFlags) {
Builder.clearFastMathFlags();
+ FC = Builder.CreateFCmpOEQ(F, F);
+ ASSERT_TRUE(isa<Instruction>(FC));
+ FCmp = cast<Instruction>(FC);
+ EXPECT_FALSE(FCmp->hasAllowReciprocal());
+
+ FMF.clear();
+ FMF.setAllowReciprocal();
+ Builder.SetFastMathFlags(FMF);
+
+ FC = Builder.CreateFCmpOEQ(F, F);
+ EXPECT_TRUE(Builder.getFastMathFlags().any());
+ EXPECT_TRUE(Builder.getFastMathFlags().AllowReciprocal);
+ ASSERT_TRUE(isa<Instruction>(FC));
+ FCmp = cast<Instruction>(FC);
+ EXPECT_TRUE(FCmp->hasAllowReciprocal());
+
+ Builder.clearFastMathFlags();
+
// To test a copy, make sure that a '0' and a '1' change state.
F = Builder.CreateFDiv(F, F);
ASSERT_TRUE(isa<Instruction>(F));
diff --git a/utils/TableGen/CodeGenTarget.cpp b/utils/TableGen/CodeGenTarget.cpp
index 47d68fc339ae..661975ecb202 100644
--- a/utils/TableGen/CodeGenTarget.cpp
+++ b/utils/TableGen/CodeGenTarget.cpp
@@ -297,7 +297,7 @@ void CodeGenTarget::ComputeInstrsByEnum() const {
"IMPLICIT_DEF", "SUBREG_TO_REG", "COPY_TO_REGCLASS", "DBG_VALUE",
"REG_SEQUENCE", "COPY", "BUNDLE", "LIFETIME_START",
"LIFETIME_END", "STACKMAP", "PATCHPOINT", "LOAD_STACK_GUARD",
- "STATEPOINT", "FRAME_ALLOC", "FAULTING_LOAD_OP",
+ "STATEPOINT", "LOCAL_ESCAPE", "FAULTING_LOAD_OP",
nullptr};
const auto &Insts = getInstructions();
for (const char *const *p = FixedInstrs; *p; ++p) {
diff --git a/utils/TableGen/FixedLenDecoderEmitter.cpp b/utils/TableGen/FixedLenDecoderEmitter.cpp
index 36a2183fc8b0..c4df2833885a 100644
--- a/utils/TableGen/FixedLenDecoderEmitter.cpp
+++ b/utils/TableGen/FixedLenDecoderEmitter.cpp
@@ -208,7 +208,7 @@ typedef std::vector<bit_value_t> insn_t;
///
/// The Debug output shows the path that the decoding tree follows to reach the
/// conclusion that there is a conflict. VST4q8a is a vst4 to double-spaced
-/// even registers, while VST4q8b is a vst4 to double-spaced odd regsisters.
+/// even registers, while VST4q8b is a vst4 to double-spaced odd registers.
///
/// The encoding info in the .td files does not specify this meta information,
/// which could have been used by the decoder to resolve the conflict. The
diff --git a/utils/TableGen/RegisterInfoEmitter.cpp b/utils/TableGen/RegisterInfoEmitter.cpp
index 7506e91b6f45..9619fb9e356e 100644
--- a/utils/TableGen/RegisterInfoEmitter.cpp
+++ b/utils/TableGen/RegisterInfoEmitter.cpp
@@ -1070,6 +1070,8 @@ RegisterInfoEmitter::runTargetHeader(raw_ostream &OS, CodeGenTarget &Target,
OS << "namespace llvm {\n\n";
+ OS << "class " << TargetName << "FrameLowering;\n\n";
+
OS << "struct " << ClassName << " : public TargetRegisterInfo {\n"
<< " explicit " << ClassName
<< "(unsigned RA, unsigned D = 0, unsigned E = 0, unsigned PC = 0);\n"
@@ -1096,6 +1098,9 @@ RegisterInfoEmitter::runTargetHeader(raw_ostream &OS, CodeGenTarget &Target,
<< "unsigned RegUnit) const override;\n"
<< " ArrayRef<const char *> getRegMaskNames() const override;\n"
<< " ArrayRef<const uint32_t *> getRegMasks() const override;\n"
+ << " /// Devirtualized TargetFrameLowering.\n"
+ << " static const " << TargetName << "FrameLowering *getFrameLowering(\n"
+ << " const MachineFunction &MF);\n"
<< "};\n\n";
const auto &RegisterClasses = RegBank.getRegClasses();
@@ -1467,6 +1472,13 @@ RegisterInfoEmitter::runTargetDesc(raw_ostream &OS, CodeGenTarget &Target,
<< ");\n";
OS << "}\n\n";
+ OS << "const " << TargetName << "FrameLowering *"
+ << TargetName << "GenRegisterInfo::\n"
+ << " getFrameLowering(const MachineFunction &MF) {\n"
+ << " return static_cast<const " << TargetName << "FrameLowering *>(\n"
+ << " MF.getSubtarget().getFrameLowering());\n"
+ << "}\n\n";
+
OS << "} // End llvm namespace\n";
OS << "#endif // GET_REGINFO_TARGET_DESC\n\n";
}
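Editorial note, not part of the patch: reconstructed from the OS << strings above, the new TableGen output for a hypothetical target named Foo would look roughly like the following (approximate shape, not verbatim emitter output):

// Sketch only: approximate generated code for a hypothetical "Foo" target.
class FooFrameLowering;

struct FooGenRegisterInfo : public TargetRegisterInfo {
  // ... existing generated overrides elided ...
  /// Devirtualized TargetFrameLowering.
  static const FooFrameLowering *getFrameLowering(
      const MachineFunction &MF);
};

const FooFrameLowering *FooGenRegisterInfo::
    getFrameLowering(const MachineFunction &MF) {
  return static_cast<const FooFrameLowering *>(
      MF.getSubtarget().getFrameLowering());
}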
diff --git a/utils/TableGen/SubtargetEmitter.cpp b/utils/TableGen/SubtargetEmitter.cpp
index e5d75771cc80..03d7f4e868e7 100644
--- a/utils/TableGen/SubtargetEmitter.cpp
+++ b/utils/TableGen/SubtargetEmitter.cpp
@@ -1435,10 +1435,10 @@ void SubtargetEmitter::run(raw_ostream &OS) {
#endif
// MCInstrInfo initialization routine.
- OS << "static inline void Init" << Target
- << "MCSubtargetInfo(MCSubtargetInfo *II, "
+ OS << "static inline MCSubtargetInfo *create" << Target
+ << "MCSubtargetInfoImpl("
<< "const Triple &TT, StringRef CPU, StringRef FS) {\n";
- OS << " II->InitMCSubtargetInfo(TT, CPU, FS, ";
+ OS << " return new MCSubtargetInfo(TT, CPU, FS, ";
if (NumFeatures)
OS << Target << "FeatureKV, ";
else
@@ -1518,8 +1518,7 @@ void SubtargetEmitter::run(raw_ostream &OS) {
OS << ClassName << "::" << ClassName << "(const Triple &TT, StringRef CPU, "
<< "StringRef FS)\n"
- << " : TargetSubtargetInfo() {\n"
- << " InitMCSubtargetInfo(TT, CPU, FS, ";
+ << " : TargetSubtargetInfo(TT, CPU, FS, ";
if (NumFeatures)
OS << "makeArrayRef(" << Target << "FeatureKV, " << NumFeatures << "), ";
else
@@ -1528,19 +1527,19 @@ void SubtargetEmitter::run(raw_ostream &OS) {
OS << "makeArrayRef(" << Target << "SubTypeKV, " << NumProcs << "), ";
else
OS << "None, ";
- OS << '\n'; OS.indent(22);
+ OS << '\n'; OS.indent(24);
OS << Target << "ProcSchedKV, "
<< Target << "WriteProcResTable, "
<< Target << "WriteLatencyTable, "
<< Target << "ReadAdvanceTable, ";
- OS << '\n'; OS.indent(22);
+ OS << '\n'; OS.indent(24);
if (SchedModels.hasItineraries()) {
OS << Target << "Stages, "
<< Target << "OperandCycles, "
<< Target << "ForwardingPaths";
} else
OS << "0, 0, 0";
- OS << ");\n}\n\n";
+ OS << ") {}\n\n";
EmitSchedModelHelpers(ClassName, OS);
diff --git a/utils/TableGen/X86DisassemblerTables.cpp b/utils/TableGen/X86DisassemblerTables.cpp
index f59652c79e74..ad36dc427a56 100644
--- a/utils/TableGen/X86DisassemblerTables.cpp
+++ b/utils/TableGen/X86DisassemblerTables.cpp
@@ -243,6 +243,9 @@ static inline bool inheritsFrom(InstructionContext child,
case IC_EVEX_OPSIZE_KZ_B:
return false;
case IC_EVEX_W_K:
+ case IC_EVEX_W_B:
+ case IC_EVEX_W_K_B:
+ case IC_EVEX_W_KZ_B:
case IC_EVEX_W_XS_K:
case IC_EVEX_W_XD_K:
case IC_EVEX_W_OPSIZE_K:
@@ -252,6 +255,8 @@ static inline bool inheritsFrom(InstructionContext child,
case IC_EVEX_L_K:
case IC_EVEX_L_XS_K:
case IC_EVEX_L_XD_K:
+ case IC_EVEX_L_XD_B:
+ case IC_EVEX_L_XD_K_B:
case IC_EVEX_L_OPSIZE_K:
case IC_EVEX_L_OPSIZE_B:
case IC_EVEX_L_OPSIZE_K_B:
@@ -272,21 +277,30 @@ static inline bool inheritsFrom(InstructionContext child,
case IC_EVEX_L_XS_KZ:
case IC_EVEX_L_XS_B:
case IC_EVEX_L_XS_K_B:
+ case IC_EVEX_L_XS_KZ_B:
case IC_EVEX_L_XD_KZ:
+ case IC_EVEX_L_XD_KZ_B:
case IC_EVEX_L_OPSIZE_KZ:
case IC_EVEX_L_OPSIZE_KZ_B:
return false;
case IC_EVEX_L_W_K:
+ case IC_EVEX_L_W_B:
+ case IC_EVEX_L_W_K_B:
case IC_EVEX_L_W_XS_K:
case IC_EVEX_L_W_XS_B:
case IC_EVEX_L_W_XS_K_B:
- case IC_EVEX_L_W_XD_K:
+ case IC_EVEX_L_W_XS_KZ:
+ case IC_EVEX_L_W_XS_KZ_B:
case IC_EVEX_L_W_OPSIZE_K:
case IC_EVEX_L_W_OPSIZE_B:
case IC_EVEX_L_W_OPSIZE_K_B:
case IC_EVEX_L_W_KZ:
- case IC_EVEX_L_W_XS_KZ:
+ case IC_EVEX_L_W_KZ_B:
+ case IC_EVEX_L_W_XD_K:
+ case IC_EVEX_L_W_XD_B:
+ case IC_EVEX_L_W_XD_K_B:
case IC_EVEX_L_W_XD_KZ:
+ case IC_EVEX_L_W_XD_KZ_B:
case IC_EVEX_L_W_OPSIZE_KZ:
case IC_EVEX_L_W_OPSIZE_KZ_B:
return false;
@@ -299,17 +313,22 @@ static inline bool inheritsFrom(InstructionContext child,
case IC_EVEX_L2_XS_B:
case IC_EVEX_L2_XD_B:
case IC_EVEX_L2_XD_K:
+ case IC_EVEX_L2_XD_K_B:
case IC_EVEX_L2_OPSIZE_K:
case IC_EVEX_L2_OPSIZE_B:
case IC_EVEX_L2_OPSIZE_K_B:
case IC_EVEX_L2_KZ:
case IC_EVEX_L2_XS_KZ:
+ case IC_EVEX_L2_XS_KZ_B:
case IC_EVEX_L2_XD_KZ:
+ case IC_EVEX_L2_XD_KZ_B:
case IC_EVEX_L2_OPSIZE_KZ:
case IC_EVEX_L2_OPSIZE_KZ_B:
return false;
case IC_EVEX_L2_W_K:
case IC_EVEX_L2_W_B:
+ case IC_EVEX_L2_W_K_B:
+ case IC_EVEX_L2_W_KZ_B:
case IC_EVEX_L2_W_XS_K:
case IC_EVEX_L2_W_XS_B:
case IC_EVEX_L2_W_XS_K_B:
@@ -320,7 +339,10 @@ static inline bool inheritsFrom(InstructionContext child,
case IC_EVEX_L2_W_OPSIZE_K_B:
case IC_EVEX_L2_W_KZ:
case IC_EVEX_L2_W_XS_KZ:
+ case IC_EVEX_L2_W_XS_KZ_B:
case IC_EVEX_L2_W_XD_KZ:
+ case IC_EVEX_L2_W_XD_K_B:
+ case IC_EVEX_L2_W_XD_KZ_B:
case IC_EVEX_L2_W_OPSIZE_KZ:
case IC_EVEX_L2_W_OPSIZE_KZ_B:
return false;
diff --git a/utils/release/test-release.sh b/utils/release/test-release.sh
index 04127c8eb982..ee87b3f1987f 100755
--- a/utils/release/test-release.sh
+++ b/utils/release/test-release.sh
@@ -29,7 +29,6 @@ RC=""
Triple=""
use_gzip="no"
do_checkout="yes"
-do_clang="yes"
do_64bit="yes"
do_debug="no"
do_asserts="no"
@@ -48,7 +47,6 @@ function usage() {
echo " -build-dir DIR Directory to perform testing in. [default: pwd]"
echo " -no-checkout Don't checkout the sources from SVN."
echo " -no-64bit Don't test the 64-bit version. [default: yes]"
- echo " -disable-clang Do not test clang. [default: enable]"
echo " -test-debug Test the debug build. [default: no]"
echo " -test-asserts Test with asserts on. [default: no]"
echo " -no-compare-files Don't test that phase 2 and 3 files are identical."
@@ -96,9 +94,6 @@ while [ $# -gt 0 ]; do
-no-64bit | --no-64bit )
do_64bit="no"
;;
- -disable-clang | --disable-clang )
- do_clang="no"
- ;;
-test-debug | --test-debug )
do_debug="yes"
;;
@@ -236,7 +231,6 @@ function configure_llvmCore() {
Phase="$1"
Flavor="$2"
ObjDir="$3"
- InstallDir="$4"
case $Flavor in
Release | Release-64 )
@@ -265,13 +259,13 @@ function configure_llvmCore() {
cd $ObjDir
echo "# Configuring llvm $Release-$RC $Flavor"
- echo "# $BuildDir/llvm.src/configure --prefix=$InstallDir \
+ echo "# $BuildDir/llvm.src/configure \
--enable-optimized=$Optimized \
--enable-assertions=$Assertions \
--disable-timestamps \
$build_triple_option"
env CC="$c_compiler" CXX="$cxx_compiler" \
- $BuildDir/llvm.src/configure --prefix=$InstallDir \
+ $BuildDir/llvm.src/configure \
--enable-optimized=$Optimized \
--enable-assertions=$Assertions \
--disable-timestamps \
@@ -284,6 +278,7 @@ function build_llvmCore() {
Phase="$1"
Flavor="$2"
ObjDir="$3"
+ DestDir="$4"
ExtraOpts=""
if [ "$Flavor" = "Release-64" ]; then
@@ -299,6 +294,7 @@ function build_llvmCore() {
echo "# Installing llvm $Release-$RC $Flavor"
echo "# ${MAKE} install"
${MAKE} install \
+ DESTDIR="${DestDir}" \
2>&1 | tee $LogDir/llvm.install-Phase$Phase-$Flavor.log
cd $BuildDir
}
@@ -348,7 +344,11 @@ function package_release() {
cd $cwd
}
-set -e # Exit if any command fails
+# Exit if any command fails
+# Note: pipefail is necessary for running build commands through
+# a pipe (i.e. it changes the output of ``false | tee /dev/null ; echo $?``)
+set -e
+set -o pipefail
if [ "$do_checkout" = "yes" ]; then
export_sources
@@ -381,89 +381,77 @@ for Flavor in $Flavors ; do
cxx_compiler="$CXX"
llvmCore_phase1_objdir=$BuildDir/Phase1/$Flavor/llvmCore-$Release-$RC.obj
- llvmCore_phase1_installdir=$BuildDir/Phase1/$Flavor/llvmCore-$Release-$RC.install
+ llvmCore_phase1_destdir=$BuildDir/Phase1/$Flavor/llvmCore-$Release-$RC.install
llvmCore_phase2_objdir=$BuildDir/Phase2/$Flavor/llvmCore-$Release-$RC.obj
- llvmCore_phase2_installdir=$BuildDir/Phase2/$Flavor/llvmCore-$Release-$RC.install
+ llvmCore_phase2_destdir=$BuildDir/Phase2/$Flavor/llvmCore-$Release-$RC.install
llvmCore_phase3_objdir=$BuildDir/Phase3/$Flavor/llvmCore-$Release-$RC.obj
- llvmCore_phase3_installdir=$BuildDir/Phase3/$Flavor/llvmCore-$Release-$RC.install
+ llvmCore_phase3_destdir=$BuildDir/Phase3/$Flavor/llvmCore-$Release-$RC.install
rm -rf $llvmCore_phase1_objdir
- rm -rf $llvmCore_phase1_installdir
+ rm -rf $llvmCore_phase1_destdir
rm -rf $llvmCore_phase2_objdir
- rm -rf $llvmCore_phase2_installdir
+ rm -rf $llvmCore_phase2_destdir
rm -rf $llvmCore_phase3_objdir
- rm -rf $llvmCore_phase3_installdir
+ rm -rf $llvmCore_phase3_destdir
mkdir -p $llvmCore_phase1_objdir
- mkdir -p $llvmCore_phase1_installdir
+ mkdir -p $llvmCore_phase1_destdir
mkdir -p $llvmCore_phase2_objdir
- mkdir -p $llvmCore_phase2_installdir
+ mkdir -p $llvmCore_phase2_destdir
mkdir -p $llvmCore_phase3_objdir
- mkdir -p $llvmCore_phase3_installdir
+ mkdir -p $llvmCore_phase3_destdir
############################################################################
# Phase 1: Build llvmCore and clang
echo "# Phase 1: Building llvmCore"
- configure_llvmCore 1 $Flavor \
- $llvmCore_phase1_objdir $llvmCore_phase1_installdir
+ configure_llvmCore 1 $Flavor $llvmCore_phase1_objdir
build_llvmCore 1 $Flavor \
- $llvmCore_phase1_objdir
- clean_RPATH $llvmCore_phase1_installdir
-
- # Test clang
- if [ "$do_clang" = "yes" ]; then
- ########################################################################
- # Phase 2: Build llvmCore with newly built clang from phase 1.
- c_compiler=$llvmCore_phase1_installdir/bin/clang
- cxx_compiler=$llvmCore_phase1_installdir/bin/clang++
- echo "# Phase 2: Building llvmCore"
- configure_llvmCore 2 $Flavor \
- $llvmCore_phase2_objdir $llvmCore_phase2_installdir
- build_llvmCore 2 $Flavor \
- $llvmCore_phase2_objdir
- clean_RPATH $llvmCore_phase2_installdir
-
- ########################################################################
- # Phase 3: Build llvmCore with newly built clang from phase 2.
- c_compiler=$llvmCore_phase2_installdir/bin/clang
- cxx_compiler=$llvmCore_phase2_installdir/bin/clang++
- echo "# Phase 3: Building llvmCore"
- configure_llvmCore 3 $Flavor \
- $llvmCore_phase3_objdir $llvmCore_phase3_installdir
- build_llvmCore 3 $Flavor \
- $llvmCore_phase3_objdir
- clean_RPATH $llvmCore_phase3_installdir
-
- ########################################################################
- # Testing: Test phase 3
- echo "# Testing - built with clang"
- test_llvmCore 3 $Flavor $llvmCore_phase3_objdir
-
- ########################################################################
- # Compare .o files between Phase2 and Phase3 and report which ones
- # differ.
- if [ "$do_compare" = "yes" ]; then
- echo
- echo "# Comparing Phase 2 and Phase 3 files"
- for o in `find $llvmCore_phase2_objdir -name '*.o'` ; do
- p3=`echo $o | sed -e 's,Phase2,Phase3,'`
- if ! cmp --ignore-initial=16 $o $p3 > /dev/null 2>&1 ; then
- echo "file `basename $o` differs between phase 2 and phase 3"
- fi
- done
- fi
- fi
-
- # Otherwise just test the core.
- if [ "$do_clang" != "yes" ]; then
- echo "# Testing - built with system compiler"
- test_llvmCore 1 $Flavor $llvmCore_phase1_objdir
+ $llvmCore_phase1_objdir $llvmCore_phase1_destdir
+ clean_RPATH $llvmCore_phase1_destdir/usr/local
+
+ ########################################################################
+ # Phase 2: Build llvmCore with newly built clang from phase 1.
+ c_compiler=$llvmCore_phase1_destdir/usr/local/bin/clang
+ cxx_compiler=$llvmCore_phase1_destdir/usr/local/bin/clang++
+ echo "# Phase 2: Building llvmCore"
+ configure_llvmCore 2 $Flavor $llvmCore_phase2_objdir
+ build_llvmCore 2 $Flavor \
+ $llvmCore_phase2_objdir $llvmCore_phase2_destdir
+ clean_RPATH $llvmCore_phase2_destdir/usr/local
+
+ ########################################################################
+ # Phase 3: Build llvmCore with newly built clang from phase 2.
+ c_compiler=$llvmCore_phase2_destdir/usr/local/bin/clang
+ cxx_compiler=$llvmCore_phase2_destdir/usr/local/bin/clang++
+ echo "# Phase 3: Building llvmCore"
+ configure_llvmCore 3 $Flavor $llvmCore_phase3_objdir
+ build_llvmCore 3 $Flavor \
+ $llvmCore_phase3_objdir $llvmCore_phase3_destdir
+ clean_RPATH $llvmCore_phase3_destdir/usr/local
+
+ ########################################################################
+ # Testing: Test phase 3
+ echo "# Testing - built with clang"
+ test_llvmCore 3 $Flavor $llvmCore_phase3_objdir
+
+ ########################################################################
+ # Compare .o files between Phase2 and Phase3 and report which ones
+ # differ.
+ if [ "$do_compare" = "yes" ]; then
+ echo
+ echo "# Comparing Phase 2 and Phase 3 files"
+ for o in `find $llvmCore_phase2_objdir -name '*.o'` ; do
+ p3=`echo $o | sed -e 's,Phase2,Phase3,'`
+ if ! cmp --ignore-initial=16 $o $p3 > /dev/null 2>&1 ; then
+ echo "file `basename $o` differs between phase 2 and phase 3"
+ fi
+ done
fi
done
) 2>&1 | tee $LogDir/testing.$Release-$RC.log
diff --git a/utils/unittest/UnitTestMain/TestMain.cpp b/utils/unittest/UnitTestMain/TestMain.cpp
index f5b09a5cf673..fb2b0f16ee3f 100644
--- a/utils/unittest/UnitTestMain/TestMain.cpp
+++ b/utils/unittest/UnitTestMain/TestMain.cpp
@@ -7,13 +7,12 @@
//
//===----------------------------------------------------------------------===//
-#include "llvm/Config/config.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Signals.h"
#include "gtest/gtest.h"
-#if defined(LLVM_ON_WIN32)
+#if defined(_WIN32)
# include <windows.h>
# if defined(_MSC_VER)
# include <crtdbg.h>
@@ -30,7 +29,7 @@ int main(int argc, char **argv) {
// Make it easy for a test to re-execute itself by saving argv[0].
TestMainArgv0 = argv[0];
-# if defined(LLVM_ON_WIN32)
+# if defined(_WIN32)
// Disable all of the possible ways Windows conspires to make automated
// testing impossible.
::SetErrorMode(SEM_FAILCRITICALERRORS | SEM_NOGPFAULTERRORBOX);