182 files changed, 3774 insertions, 1315 deletions
diff --git a/CMakeLists.txt b/CMakeLists.txt index 3194197d7a16..ac3b978ee6f5 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -61,7 +61,7 @@ set(CMAKE_MODULE_PATH set(LLVM_VERSION_MAJOR 3) set(LLVM_VERSION_MINOR 7) set(LLVM_VERSION_PATCH 0) -set(LLVM_VERSION_SUFFIX svn) +set(LLVM_VERSION_SUFFIX "") if (NOT PACKAGE_VERSION) set(PACKAGE_VERSION @@ -518,7 +518,7 @@ if (APPLE) else(UNIX) if(NOT DEFINED CMAKE_INSTALL_RPATH) set(CMAKE_INSTALL_RPATH "\$ORIGIN/../lib${LLVM_LIBDIR_SUFFIX}") - if (${CMAKE_SYSTEM_NAME} MATCHES FreeBSD) + if(${CMAKE_SYSTEM_NAME} MATCHES "(FreeBSD|DragonFly)") set(CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} -Wl,-z,origin") set(CMAKE_SHARED_LINKER_FLAGS "${CMAKE_SHARED_LINKER_FLAGS} -Wl,-z,origin") endif() @@ -544,12 +544,12 @@ if(LLVM_USE_HOST_TOOLS) include(CrossCompile) endif(LLVM_USE_HOST_TOOLS) -if( ${CMAKE_SYSTEM_NAME} MATCHES FreeBSD ) +if(${CMAKE_SYSTEM_NAME} MATCHES "(FreeBSD|DragonFly)") # On FreeBSD, /usr/local/* is not used by default. In order to build LLVM # with libxml2, iconv.h, etc., we must add /usr/local paths. include_directories("/usr/local/include") link_directories("/usr/local/lib") -endif( ${CMAKE_SYSTEM_NAME} MATCHES FreeBSD ) +endif(${CMAKE_SYSTEM_NAME} MATCHES "(FreeBSD|DragonFly)") if( ${CMAKE_SYSTEM_NAME} MATCHES SunOS ) SET(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -include llvm/Support/Solaris.h") diff --git a/CREDITS.TXT b/CREDITS.TXT index da1fb010e35b..7cdd97c309a6 100644 --- a/CREDITS.TXT +++ b/CREDITS.TXT @@ -465,3 +465,47 @@ N: Bob Wilson E: bob.wilson@acm.org D: Advanced SIMD (NEON) support in the ARM backend. +N: Alexey Bataev +E: a.bataev@hotmail.com +D: Clang OpenMP implementation + +N: Andrey Bokhanko +E: andreybokhanko@gmail.com +D: Clang OpenMP implementation + +N: Carlo Bertolli +E: cbertol@us.ibm.com +D: Clang OpenMP implementation + +N: Eric Stotzer +E: estotzer@ti.com +D: Clang OpenMP implementation + +N: Kelvin Li +E: kkwli0@gmail.com +D: Clang OpenMP implementation + +N: Samuel Antao +E: sfantao@us.ibm.com +D: Clang OpenMP implementation + +N: Sergey Ostanevich +E: sergos.gnu@gmail.com +D: Clang OpenMP implementation + +N: Alexandre Eichenberger +E: alexe@us.ibm.com +D: Clang OpenMP implementation + +N: Guansong Zhang +E: guansong.zhang@amd.com +D: Clang OpenMP implementation + +N: Sunita Chandrasekaran +E: sunisg123@gmail.com +D: Clang OpenMP implementation + +N: Michael Wong +E: fraggamuffin@gmail.com +D: Clang OpenMP implementation + diff --git a/Makefile.config.in b/Makefile.config.in index 3258714ac590..9df9834f4dbe 100644 --- a/Makefile.config.in +++ b/Makefile.config.in @@ -58,7 +58,7 @@ LLVM_OBJ_ROOT := $(call realpath, @abs_top_builddir@) PROJ_SRC_ROOT := $(LLVM_SRC_ROOT) PROJ_SRC_DIR := $(LLVM_SRC_ROOT)$(patsubst $(PROJ_OBJ_ROOT)%,%,$(PROJ_OBJ_DIR)) -# See: http://lists.cs.uiuc.edu/pipermail/llvm-commits/Week-of-Mon-20150323/268067.html +# See: http://lists.llvm.org/pipermail/llvm-commits/Week-of-Mon-20150323/268067.html ifeq ($(LLVM_SRC_ROOT), $(LLVM_OBJ_ROOT)) $(error In-source builds are not allowed. Please configure from a separate build directory!) endif diff --git a/autoconf/configure.ac b/autoconf/configure.ac index d6778ac02503..74ebea2f5a7a 100644 --- a/autoconf/configure.ac +++ b/autoconf/configure.ac @@ -32,12 +32,12 @@ dnl===-----------------------------------------------------------------------=== dnl Initialize autoconf and define the package name, version number and dnl address for reporting bugs. 
-AC_INIT([LLVM],[3.7.0svn],[http://llvm.org/bugs/]) +AC_INIT([LLVM],[3.7.0],[http://llvm.org/bugs/]) LLVM_VERSION_MAJOR=3 LLVM_VERSION_MINOR=7 LLVM_VERSION_PATCH=0 -LLVM_VERSION_SUFFIX=svn +LLVM_VERSION_SUFFIX= AC_DEFINE_UNQUOTED([LLVM_VERSION_MAJOR], $LLVM_VERSION_MAJOR, [Major version of the LLVM API]) AC_DEFINE_UNQUOTED([LLVM_VERSION_MINOR], $LLVM_VERSION_MINOR, [Minor version of the LLVM API]) diff --git a/cmake/modules/HandleLLVMOptions.cmake b/cmake/modules/HandleLLVMOptions.cmake index 9f5a3a0a1bca..4db27033b203 100644 --- a/cmake/modules/HandleLLVMOptions.cmake +++ b/cmake/modules/HandleLLVMOptions.cmake @@ -131,7 +131,7 @@ endif() # Pass -Wl,-z,defs. This makes sure all symbols are defined. Otherwise a DSO # build might work on ELF but fail on MachO/COFF. -if(NOT (${CMAKE_SYSTEM_NAME} MATCHES "Darwin" OR WIN32 OR +if(NOT (${CMAKE_SYSTEM_NAME} MATCHES "Darwin" OR WIN32 OR CYGWIN OR ${CMAKE_SYSTEM_NAME} MATCHES "FreeBSD") AND NOT LLVM_USE_SANITIZER) set(CMAKE_SHARED_LINKER_FLAGS "${CMAKE_SHARED_LINKER_FLAGS} -Wl,-z,defs") diff --git a/configure b/configure index a5acfde32f2a..c562f830b3ae 100755 --- a/configure +++ b/configure @@ -1,6 +1,6 @@ #! /bin/sh # Guess values for system-dependent variables and create Makefiles. -# Generated by GNU Autoconf 2.60 for LLVM 3.7.0svn. +# Generated by GNU Autoconf 2.60 for LLVM 3.7.0. # # Report bugs to <http://llvm.org/bugs/>. # @@ -561,8 +561,8 @@ SHELL=${CONFIG_SHELL-/bin/sh} # Identity of this package. PACKAGE_NAME='LLVM' PACKAGE_TARNAME='llvm' -PACKAGE_VERSION='3.7.0svn' -PACKAGE_STRING='LLVM 3.7.0svn' +PACKAGE_VERSION='3.7.0' +PACKAGE_STRING='LLVM 3.7.0' PACKAGE_BUGREPORT='http://llvm.org/bugs/' ac_unique_file="lib/IR/Module.cpp" @@ -1333,7 +1333,7 @@ if test "$ac_init_help" = "long"; then # Omit some internal or obsolete options to make the list less imposing. # This message is too long to be a string in the A/UX 3.1 sh. cat <<_ACEOF -\`configure' configures LLVM 3.7.0svn to adapt to many kinds of systems. +\`configure' configures LLVM 3.7.0 to adapt to many kinds of systems. Usage: $0 [OPTION]... [VAR=VALUE]... @@ -1399,7 +1399,7 @@ fi if test -n "$ac_init_help"; then case $ac_init_help in - short | recursive ) echo "Configuration of LLVM 3.7.0svn:";; + short | recursive ) echo "Configuration of LLVM 3.7.0:";; esac cat <<\_ACEOF @@ -1583,7 +1583,7 @@ fi test -n "$ac_init_help" && exit $ac_status if $ac_init_version; then cat <<\_ACEOF -LLVM configure 3.7.0svn +LLVM configure 3.7.0 generated by GNU Autoconf 2.60 Copyright (C) 1992, 1993, 1994, 1995, 1996, 1998, 1999, 2000, 2001, @@ -1599,7 +1599,7 @@ cat >config.log <<_ACEOF This file contains any messages produced by compilers while running configure, to aid debugging if configure makes a mistake. -It was created by LLVM $as_me 3.7.0svn, which was +It was created by LLVM $as_me 3.7.0, which was generated by GNU Autoconf 2.60. Invocation command line was $ $0 $@ @@ -1956,7 +1956,7 @@ ac_compiler_gnu=$ac_cv_c_compiler_gnu LLVM_VERSION_MAJOR=3 LLVM_VERSION_MINOR=7 LLVM_VERSION_PATCH=0 -LLVM_VERSION_SUFFIX=svn +LLVM_VERSION_SUFFIX= cat >>confdefs.h <<_ACEOF @@ -18610,7 +18610,7 @@ exec 6>&1 # report actual input values of CONFIG_FILES etc. instead of their # values after options handling. ac_log=" -This file was extended by LLVM $as_me 3.7.0svn, which was +This file was extended by LLVM $as_me 3.7.0, which was generated by GNU Autoconf 2.60. Invocation command line was CONFIG_FILES = $CONFIG_FILES @@ -18663,7 +18663,7 @@ Report bugs to <bug-autoconf@gnu.org>." 
_ACEOF cat >>$CONFIG_STATUS <<_ACEOF ac_cs_version="\\ -LLVM config.status 3.7.0svn +LLVM config.status 3.7.0 configured by $0, generated by GNU Autoconf 2.60, with options \\"`echo "$ac_configure_args" | sed 's/^ //; s/[\\""\`\$]/\\\\&/g'`\\" diff --git a/docs/Atomics.rst b/docs/Atomics.rst index 6c8303b2830d..9068df46b023 100644 --- a/docs/Atomics.rst +++ b/docs/Atomics.rst @@ -173,7 +173,7 @@ Notes for code generation also expected to generate an i8 store as an i8 store, and not an instruction which writes to surrounding bytes. (If you are writing a backend for an architecture which cannot satisfy these restrictions and cares about - concurrency, please send an email to llvmdev.) + concurrency, please send an email to llvm-dev.) Unordered --------- diff --git a/docs/CMake.rst b/docs/CMake.rst index b6dd83850ff2..909fc04248c7 100644 --- a/docs/CMake.rst +++ b/docs/CMake.rst @@ -387,6 +387,10 @@ LLVM-specific variables ``-DLLVM_ENABLE_DOXYGEN_QT_HELP=ON``; otherwise this has no effect. +**LLVM_DOXYGEN_SVG**:BOOL + Uses .svg files instead of .png files for graphs in the Doxygen output. + Defaults to OFF. + **LLVM_ENABLE_SPHINX**:BOOL If enabled CMake will search for the ``sphinx-build`` executable and will make the ``SPHINX_OUTPUT_HTML`` and ``SPHINX_OUTPUT_MAN`` CMake options available. diff --git a/docs/CMakeLists.txt b/docs/CMakeLists.txt index da27627f07e9..2388a92d39ef 100644 --- a/docs/CMakeLists.txt +++ b/docs/CMakeLists.txt @@ -56,6 +56,14 @@ if (LLVM_ENABLE_DOXYGEN) set(llvm_doxygen_qhp_cust_filter_attrs "") endif() + option(LLVM_DOXYGEN_SVG + "Use svg instead of png files for doxygen graphs." OFF) + if (LLVM_DOXYGEN_SVG) + set(DOT_IMAGE_FORMAT "svg") + else() + set(DOT_IMAGE_FORMAT "png") + endif() + configure_file(${CMAKE_CURRENT_SOURCE_DIR}/doxygen.cfg.in ${CMAKE_CURRENT_BINARY_DIR}/doxygen.cfg @ONLY) @@ -73,6 +81,7 @@ if (LLVM_ENABLE_DOXYGEN) set(llvm_doxygen_qhelpgenerator_path) set(llvm_doxygen_qhp_cust_filter_name) set(llvm_doxygen_qhp_cust_filter_attrs) + set(DOT_IMAGE_FORMAT) add_custom_target(doxygen-llvm COMMAND ${DOXYGEN_EXECUTABLE} ${CMAKE_CURRENT_BINARY_DIR}/doxygen.cfg diff --git a/docs/CodeGenerator.rst b/docs/CodeGenerator.rst index 516031dd7bcc..03f5cbd726d8 100644 --- a/docs/CodeGenerator.rst +++ b/docs/CodeGenerator.rst @@ -1814,6 +1814,7 @@ Here is the table: :raw-html:`<th>SystemZ</th>` :raw-html:`<th>X86</th>` :raw-html:`<th>XCore</th>` +:raw-html:`<th>eBPF</th>` :raw-html:`</tr>` :raw-html:`<tr>` @@ -1828,6 +1829,7 @@ Here is the table: :raw-html:`<td class="yes"></td> <!-- SystemZ -->` :raw-html:`<td class="yes"></td> <!-- X86 -->` :raw-html:`<td class="yes"></td> <!-- XCore -->` +:raw-html:`<td class="yes"></td> <!-- eBPF -->` :raw-html:`</tr>` :raw-html:`<tr>` @@ -1842,6 +1844,7 @@ Here is the table: :raw-html:`<td class="yes"></td> <!-- SystemZ -->` :raw-html:`<td class="yes"></td> <!-- X86 -->` :raw-html:`<td class="no"></td> <!-- XCore -->` +:raw-html:`<td class="no"></td> <!-- eBPF -->` :raw-html:`</tr>` :raw-html:`<tr>` @@ -1856,6 +1859,7 @@ Here is the table: :raw-html:`<td class="no"></td> <!-- Sparc -->` :raw-html:`<td class="yes"></td> <!-- X86 -->` :raw-html:`<td class="yes"></td> <!-- XCore -->` +:raw-html:`<td class="yes"></td> <!-- eBPF -->` :raw-html:`</tr>` :raw-html:`<tr>` @@ -1870,6 +1874,7 @@ Here is the table: :raw-html:`<td class="yes"></td> <!-- SystemZ -->` :raw-html:`<td class="yes"></td> <!-- X86 -->` :raw-html:`<td class="yes"></td> <!-- XCore -->` +:raw-html:`<td class="no"></td> <!-- eBPF -->` :raw-html:`</tr>` :raw-html:`<tr>` 
@@ -1884,6 +1889,7 @@ Here is the table: :raw-html:`<td class="yes"></td> <!-- SystemZ -->` :raw-html:`<td class="yes"></td> <!-- X86 -->` :raw-html:`<td class="no"></td> <!-- XCore -->` +:raw-html:`<td class="yes"></td> <!-- eBPF -->` :raw-html:`</tr>` :raw-html:`<tr>` @@ -1898,6 +1904,7 @@ Here is the table: :raw-html:`<td class="yes"></td> <!-- SystemZ -->` :raw-html:`<td class="yes"></td> <!-- X86 -->` :raw-html:`<td class="no"></td> <!-- XCore -->` +:raw-html:`<td class="yes"></td> <!-- eBPF -->` :raw-html:`</tr>` :raw-html:`<tr>` @@ -1912,6 +1919,7 @@ Here is the table: :raw-html:`<td class="no"></td> <!-- SystemZ -->` :raw-html:`<td class="yes"></td> <!-- X86 -->` :raw-html:`<td class="no"></td> <!-- XCore -->` +:raw-html:`<td class="no"></td> <!-- eBPF -->` :raw-html:`</tr>` :raw-html:`<tr>` @@ -1926,6 +1934,7 @@ Here is the table: :raw-html:`<td class="no"></td> <!-- SystemZ -->` :raw-html:`<td class="partial"><a href="#feat_segstacks_x86">*</a></td> <!-- X86 -->` :raw-html:`<td class="no"></td> <!-- XCore -->` +:raw-html:`<td class="no"></td> <!-- eBPF -->` :raw-html:`</tr>` :raw-html:`</table>` @@ -2448,3 +2457,191 @@ Code Generator Options: :raw-html:`</tr>` :raw-html:`</table>` +The extended Berkeley Packet Filter (eBPF) backend +-------------------------------------------------- + +Extended BPF (or eBPF) is similar to the original ("classic") BPF (cBPF) used +to filter network packets. The +`bpf() system call <http://man7.org/linux/man-pages/man2/bpf.2.html>`_ +performs a range of operations related to eBPF. For both cBPF and eBPF +programs, the Linux kernel statically analyzes the programs before loading +them, in order to ensure that they cannot harm the running system. eBPF is +a 64-bit RISC instruction set designed for one to one mapping to 64-bit CPUs. +Opcodes are 8-bit encoded, and 87 instructions are defined. There are 10 +registers, grouped by function as outlined below. + +:: + + R0 return value from in-kernel functions; exit value for eBPF program + R1 - R5 function call arguments to in-kernel functions + R6 - R9 callee-saved registers preserved by in-kernel functions + R10 stack frame pointer (read only) + +Instruction encoding (arithmetic and jump) +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +eBPF is reusing most of the opcode encoding from classic to simplify conversion +of classic BPF to eBPF. 
For arithmetic and jump instructions, the 8-bit 'code' +field is divided into three parts: + +:: + + +----------------+--------+--------------------+ + | 4 bits | 1 bit | 3 bits | + | operation code | source | instruction class | + +----------------+--------+--------------------+ + (MSB) (LSB) + +The three LSB bits store the instruction class, which is one of: + +:: + + BPF_LD 0x0 + BPF_LDX 0x1 + BPF_ST 0x2 + BPF_STX 0x3 + BPF_ALU 0x4 + BPF_JMP 0x5 + (unused) 0x6 + BPF_ALU64 0x7 + +When BPF_CLASS(code) == BPF_ALU or BPF_ALU64 or BPF_JMP, +the 4th bit encodes the source operand + +:: + + BPF_K 0x0 use 32 bit immediate as source operand + BPF_X 0x1 use src_reg register as source operand + +and the four MSB bits store the operation code + +:: + + BPF_ADD 0x0 add + BPF_SUB 0x1 subtract + BPF_MUL 0x2 multiply + BPF_DIV 0x3 divide + BPF_OR 0x4 bitwise logical OR + BPF_AND 0x5 bitwise logical AND + BPF_LSH 0x6 left shift + BPF_RSH 0x7 right shift (zero extended) + BPF_NEG 0x8 arithmetic negation + BPF_MOD 0x9 modulo + BPF_XOR 0xa bitwise logical XOR + BPF_MOV 0xb move register to register + BPF_ARSH 0xc right shift (sign extended) + BPF_END 0xd endianness conversion + +If BPF_CLASS(code) == BPF_JMP, BPF_OP(code) is one of + +:: + + BPF_JA 0x0 unconditional jump + BPF_JEQ 0x1 jump == + BPF_JGT 0x2 jump > + BPF_JGE 0x3 jump >= + BPF_JSET 0x4 jump if (DST & SRC) + BPF_JNE 0x5 jump != + BPF_JSGT 0x6 jump signed > + BPF_JSGE 0x7 jump signed >= + BPF_CALL 0x8 function call + BPF_EXIT 0x9 function return + +Instruction encoding (load, store) +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +For load and store instructions the 8-bit 'code' field is divided as: + +:: + + +--------+--------+-------------------+ + | 3 bits | 2 bits | 3 bits | + | mode | size | instruction class | + +--------+--------+-------------------+ + (MSB) (LSB) + +The size modifier is one of + +:: + + BPF_W 0x0 word + BPF_H 0x1 half word + BPF_B 0x2 byte + BPF_DW 0x3 double word + +The mode modifier is one of + +:: + + BPF_IMM 0x0 immediate + BPF_ABS 0x1 used to access packet data + BPF_IND 0x2 used to access packet data + BPF_MEM 0x3 memory + (reserved) 0x4 + (reserved) 0x5 + BPF_XADD 0x6 exclusive add
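To make the field layouts above concrete, here is a minimal C++ sketch (an editorial illustration, not code from this patch) that decodes the 8-bit 'code' byte of an arithmetic or jump instruction; the masks follow the field widths in the tables, and the helper names merely mirror the kernel's BPF_CLASS/BPF_SRC/BPF_OP macros::

  #include <cstdint>
  #include <cstdio>

  // Low 3 bits: instruction class; bit 3: source operand (BPF_K = 0,
  // BPF_X = 1); high 4 bits: operation code.
  constexpr std::uint8_t bpfClass(std::uint8_t Code) { return Code & 0x07; }
  constexpr std::uint8_t bpfSrc(std::uint8_t Code) { return (Code >> 3) & 0x1; }
  constexpr std::uint8_t bpfOp(std::uint8_t Code) { return Code >> 4; }

  int main() {
    // 0x0f = BPF_ADD | BPF_X | BPF_ALU64, a 64-bit register-to-register add.
    std::uint8_t Code = 0x0f;
    std::printf("op=%#x src=%#x class=%#x\n", unsigned(bpfOp(Code)),
                unsigned(bpfSrc(Code)), unsigned(bpfClass(Code)));
    return 0;
  }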
+ + +Packet data access (BPF_ABS, BPF_IND) +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +Two non-generic instructions, (BPF_ABS | <size> | BPF_LD) and +(BPF_IND | <size> | BPF_LD), are used to access packet data. +Register R6 is an implicit input that must contain a pointer to sk_buff. +Register R0 is an implicit output which contains the data fetched +from the packet. Registers R1-R5 are scratch registers and must not +be used to store the data across BPF_ABS | BPF_LD or BPF_IND | BPF_LD +instructions. These instructions have an implicit program exit condition +as well. When an eBPF program tries to access data beyond +the packet boundary, the interpreter will abort the execution of the program. + +BPF_IND | BPF_W | BPF_LD is equivalent to: + R0 = ntohl(\*(u32 \*) (((struct sk_buff \*) R6)->data + src_reg + imm32)) + +eBPF maps +^^^^^^^^^ + +eBPF maps are provided for sharing data between kernel and user-space. +Currently implemented types are hash and array, with potential extension to +support bloom filters, radix trees, etc. A map is defined by its type, +maximum number of elements, key size and value size in bytes. The eBPF syscall +supports create, update, find and delete functions on maps. + +Function calls +^^^^^^^^^^^^^^ + +Function call arguments are passed using up to five registers (R1 - R5). +The return value is passed in a dedicated register (R0). Four additional +registers (R6 - R9) are callee-saved, and the values in these registers +are preserved within kernel functions. R0 - R5 are scratch registers within +kernel functions, and eBPF programs must therefore store/restore values in +these registers if needed across function calls. The stack can be accessed +using the read-only frame pointer R10. eBPF registers map 1:1 to hardware +registers on x86_64 and other 64-bit architectures. For example, the x86_64 +in-kernel JIT maps them as + +:: + + R0 - rax + R1 - rdi + R2 - rsi + R3 - rdx + R4 - rcx + R5 - r8 + R6 - rbx + R7 - r13 + R8 - r14 + R9 - r15 + R10 - rbp + +since the x86_64 ABI mandates rdi, rsi, rdx, rcx, r8, r9 for argument passing +and rbx, r12 - r15 are callee saved. + +Program start +^^^^^^^^^^^^^ + +An eBPF program receives a single argument and contains +a single eBPF main routine; the program does not contain eBPF functions. +Function calls are limited to a predefined set of kernel functions. The size +of a program is limited to 4K instructions: this ensures fast termination and +a limited number of kernel function calls. Prior to running an eBPF program, +a verifier performs static analysis to prevent loops in the code and +to ensure valid register usage and operand types. diff --git a/docs/CodingStandards.rst b/docs/CodingStandards.rst index 498d76b04d8a..de4f73c546b5 100644 --- a/docs/CodingStandards.rst +++ b/docs/CodingStandards.rst @@ -28,7 +28,7 @@ Note that some code bases (e.g. ``libc++``) have really good reasons to deviate from the coding standards. In the case of ``libc++``, this is because the naming and other conventions are dictated by the C++ standard. If you think there is a specific good reason to deviate from the standards here, please bring -it up on the LLVMdev mailing list. +it up on the LLVM-dev mailing list. There are some conventions that are not uniformly followed in the code base (e.g. the naming convention). This is because they are relatively new, and a diff --git a/docs/DeveloperPolicy.rst b/docs/DeveloperPolicy.rst index f090c6d56545..9e458559fbcd 100644 --- a/docs/DeveloperPolicy.rst +++ b/docs/DeveloperPolicy.rst @@ -30,7 +30,7 @@ This policy is also designed to accomplish the following objectives: This policy is aimed at frequent contributors to LLVM. People interested in contributing one-off patches can do so in an informal way by sending them to the `llvm-commits mailing list -<http://lists.cs.uiuc.edu/mailman/listinfo/llvm-commits>`_ and engaging another +<http://lists.llvm.org/mailman/listinfo/llvm-commits>`_ and engaging another developer to see it through the process. Developer Policies @@ -47,23 +47,23 @@ Stay Informed ------------- Developers should stay informed by reading at least the "dev" mailing list for -the projects you are interested in, such as `llvmdev -<http://lists.cs.uiuc.edu/mailman/listinfo/llvmdev>`_ for LLVM, `cfe-dev -<http://lists.cs.uiuc.edu/mailman/listinfo/cfe-dev>`_ for Clang, or `lldb-dev -<http://lists.cs.uiuc.edu/mailman/listinfo/lldb-dev>`_ for LLDB. If you are +the projects you are interested in, such as `llvm-dev +<http://lists.llvm.org/mailman/listinfo/llvm-dev>`_ for LLVM, `cfe-dev +<http://lists.llvm.org/mailman/listinfo/cfe-dev>`_ for Clang, or `lldb-dev +<http://lists.llvm.org/mailman/listinfo/lldb-dev>`_ for LLDB.
If you are doing anything more than just casual work on LLVM, it is suggested that you also subscribe to the "commits" mailing list for the subproject you're interested in, such as `llvm-commits -<http://lists.cs.uiuc.edu/mailman/listinfo/llvm-commits>`_, `cfe-commits -<http://lists.cs.uiuc.edu/mailman/listinfo/cfe-commits>`_, or `lldb-commits -<http://lists.cs.uiuc.edu/mailman/listinfo/lldb-commits>`_. Reading the +<http://lists.llvm.org/mailman/listinfo/llvm-commits>`_, `cfe-commits +<http://lists.llvm.org/mailman/listinfo/cfe-commits>`_, or `lldb-commits +<http://lists.llvm.org/mailman/listinfo/lldb-commits>`_. Reading the "commits" list and paying attention to changes being made by others is a good way to see what other people are interested in and watching the flow of the project as a whole. We recommend that active developers register an email account with `LLVM Bugzilla <http://llvm.org/bugs/>`_ and preferably subscribe to the `llvm-bugs -<http://lists.cs.uiuc.edu/mailman/listinfo/llvmbugs>`_ email list to keep track +<http://lists.llvm.org/mailman/listinfo/llvm-bugs>`_ email list to keep track of bugs and enhancements occurring in LLVM. We really appreciate people who are proactive at catching incoming bugs in their components and dealing with them promptly. @@ -365,7 +365,7 @@ If you have recently been granted commit access, these policies apply: #. You are granted *commit-after-approval* to all parts of LLVM. To get approval, submit a `patch`_ to `llvm-commits - <http://lists.cs.uiuc.edu/mailman/listinfo/llvm-commits>`_. When approved, + <http://lists.llvm.org/mailman/listinfo/llvm-commits>`_. When approved, you may commit it yourself. #. You are allowed to commit patches without approval which you think are @@ -394,8 +394,8 @@ Making a Major Change --------------------- When a developer begins a major new project with the aim of contributing it back -to LLVM, they should inform the community with an email to the `llvmdev -<http://lists.cs.uiuc.edu/mailman/listinfo/llvmdev>`_ email list, to the extent +to LLVM, they should inform the community with an email to the `llvm-dev +<http://lists.llvm.org/mailman/listinfo/llvm-dev>`_ email list, to the extent possible. The reason for this is to: #. keep the community informed about future changes to LLVM, @@ -608,7 +608,7 @@ LICENSE.txt files specifically indicate that they contain GPL code. We have no plans to change the license of LLVM. If you have questions or comments about the license, please contact the `LLVM Developer's Mailing -List <mailto:llvmdev@cs.uiuc.edu>`_. +List <mailto:llvm-dev@lists.llvm.org>`_. Patents ------- diff --git a/docs/ExtendingLLVM.rst b/docs/ExtendingLLVM.rst index 56c48af1ff35..3fd54c8360e5 100644 --- a/docs/ExtendingLLVM.rst +++ b/docs/ExtendingLLVM.rst @@ -15,7 +15,7 @@ When you come to this realization, stop and think. Do you really need to extend LLVM? Is it a new fundamental capability that LLVM does not support at its current incarnation or can it be synthesized from already pre-existing LLVM elements? If you are not sure, ask on the `LLVM-dev -<http://mail.cs.uiuc.edu/mailman/listinfo/llvmdev>`_ list. The reason is that +<http://lists.llvm.org/mailman/listinfo/llvm-dev>`_ list. The reason is that extending LLVM will get involved as you need to update all the different passes that you intend to use with your extension, and there are ``many`` LLVM analyses and transformations, so it may be quite a bit of work. 
diff --git a/docs/Frontend/PerformanceTips.rst b/docs/Frontend/PerformanceTips.rst index 22b3fe45984e..8d0abcd1c172 100644 --- a/docs/Frontend/PerformanceTips.rst +++ b/docs/Frontend/PerformanceTips.rst @@ -174,10 +174,10 @@ Adding to this document If you run across a case that you feel deserves to be covered here, please send a patch to `llvm-commits -<http://lists.cs.uiuc.edu/mailman/listinfo/llvm-commits>`_ for review. +<http://lists.llvm.org/mailman/listinfo/llvm-commits>`_ for review. -If you have questions on these items, please direct them to `llvmdev -<http://lists.cs.uiuc.edu/mailman/listinfo/llvmdev>`_. The more relevant +If you have questions on these items, please direct them to `llvm-dev +<http://lists.llvm.org/mailman/listinfo/llvm-dev>`_. The more relevant context you are able to give to your question, the more likely it is to be answered. diff --git a/docs/GettingStarted.rst b/docs/GettingStarted.rst index 75f0e60c41f3..df6bd7bc6ba8 100644 --- a/docs/GettingStarted.rst +++ b/docs/GettingStarted.rst @@ -714,9 +714,9 @@ used by people developing LLVM. | | the configure script. The default list is defined | | | as ``LLVM_ALL_TARGETS``, and can be set to include | | | out-of-tree targets. The default value includes: | -| | ``AArch64, ARM, CppBackend, Hexagon, | -| | Mips, MSP430, NVPTX, PowerPC, AMDGPU, Sparc, | -| | SystemZ, X86, XCore``. | +| | ``AArch64, AMDGPU, ARM, BPF, CppBackend, Hexagon, | +| | Mips, MSP430, NVPTX, PowerPC, Sparc, SystemZ | +| | X86, XCore``. | +-------------------------+----------------------------------------------------+ | LLVM_ENABLE_DOXYGEN | Build doxygen-based documentation from the source | | | code This is disabled by default because it is | diff --git a/docs/LangRef.rst b/docs/LangRef.rst index e7d6f67c9399..0039d014275a 100644 --- a/docs/LangRef.rst +++ b/docs/LangRef.rst @@ -6493,7 +6493,7 @@ Example: %ptr = alloca i32 ; yields i32*:ptr store i32 3, i32* %ptr ; yields void - %val = load i32* %ptr ; yields i32:val = i32 3 + %val = load i32, i32* %ptr ; yields i32:val = i32 3 .. _i_fence: diff --git a/docs/Makefile b/docs/Makefile index c9d2477c0af2..da649bc88732 100644 --- a/docs/Makefile +++ b/docs/Makefile @@ -31,6 +31,7 @@ $(PROJ_OBJ_DIR)/doxygen.cfg: doxygen.cfg.in -e 's/@llvm_doxygen_qhp_cust_filter_name@//g' \ -e 's/@llvm_doxygen_qhp_namespace@//g' \ -e 's/@searchengine_url@//g' \ + -e 's/@DOT_IMAGE_FORMAT@/png/g' \ > $@ endif diff --git a/docs/Phabricator.rst b/docs/Phabricator.rst index 1dcd6a0859d1..3426bfff164f 100644 --- a/docs/Phabricator.rst +++ b/docs/Phabricator.rst @@ -150,7 +150,7 @@ Status Please let us know whether you like it and what could be improved! We're still working on setting up a bug tracker, but you can email klimek-at-google-dot-com -and chandlerc-at-gmail-dot-com and CC the llvmdev mailing list with questions +and chandlerc-at-gmail-dot-com and CC the llvm-dev mailing list with questions until then. We also could use help implementing improvements. This sadly is really painful and hard because the Phabricator codebase is in PHP and not as testable as you might like. However, we've put exactly what we're deploying up diff --git a/docs/Projects.rst b/docs/Projects.rst index 095b87a65a10..46956642536b 100644 --- a/docs/Projects.rst +++ b/docs/Projects.rst @@ -254,4 +254,4 @@ Further Help If you have any questions or need any help creating an LLVM project, the LLVM team would be more than happy to help. 
You can always post your questions to the `LLVM Developers Mailing List -<http://lists.cs.uiuc.edu/pipermail/llvmdev/>`_. +<http://lists.llvm.org/pipermail/llvm-dev/>`_. diff --git a/docs/ReleaseNotes.rst b/docs/ReleaseNotes.rst index c0d2ea18981e..fd149c97e44c 100644 --- a/docs/ReleaseNotes.rst +++ b/docs/ReleaseNotes.rst @@ -5,12 +5,6 @@ LLVM 3.7 Release Notes .. contents:: :local: -.. warning:: - These are in-progress notes for the upcoming LLVM 3.7 release. You may - prefer the `LLVM 3.6 Release Notes <http://llvm.org/releases/3.6.0/docs - /ReleaseNotes.html>`_. - - Introduction ============ @@ -23,7 +17,7 @@ from the `LLVM releases web site <http://llvm.org/releases/>`_. For more information about LLVM, including information about the latest release, please check out the `main LLVM web site <http://llvm.org/>`_. If you have questions or comments, the `LLVM Developer's Mailing List -<http://lists.cs.uiuc.edu/mailman/listinfo/llvmdev>`_ is a good place to send +<http://lists.llvm.org/mailman/listinfo/llvm-dev>`_ is a good place to send them. Note that if you are reading this file from a Subversion checkout or the main @@ -48,46 +42,346 @@ Non-comprehensive list of changes in this release collection of tips for frontend authors on how to generate IR which LLVM is able to effectively optimize. -* The DataLayout is no longer optional. All the IR level optimizations expects +* The ``DataLayout`` is no longer optional. All the IR level optimizations expect it to be present and the API has been changed to use a reference instead of a pointer to make it explicit. The Module owns the datalayout and it has to match the one attached to the TargetMachine for generating code. -* ... next change ... + In 3.6, a pass was inserted in the pipeline to make the ``DataLayout`` accessible: + ``MyPassManager->add(new DataLayoutPass(MyTargetMachine->getDataLayout()));`` + In 3.7, you don't need a pass; you set the ``DataLayout`` on the ``Module``: + ``MyModule->setDataLayout(MyTargetMachine->createDataLayout());`` -.. NOTE - If you would like to document a larger change, then you can add a - subsection about it right here. You can copy the following boilerplate - and un-indent it (the indentation causes it to be inside this comment). + The LLVM C API ``LLVMGetTargetMachineData`` is deprecated to reflect the fact + that it won't be available anymore from ``TargetMachine`` in 3.8. - Special New Feature - ------------------- +* Comdats are now orthogonal to the linkage. LLVM will not create + comdats for weak linkage globals and the frontends are responsible + for explicitly adding them. - Makes programs 10x faster by doing Special New Thing. +* On ELF we now support multiple sections with the same name and + comdat. This allows for smaller object files since multiple + sections can have a simple name (`.text`, `.rodata`, etc). -Changes to the ARM Backend -------------------------- +* LLVM now lazily loads metadata in some cases. Creating archives + with IR files with debug info is now 25X faster. + +* llvm-ar can create archives in the BSD format used by OS X. + +* LLVM received a backend for the extended Berkeley Packet Filter + instruction set that can be dynamically loaded into the Linux kernel via the + `bpf(2) <http://man7.org/linux/man-pages/man2/bpf.2.html>`_ syscall.
+ + Support for BPF has been present in the kernel for some time, but starting + from 3.18 it has been extended with such features as: 64-bit registers, 8 + additional registers, conditional backwards jumps, call + instruction, shift instructions, map (hash table, array, etc.), 1-8 byte + load/store from stack, and more. - During this release ... + Up until now, users of BPF had to write bytecode by hand, or use + custom generators. This release adds a proper LLVM backend target for the BPF + bytecode architecture. + The BPF target is now available by default, and options exist in both Clang + (-target bpf) and llc (-march=bpf) to pick eBPF as a backend. + +* Switch-case lowering was rewritten to avoid generating unbalanced search trees + (`PR22262 <http://llvm.org/pr22262>`_) and to exploit profile information + when available. Some lowering strategies are now disabled when optimizations + are turned off, to save compile time. + +* The debug info IR class hierarchy now inherits from ``Metadata`` and has its + own bitcode records and assembly syntax + (`documented in LangRef <LangRef.html#specialized-metadata-nodes>`_). The debug + info verifier has been merged with the main verifier. + +* LLVM IR and APIs are in a period of transition to aid in the removal of + pointer types (the end goal being that pointers are typeless/opaque - void*, + if you will). Some APIs and IR constructs have been modified to take + explicit types that are currently checked to match the target type of their + pre-existing pointer type operands. Further changes are still needed, but the + more you can avoid using ``PointerType::getPointeeType``, the easier the + migration will be. + +* Argument-less ``TargetMachine::getSubtarget`` and + ``TargetMachine::getSubtargetImpl`` have been removed from the tree. Updating + out of tree ports is as simple as implementing a non-virtual version in the + target, but implementing full ``Function`` based ``TargetSubtargetInfo`` + support is recommended. + +* This is expected to be the last major release of LLVM that supports being + run on Windows XP and Windows Vista. For the next major release the minimum + Windows version requirement will be Windows 7. Changes to the MIPS Target -------------------------- - During this release ... +During this release the MIPS target has: + +* Added support for MIPS32R3, MIPS32R5, MIPS64R3, MIPS64R5, and microMIPS32. + +* Added support for dynamic stack realignment. This is of particular importance + to MSA on 32-bit subtargets since vectors always exceed the stack alignment on + the O32 ABI. + +* Added support for compiler-rt including: + + * Support for the Address and Undefined Behaviour Sanitizers for all MIPS + subtargets. + + * Support for the Data Flow and Memory Sanitizers for 64-bit subtargets. + + * Support for the Profiler for all MIPS subtargets. + +* Added support for libcxx and libcxxabi. + +* Improved inline assembly support such that memory constraints may now make use + of the appropriate address offsets available to the instructions. Also, added + support for the ``ZC`` constraint. + +* Added support for 128-bit integers on 64-bit subtargets and 16-bit floating + point conversions on all subtargets. + +* Added support for read-only ``.eh_frame`` sections by storing type information + indirectly. + +* Added support for MCJIT on all 64-bit subtargets as well as MIPS32R6. + +* Added support for fast instruction selection on MIPS32 and MIPS32R2 with PIC. + +* Various bug fixes,
including the following notable fixes: + * Fixed 'jumpy' debug line info around calls where calculation of the address + of the function would inappropriately change the line number. + + * Fixed missing ``__mips_isa_rev`` macro on the MIPS32R6 and MIPS64R6 + subtargets. + + * Fixed representation of NaN when targeting systems using traditional + encodings. Traditionally, MIPS has used NaN encodings that were compatible + with IEEE754-1985 but would later be found incompatible with IEEE754-2008. + + * Fixed multiple segfaults and assertions in the disassembler when + disassembling instructions that have memory operands. + + * Fixed multiple cases of suboptimal code generation involving $zero. + + * Fixed code generation of 128-bit shifts on 64-bit subtargets. + + * Prevented the delay slot filler from filling call delay slots with + instructions that modify or use $ra. + + * Fixed some remaining N32/N64 calling convention bugs when using small + structures on big-endian subtargets. + + * Fixed missing sign-extensions that are required by the N32/N64 calling + convention when generating calls to library functions with 32-bit + parameters. + + * Corrected the ``int64_t`` typedef to be ``long`` for N64. + + * ``-mno-odd-spreg`` is now honoured for vector insertion/extraction + operations when using -mmsa. + + * Fixed vector insertion and extraction for MSA on 64-bit subtargets. + + * Corrected the representation of member function pointers. This makes them + usable on microMIPS subtargets. Changes to the PowerPC Target ----------------------------- - During this release ... +There are numerous improvements to the PowerPC target in this release: + +* LLVM now supports the ISA 2.07B (POWER8) instruction set, including + direct moves between general registers and vector registers, and + built-in support for hardware transactional memory (HTM). Some missing + instructions from ISA 2.06 (POWER7) were also added. + +* Code generation for the local-dynamic and global-dynamic thread-local + storage models has been improved. + +* Loops may be restructured to leverage pre-increment loads and stores. + +* QPX - The vector instruction set used by the IBM Blue Gene/Q supercomputers + is now supported. + +* Loads from the TOC area are now correctly treated as invariant. + +* PowerPC now has support for i128 and v1i128 types. The types differ + in how they are passed in registers for the ELFv2 ABI. + +* Disassembly will now print shorter mnemonic aliases when available. + +* Optional register name prefixes for VSX and QPX registers are now + supported in the assembly parser. + +* The back end now contains a pass to remove unnecessary vector swaps + from POWER8 little-endian code generation. Additional improvements + are planned for release 3.8. + +* The undefined-behavior sanitizer (UBSan) is now supported for PowerPC. + +* Many new vector programming APIs have been added to altivec.h. + Additional ones are planned for release 3.8. + +* PowerPC now supports __builtin_call_with_static_chain. + +* PowerPC now supports the revised -mrecip option that permits finer + control over reciprocal estimates. +* Many bugs have been identified and fixed. -Changes to the OCaml bindings +Changes to the SystemZ Target ----------------------------- - During this release ... +* LLVM no longer attempts to automatically detect the current host CPU when + invoked natively. +* Support for all thread-local storage models. (Previous releases would support + only the local-exec TLS model.)
+ +* The POPCNT instruction is now used on z196 and above. + +* The RISBGN instruction is now used on zEC12 and above. + +* Support for the transactional-execution facility on zEC12 and above. + +* Support for the z13 processor and its vector facility. + + +Changes to the JIT APIs +----------------------- + +* Added a new C++ JIT API called On Request Compilation, or ORC. + + ORC is a new JIT API inspired by MCJIT but designed to be more testable and + easier to extend with new features. A key new feature already in tree is lazy, + function-at-a-time compilation for X86. Also included is a reimplementation of + MCJIT's API and behavior (OrcMCJITReplacement). MCJIT itself remains in tree, + and continues to be the default JIT ExecutionEngine, though new users are + encouraged to try ORC out for their projects. (A good place to start is the + new ORC tutorials under llvm/examples/kaleidoscope/orc). + +Sub-project Status Update +========================= + +In addition to the core LLVM 3.7 distribution of production-quality compiler +infrastructure, the LLVM project includes sub-projects that use the LLVM core +and share the same distribution license. This section provides updates on these +sub-projects. + +Polly - The Polyhedral Loop Optimizer in LLVM +--------------------------------------------- + +`Polly <http://polly.llvm.org>`_ is a polyhedral loop optimization +infrastructure that provides data-locality optimizations to LLVM-based +compilers. When compiled as part of clang or loaded as a module into clang, +it can perform loop optimizations such as tiling, loop fusion or outer-loop +vectorization. As a generic loop optimization infrastructure it allows +developers to get a per-loop-iteration model of a loop nest on which detailed +analysis and transformations can be performed. + +Changes since the last release: + +* isl imported into Polly distribution + + `isl <http://repo.or.cz/w/isl.git>`_, the math library Polly uses, has been + imported into the source code repository of Polly and is now distributed as part + of Polly. As this was the last external library dependency of Polly, Polly can + now be compiled right after checking out the Polly source code without the need + for any additional libraries to be pre-installed. + +* Small integer optimization of isl + + The MIT licensed imath backend used in `isl <http://repo.or.cz/w/isl.git>`_ for + arbitrary width integer computations has been optimized to use native integer + operations for the common case where the operands of a computation fit into 32 + bits and to only fall back to large arbitrary precision integers for the + remaining cases. This optimization has greatly improved the compile-time + performance of Polly, both due to faster native operations and due to a + reduction in malloc traffic and pointer indirections. As a result, computations + that use arbitrary precision integers heavily have been sped up by almost 6x. + In turn, the compile time of Polly on the Polybench test kernels in the LNT + suite has been reduced by 20% on average, with compile time reductions between + 9% and 43%. + +* Schedule Trees + + Polly now internally uses so-called "schedule trees" to model the loop + structure it optimizes. Schedule trees are an easy-to-understand tree structure + that describes a loop nest using integer constraint sets to keep track of + execution constraints. It allows the developer to use per-tree-node operations + to modify the loop tree.
Programmatic analyses that work on the schedule tree + (e.g., dependence analysis) also show a visible speedup as they can exploit + the tree structure of the schedule and need to fall back to ILP-based + optimization problems less often. Section 6 of `Polyhedral AST generation is + more than scanning polyhedra + <http://www.grosser.es/#pub-polyhedral-AST-generation>`_ gives a detailed + explanation of these schedule trees. + +* Scalar and PHI node modeling - Polly as an analysis + + Polly now requires almost no preprocessing to analyse LLVM-IR, which makes it + easier to use Polly as a pure analysis pass e.g. to provide more precise + dependence information to non-polyhedral transformation passes. Originally, + Polly required the input LLVM-IR to be preprocessed such that all scalar and + PHI-node dependences are translated to in-memory operations. Since this release, + Polly has full support for scalar and PHI node dependences and requires no + scalar-to-memory translation for such kinds of dependences. + +* Modeling of modulo and non-affine conditions + + Polly now supports modulo operations such as A[t%2][i][j], which appear + often in stencil computations, and also allows data-dependent conditional + branches as they result, e.g., from ternary conditions such as A[i] > 255 ? 255 : + A[i]. + +* Delinearization + + Polly now supports the analysis of manually linearized multi-dimensional arrays + as they result from macros such as + "#define 2DARRAY(A,i,j) (A.data[(i) * A.size + (j)])". Similar constructs appear + in old C code written before C99, C++ code such as boost::ublas, LLVM exported + from Julia, Matlab-generated code and many others. Our work titled + `Optimistic Delinearization of Parametrically Sized Arrays + <http://www.grosser.es/#pub-optimistic-delinerization>`_ gives details. + +* Compile time improvements + + Pratik Bahtu worked on compile-time performance tuning of Polly. His work + together with the support for schedule trees and the small integer optimization + in isl notably reduced the compile time. + +* Increased compute timeouts + + As Polly's compile time has been notably improved, we were able to increase + the compile time safeguards in Polly. As a result, the default configuration + of Polly can now analyze larger loop nests without running into compile time + restrictions. + +* Export Debug Locations via JSCoP file + + Polly's JSCoP import/export format gained support for debug locations that show + the user the source code location of detected scops. + +* Improved Windows support + + The compilation of Polly on Windows using CMake has been improved and several + Visual Studio build issues have been addressed. + +* Many bug fixes + +libunwind +--------- + +The unwind implementation which used to reside in `libc++abi` has been moved into +a separate repository. This implementation can still be used for `libc++abi` by +specifying `-DLIBCXXABI_USE_LLVM_UNWINDER=YES` and +`-DLIBCXXABI_LIBUNWIND_PATH=<path to libunwind source>` when configuring +`libc++abi`, which defaults to `true` when building on ARM. + +The new repository can also be built standalone if just `libunwind` is desired. External Open Source Projects Using LLVM 3.7 ============================================ An exciting aspect of LLVM is that it is used as an enabling technology for a lot of other language and tools projects. This section lists some of the projects that have already been updated to work with LLVM 3.7.
-* A project + +LDC - the LLVM-based D compiler ------------------------------- + +`D <http://dlang.org>`_ is a language with C-like syntax and static typing. It +pragmatically combines efficiency, control, and modeling power, with safety and +programmer productivity. D supports powerful concepts like Compile-Time Function +Execution (CTFE) and Template Meta-Programming, provides an innovative approach +to concurrency and offers many classical paradigms. + +`LDC <http://wiki.dlang.org/LDC>`_ uses the frontend from the reference compiler +combined with LLVM as the backend to produce efficient native code. LDC targets +x86/x86_64 systems like Linux, OS X, FreeBSD and Windows and also Linux on +PowerPC (32/64 bit). Ports to other architectures like ARM, AArch64 and MIPS64 +are underway. + Portable Computing Language (pocl) ---------------------------------- + In addition to producing an easily portable open source OpenCL implementation, another major goal of `pocl <http://portablecl.org/>`_ is improving performance portability of OpenCL programs with compiler optimizations, reducing the need for target-dependent manual optimizations. An important part of pocl is a set of LLVM passes used to statically parallelize multiple work-items with the kernel compiler, even in the presence of work-group barriers. + + TTA-based Co-design Environment (TCE) ------------------------------------- + `TCE <http://tce.cs.tut.fi/>`_ is a toolset for designing customized exposed datapath processors based on the Transport triggered architecture (TTA). + The toolset provides a complete co-design flow from C/C++ programs down to synthesizable VHDL/Verilog and parallel program binaries. Processor customization points include the register files, function units, supported operations, and the interconnection network. + TCE uses Clang and LLVM for C/C++/OpenCL C language support, target-independent optimizations and also for parts of code generation. It generates new LLVM-based code generators "on the fly" for the designed processors and loads them into the compiler backend as runtime libraries to avoid per-target recompilation of larger parts of the compiler chain. + BPF Compiler Collection (BCC) ----------------------------- +`BCC <https://github.com/iovisor/bcc>`_ is a Python + C framework for tracing and networking that uses the Clang rewriter, a second pass of Clang, and the BPF backend to generate eBPF and push it into the kernel. + LLVMSharp & ClangSharp ---------------------- + +`LLVMSharp <http://www.llvmsharp.org>`_ and `ClangSharp <http://www.clangsharp.org>`_ are type-safe C# bindings for Microsoft.NET and Mono that Platform Invoke into the native libraries. ClangSharp is self-hosted and is used to generate LLVMSharp using the LLVM-C API. + `LLVMSharp Kaleidoscope Tutorials <http://www.llvmsharp.org/Kaleidoscope/>`_ are instructive examples of writing a compiler in C#, with certain improvements like using the visitor pattern to generate LLVM IR. + `ClangSharp PInvoke Generator <http://www.clangsharp.org/PInvoke/>`_ is the self-hosting mechanism for LLVM/ClangSharp and is demonstrative of using LibClang to generate Platform Invoke (PInvoke) signatures for C APIs. Additional Information @@ -111,4 +472,3 @@ going into the ``llvm/docs/`` directory in the LLVM tree. If you have any questions or comments about LLVM, please feel free to contact us via the `mailing lists <http://llvm.org/docs/#maillist>`_.
- diff --git a/docs/Statepoints.rst b/docs/Statepoints.rst index b1d1ed8dc10b..eb5866eb552f 100644 --- a/docs/Statepoints.rst +++ b/docs/Statepoints.rst @@ -565,7 +565,7 @@ The existing IR Verifier pass has been extended to check most of the local restrictions on the intrinsics mentioned in their respective documentation. The current implementation in LLVM does not check the key relocation invariant, but this is ongoing work on developing such -a verifier. Please ask on llvmdev if you're interested in +a verifier. Please ask on llvm-dev if you're interested in experimenting with the current version. .. _statepoint-utilities: @@ -696,7 +696,7 @@ If you are scheduling the RewriteStatepointsForGC pass late in the pass order, you should probably schedule this pass immediately before it. The exception would be if you need to preserve abstract frame information (e.g. for deoptimization or introspection) at safepoints. In that case, ask on the -llvmdev mailing list for suggestions. +llvm-dev mailing list for suggestions. Bugs and Enhancements @@ -707,8 +707,8 @@ tracked by performing a `bugzilla search <http://llvm.org/bugs/buglist.cgi?cmdtype=runnamed&namedcmd=Statepoint%20Bugs&list_id=64342>`_ for [Statepoint] in the summary field. When filing new bugs, please use this tag so that interested parties see the newly filed bug. As -with most LLVM features, design discussions take place on `llvmdev -<http://lists.cs.uiuc.edu/mailman/listinfo/llvmdev>`_, and patches +with most LLVM features, design discussions take place on `llvm-dev +<http://lists.llvm.org/mailman/listinfo/llvm-dev>`_, and patches should be sent to `llvm-commits -<http://lists.cs.uiuc.edu/mailman/listinfo/llvm-commits>`_ for review. +<http://lists.llvm.org/mailman/listinfo/llvm-commits>`_ for review. diff --git a/docs/TableGen/LangIntro.rst b/docs/TableGen/LangIntro.rst index 4d4551e8a6e7..a148634e3edf 100644 --- a/docs/TableGen/LangIntro.rst +++ b/docs/TableGen/LangIntro.rst @@ -7,7 +7,7 @@ TableGen Language Introduction .. warning:: This document is extremely rough. If you find something lacking, please - fix it, file a documentation bug, or ask about it on llvmdev. + fix it, file a documentation bug, or ask about it on llvm-dev. Introduction ============ diff --git a/docs/TableGen/LangRef.rst b/docs/TableGen/LangRef.rst index 134afedbb7b4..27b2c8beaa69 100644 --- a/docs/TableGen/LangRef.rst +++ b/docs/TableGen/LangRef.rst @@ -7,7 +7,7 @@ TableGen Language Reference .. warning:: This document is extremely rough. If you find something lacking, please - fix it, file a documentation bug, or ask about it on llvmdev. + fix it, file a documentation bug, or ask about it on llvm-dev. Introduction ============ diff --git a/docs/conf.py b/docs/conf.py index 18972824dbb4..27919c20a7a5 100644 --- a/docs/conf.py +++ b/docs/conf.py @@ -11,6 +11,7 @@ # serve to show the default. import sys, os +from datetime import date # If extensions (or modules to document with autodoc) are in another directory, # add these directories to sys.path here. If the directory is relative to the @@ -40,7 +41,7 @@ master_doc = 'index' # General information about the project. 
project = u'LLVM' -copyright = u'2003-2014, LLVM Project' +copyright = u'2003-%d, LLVM Project' % date.today().year # The version info for the project you're documenting, acts as replacement for # |version| and |release|, also used in various other places throughout the diff --git a/docs/doxygen.cfg.in b/docs/doxygen.cfg.in index d8c4051e5637..5c70db0332d5 100644 --- a/docs/doxygen.cfg.in +++ b/docs/doxygen.cfg.in @@ -2205,7 +2205,7 @@ DIRECTORY_GRAPH = YES # The default value is: png. # This tag requires that the tag HAVE_DOT is set to YES. -DOT_IMAGE_FORMAT = png +DOT_IMAGE_FORMAT = @DOT_IMAGE_FORMAT@ # If DOT_IMAGE_FORMAT is set to svg, then this option can be set to YES to # enable generation of interactive SVG images that allow zooming and panning. diff --git a/docs/index.rst b/docs/index.rst index 5c04a4e3fc96..66c55758c4db 100644 --- a/docs/index.rst +++ b/docs/index.rst @@ -1,11 +1,6 @@ Overview ======== -.. warning:: - - If you are using a released version of LLVM, see `the download page - <http://llvm.org/releases/>`_ to find your documentation. - The LLVM compiler infrastructure supports a wide range of projects, from industrial strength compilers to specialized JIT applications to small research projects. @@ -425,12 +420,12 @@ Mailing Lists If you can't find what you need in these docs, try consulting the mailing lists. -`Developer's List (llvmdev)`__ +`Developer's List (llvm-dev)`__ This list is for people who want to be included in technical discussions of LLVM. People post to this list when they have questions about writing code for or using the LLVM tools. It is relatively low volume. - .. __: http://lists.cs.uiuc.edu/mailman/listinfo/llvmdev + .. __: http://lists.llvm.org/mailman/listinfo/llvm-dev `Commits Archive (llvm-commits)`__ This list contains all commit messages that are made when LLVM developers @@ -439,26 +434,26 @@ lists. stay on the bleeding edge of LLVM development. This list is very high volume. - .. __: http://lists.cs.uiuc.edu/pipermail/llvm-commits/ + .. __: http://lists.llvm.org/pipermail/llvm-commits/ -`Bugs & Patches Archive (llvmbugs)`__ +`Bugs & Patches Archive (llvm-bugs)`__ This list gets emailed every time a bug is opened and closed. It is - higher volume than the LLVMdev list. + higher volume than the LLVM-dev list. - .. __: http://lists.cs.uiuc.edu/pipermail/llvmbugs/ + .. __: http://lists.llvm.org/pipermail/llvm-bugs/ `Test Results Archive (llvm-testresults)`__ A message is automatically sent to this list by every active nightly tester when it completes. As such, this list gets email several times each day, making it a high volume list. - .. __: http://lists.cs.uiuc.edu/pipermail/llvm-testresults/ + .. __: http://lists.llvm.org/pipermail/llvm-testresults/ `LLVM Announcements List (llvm-announce)`__ This is a low volume list that provides important announcements regarding LLVM. It gets email about once a month. - .. __: http://lists.cs.uiuc.edu/mailman/listinfo/llvm-announce + .. __: http://lists.llvm.org/mailman/listinfo/llvm-announce IRC --- diff --git a/docs/tutorial/LangImpl9.rst b/docs/tutorial/LangImpl9.rst index 33987687dee6..6c43d53f90f9 100644 --- a/docs/tutorial/LangImpl9.rst +++ b/docs/tutorial/LangImpl9.rst @@ -90,8 +90,8 @@ For example, try adding: Have fun - try doing something crazy and unusual. Building a language like everyone else always has, is much less fun than trying something a little crazy or off the wall and seeing how it turns out. 
If you get -stuck or want to talk about it, feel free to email the `llvmdev mailing -list <http://lists.cs.uiuc.edu/mailman/listinfo/llvmdev>`_: it has lots +stuck or want to talk about it, feel free to email the `llvm-dev mailing +list <http://lists.llvm.org/mailman/listinfo/llvm-dev>`_: it has lots of people who are interested in languages and are often willing to help out. @@ -169,8 +169,8 @@ It is certainly possible to implement a safe language in LLVM, but LLVM IR does not itself guarantee safety. The LLVM IR allows unsafe pointer casts, use after free bugs, buffer over-runs, and a variety of other problems. Safety needs to be implemented as a layer on top of LLVM and, -conveniently, several groups have investigated this. Ask on the `llvmdev -mailing list <http://lists.cs.uiuc.edu/mailman/listinfo/llvmdev>`_ if +conveniently, several groups have investigated this. Ask on the `llvm-dev +mailing list <http://lists.llvm.org/mailman/listinfo/llvm-dev>`_ if you are interested in more details. Language-Specific Optimizations @@ -220,7 +220,7 @@ safe to optimize that into "return 0;" because C specifies what the In addition to simple library knowledge, it is possible to embed a variety of other language-specific information into the LLVM IR. If you have a specific need and run into a wall, please bring the topic up on -the llvmdev list. At the very worst, you can always treat LLVM as if it +the llvm-dev list. At the very worst, you can always treat LLVM as if it were a "dumb code generator" and implement the high-level optimizations you desire in your front-end, on the language-specific AST. diff --git a/docs/tutorial/OCamlLangImpl8.rst b/docs/tutorial/OCamlLangImpl8.rst index 6f694931ef86..0346fa9fed14 100644 --- a/docs/tutorial/OCamlLangImpl8.rst +++ b/docs/tutorial/OCamlLangImpl8.rst @@ -95,8 +95,8 @@ For example, try adding: Have fun - try doing something crazy and unusual. Building a language like everyone else always has, is much less fun than trying something a little crazy or off the wall and seeing how it turns out. If you get -stuck or want to talk about it, feel free to email the `llvmdev mailing -list <http://lists.cs.uiuc.edu/mailman/listinfo/llvmdev>`_: it has lots +stuck or want to talk about it, feel free to email the `llvm-dev mailing +list <http://lists.llvm.org/mailman/listinfo/llvm-dev>`_: it has lots of people who are interested in languages and are often willing to help out. @@ -174,8 +174,8 @@ It is certainly possible to implement a safe language in LLVM, but LLVM IR does not itself guarantee safety. The LLVM IR allows unsafe pointer casts, use after free bugs, buffer over-runs, and a variety of other problems. Safety needs to be implemented as a layer on top of LLVM and, -conveniently, several groups have investigated this. Ask on the `llvmdev -mailing list <http://lists.cs.uiuc.edu/mailman/listinfo/llvmdev>`_ if +conveniently, several groups have investigated this. Ask on the `llvm-dev +mailing list <http://lists.llvm.org/mailman/listinfo/llvm-dev>`_ if you are interested in more details. Language-Specific Optimizations @@ -225,7 +225,7 @@ safe to optimize that into "return 0;" because C specifies what the In addition to simple library knowledge, it is possible to embed a variety of other language-specific information into the LLVM IR. If you have a specific need and run into a wall, please bring the topic up on -the llvmdev list. At the very worst, you can always treat LLVM as if it +the llvm-dev list. 
At the very worst, you can always treat LLVM as if it were a "dumb code generator" and implement the high-level optimizations you desire in your front-end, on the language-specific AST. diff --git a/include/llvm-c/TargetMachine.h b/include/llvm-c/TargetMachine.h index d4993e7e6da1..8cf1f43cb3c5 100644 --- a/include/llvm-c/TargetMachine.h +++ b/include/llvm-c/TargetMachine.h @@ -115,7 +115,7 @@ char *LLVMGetTargetMachineCPU(LLVMTargetMachineRef T); LLVMDisposeMessage. */ char *LLVMGetTargetMachineFeatureString(LLVMTargetMachineRef T); -/** Returns the llvm::DataLayout used for this llvm:TargetMachine. */ +/** Deprecated: use LLVMGetDataLayout(LLVMModuleRef M) instead. */ LLVMTargetDataRef LLVMGetTargetMachineData(LLVMTargetMachineRef T); /** Set the target machine's ASM verbosity. */ diff --git a/include/llvm/ADT/SmallVector.h b/include/llvm/ADT/SmallVector.h index 5b208b76a21f..b9384702c3ba 100644 --- a/include/llvm/ADT/SmallVector.h +++ b/include/llvm/ADT/SmallVector.h @@ -315,8 +315,10 @@ protected: T2>::value>::type * = nullptr) { // Use memcpy for PODs iterated by pointers (which includes SmallVector // iterators): std::uninitialized_copy optimizes to memmove, but we can - // use memcpy here. - memcpy(Dest, I, (E-I)*sizeof(T)); + // use memcpy here. Note that I and E are iterators and thus might be + // invalid for memcpy if they are equal. + if (I != E) + memcpy(Dest, I, (E - I) * sizeof(T)); } /// Double the size of the allocated memory, guaranteeing space for at diff --git a/include/llvm/ADT/StringMap.h b/include/llvm/ADT/StringMap.h index 8721c73b95b1..9d038560bf92 100644 --- a/include/llvm/ADT/StringMap.h +++ b/include/llvm/ADT/StringMap.h @@ -158,7 +158,8 @@ public: // Copy the string information. char *StrBuffer = const_cast<char*>(NewItem->getKeyData()); - memcpy(StrBuffer, Key.data(), KeyLength); + if (KeyLength > 0) + memcpy(StrBuffer, Key.data(), KeyLength); StrBuffer[KeyLength] = 0; // Null terminate for convenience of clients. return NewItem; } diff --git a/include/llvm/CodeGen/LiveRegMatrix.h b/include/llvm/CodeGen/LiveRegMatrix.h index e169058ca563..86a0c7bd626f 100644 --- a/include/llvm/CodeGen/LiveRegMatrix.h +++ b/include/llvm/CodeGen/LiveRegMatrix.h @@ -32,11 +32,13 @@ namespace llvm { class LiveInterval; class LiveIntervalAnalysis; +class MachineRegisterInfo; class TargetRegisterInfo; class VirtRegMap; class LiveRegMatrix : public MachineFunctionPass { const TargetRegisterInfo *TRI; + MachineRegisterInfo *MRI; LiveIntervals *LIS; VirtRegMap *VRM; diff --git a/include/llvm/CodeGen/MachineRegisterInfo.h b/include/llvm/CodeGen/MachineRegisterInfo.h index 67583be616c3..5e607cdae48e 100644 --- a/include/llvm/CodeGen/MachineRegisterInfo.h +++ b/include/llvm/CodeGen/MachineRegisterInfo.h @@ -95,8 +95,20 @@ private: return MO->Contents.Reg.Next; } + /// UsedRegUnits - This is a bit vector that is computed and set by the + /// register allocator, and must be kept up to date by passes that run after + /// register allocation (though most don't modify this). This is used + /// so that the code generator knows which callee save registers to save and + /// for other target specific uses. + /// This vector has bits set for register units that are modified in the + /// current function. It doesn't include registers clobbered by function + /// calls with register mask operands. + BitVector UsedRegUnits; + /// UsedPhysRegMask - Additional used physregs including aliases. /// This bit vector represents all the registers clobbered by function calls. 
+ /// It can model things that UsedRegUnits can't, such as function calls that + /// clobber ymm7 but preserve the low half in xmm7. BitVector UsedPhysRegMask; /// ReservedRegs - This is a bit vector of reserved registers. The target @@ -641,12 +653,55 @@ public: /// ignored. bool isPhysRegModified(unsigned PhysReg) const; + //===--------------------------------------------------------------------===// + // Physical Register Use Info + //===--------------------------------------------------------------------===// + + /// isPhysRegUsed - Return true if the specified register is used in this + /// function. Also check for clobbered aliases and registers clobbered by + /// function calls with register mask operands. + /// + /// This only works after register allocation. + bool isPhysRegUsed(unsigned Reg) const { + if (UsedPhysRegMask.test(Reg)) + return true; + for (MCRegUnitIterator Units(Reg, getTargetRegisterInfo()); + Units.isValid(); ++Units) + if (UsedRegUnits.test(*Units)) + return true; + return false; + } + + /// Mark the specified register unit as used in this function. + /// This should only be called during and after register allocation. + void setRegUnitUsed(unsigned RegUnit) { + UsedRegUnits.set(RegUnit); + } + + /// setPhysRegUsed - Mark the specified register used in this function. + /// This should only be called during and after register allocation. + void setPhysRegUsed(unsigned Reg) { + for (MCRegUnitIterator Units(Reg, getTargetRegisterInfo()); + Units.isValid(); ++Units) + UsedRegUnits.set(*Units); + } + /// addPhysRegsUsedFromRegMask - Mark any registers not in RegMask as used. /// This corresponds to the bit mask attached to register mask operands. void addPhysRegsUsedFromRegMask(const uint32_t *RegMask) { UsedPhysRegMask.setBitsNotInMask(RegMask); } + /// setPhysRegUnused - Mark the specified register unused in this function. + /// This should only be called during and after register allocation. + void setPhysRegUnused(unsigned Reg) { + UsedPhysRegMask.reset(Reg); + for (MCRegUnitIterator Units(Reg, getTargetRegisterInfo()); + Units.isValid(); ++Units) + UsedRegUnits.reset(*Units); + } + + //===--------------------------------------------------------------------===// // Reserved Register Info //===--------------------------------------------------------------------===// diff --git a/include/llvm/Target/TargetMachine.h b/include/llvm/Target/TargetMachine.h index 06a2b13836ed..f1e9d1718f5a 100644 --- a/include/llvm/Target/TargetMachine.h +++ b/include/llvm/Target/TargetMachine.h @@ -125,10 +125,15 @@ public: return *static_cast<const STC*>(getSubtargetImpl(F)); } + /// Deprecated in 3.7, will be removed in 3.8. Use createDataLayout() instead. + /// /// This method returns a pointer to the DataLayout for the target. It should /// be unchanging for every subtarget. const DataLayout *getDataLayout() const { return &DL; } + /// Create a DataLayout. + const DataLayout createDataLayout() const { return DL; } + /// \brief Reset the target options based on the function's attributes. // FIXME: Remove TargetOptions that affect per-function code generation // from TargetMachine. 
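// A minimal caller-side sketch of the getDataLayout() -> createDataLayout()
// migration requested in the hunk above (hypothetical helper; assumes the
// embedding application already has a Module M and a TargetMachine TM):
#include "llvm/IR/Module.h"
#include "llvm/Target/TargetMachine.h"

static void applyTargetLayout(llvm::Module &M, const llvm::TargetMachine &TM) {
  // Previously: M.setDataLayout(*TM.getDataLayout()); // deprecated in 3.7
  M.setDataLayout(TM.createDataLayout()); // takes the DataLayout by value
}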
diff --git a/lib/Analysis/BasicAliasAnalysis.cpp b/lib/Analysis/BasicAliasAnalysis.cpp index 68f766edb301..35863542f437 100644 --- a/lib/Analysis/BasicAliasAnalysis.cpp +++ b/lib/Analysis/BasicAliasAnalysis.cpp @@ -206,14 +206,6 @@ static Value *GetLinearExpression(Value *V, APInt &Scale, APInt &Offset, return V; } - if (ConstantInt *Const = dyn_cast<ConstantInt>(V)) { - // if it's a constant, just convert it to an offset - // and remove the variable. - Offset += Const->getValue(); - assert(Scale == 0 && "Constant values don't have a scale"); - return V; - } - if (BinaryOperator *BOp = dyn_cast<BinaryOperator>(V)) { if (ConstantInt *RHSC = dyn_cast<ConstantInt>(BOp->getOperand(1))) { switch (BOp->getOpcode()) { @@ -261,10 +253,7 @@ static Value *GetLinearExpression(Value *V, APInt &Scale, APInt &Offset, Value *Result = GetLinearExpression(CastOp, Scale, Offset, Extension, DL, Depth + 1, AC, DT); Scale = Scale.zext(OldWidth); - - // We have to sign-extend even if Extension == EK_ZeroExt as we can't - // decompose a sign extension (i.e. zext(x - 1) != zext(x) - zext(-1)). - Offset = Offset.sext(OldWidth); + Offset = Offset.zext(OldWidth); return Result; } @@ -1135,43 +1124,12 @@ AliasResult BasicAliasAnalysis::aliasGEP( } } + // Try to distinguish something like &A[i][1] against &A[42][0]. + // Grab the least significant bit set in any of the scales. if (!GEP1VariableIndices.empty()) { uint64_t Modulo = 0; - bool AllPositive = true; - for (unsigned i = 0, e = GEP1VariableIndices.size(); i != e; ++i) { - - // Try to distinguish something like &A[i][1] against &A[42][0]. - // Grab the least significant bit set in any of the scales. We - // don't need std::abs here (even if the scale's negative) as we'll - // be ^'ing Modulo with itself later. + for (unsigned i = 0, e = GEP1VariableIndices.size(); i != e; ++i) Modulo |= (uint64_t) GEP1VariableIndices[i].Scale; - - if (AllPositive) { - // If the Value could change between cycles, then any reasoning about - // the Value this cycle may not hold in the next cycle. We'll just - // give up if we can't determine conditions that hold for every cycle: - const Value *V = GEP1VariableIndices[i].V; - - bool SignKnownZero, SignKnownOne; - ComputeSignBit(const_cast<Value *>(V), SignKnownZero, SignKnownOne, *DL, - 0, AC1, nullptr, DT); - - // Zero-extension widens the variable, and so forces the sign - // bit to zero. - bool IsZExt = GEP1VariableIndices[i].Extension == EK_ZeroExt; - SignKnownZero |= IsZExt; - SignKnownOne &= !IsZExt; - - // If the variable begins with a zero then we know it's - // positive, regardless of whether the value is signed or - // unsigned. - int64_t Scale = GEP1VariableIndices[i].Scale; - AllPositive = - (SignKnownZero && Scale >= 0) || - (SignKnownOne && Scale < 0); - } - } - Modulo = Modulo ^ (Modulo & (Modulo - 1)); // We can compute the difference between the two addresses @@ -1182,12 +1140,6 @@ AliasResult BasicAliasAnalysis::aliasGEP( V2Size != MemoryLocation::UnknownSize && ModOffset >= V2Size && V1Size <= Modulo - ModOffset) return NoAlias; - - // If we know all the variables are positive, then GEP1 >= GEP1BasePtr. - // If GEP1BasePtr > V2 (GEP1BaseOffset > 0) then we know the pointers - // don't alias if V2Size can fit in the gap between V2 and GEP1BasePtr. 
- if (AllPositive && GEP1BaseOffset > 0 && V2Size <= (uint64_t) GEP1BaseOffset) - return NoAlias; } // Statically, we can see that the base objects are the same, but the diff --git a/lib/Analysis/IPA/GlobalsModRef.cpp b/lib/Analysis/IPA/GlobalsModRef.cpp index 18d45dd6a396..28fb49c89019 100644 --- a/lib/Analysis/IPA/GlobalsModRef.cpp +++ b/lib/Analysis/IPA/GlobalsModRef.cpp @@ -440,30 +440,39 @@ void GlobalsModRef::AnalyzeCallGraph(CallGraph &CG, Module &M) { } // Scan the function bodies for explicit loads or stores. - for (unsigned i = 0, e = SCC.size(); i != e && FunctionEffect != ModRef; - ++i) - for (inst_iterator II = inst_begin(SCC[i]->getFunction()), - E = inst_end(SCC[i]->getFunction()); - II != E && FunctionEffect != ModRef; ++II) - if (LoadInst *LI = dyn_cast<LoadInst>(&*II)) { + for (auto *Node : SCC) { + if (FunctionEffect == ModRef) + break; // The mod/ref lattice saturates here. + for (Instruction &I : inst_range(Node->getFunction())) { + if (FunctionEffect == ModRef) + break; // The mod/ref lattice saturates here. + + // We handle calls specially because the graph-relevant aspects are + // handled above. + if (auto CS = CallSite(&I)) { + if (isAllocationFn(&I, TLI) || isFreeCall(&I, TLI)) { + // FIXME: It is completely unclear why this is necessary and not + // handled by the above graph code. + FunctionEffect |= ModRef; + } else if (Function *Callee = CS.getCalledFunction()) { + // The callgraph doesn't include intrinsic calls. + if (Callee->isIntrinsic()) { + ModRefBehavior Behaviour = + AliasAnalysis::getModRefBehavior(Callee); + FunctionEffect |= (Behaviour & ModRef); + } + } + continue; + } + + // For all non-call instructions we use the primary predicates for whether + // they read or write memory. + if (I.mayReadFromMemory()) FunctionEffect |= Ref; - if (LI->isVolatile()) - // Volatile loads may have side-effects, so mark them as writing - // memory (for example, a flag inside the processor). - FunctionEffect |= Mod; - } else if (StoreInst *SI = dyn_cast<StoreInst>(&*II)) { + if (I.mayWriteToMemory()) FunctionEffect |= Mod; - if (SI->isVolatile()) - // Treat volatile stores as reading memory somewhere. - FunctionEffect |= Ref; - } else if (isAllocationFn(&*II, TLI) || isFreeCall(&*II, TLI)) { - FunctionEffect |= ModRef; - } else if (IntrinsicInst *Intrinsic = dyn_cast<IntrinsicInst>(&*II)) { - // The callgraph doesn't include intrinsic calls. - Function *Callee = Intrinsic->getCalledFunction(); - ModRefBehavior Behaviour = AliasAnalysis::getModRefBehavior(Callee); - FunctionEffect |= (Behaviour & ModRef); - } + } + } if ((FunctionEffect & Mod) == 0) ++NumReadMemFunctions; diff --git a/lib/Analysis/InstructionSimplify.cpp b/lib/Analysis/InstructionSimplify.cpp index fa42b48b6cdb..a7f8f5c8c99b 100644 --- a/lib/Analysis/InstructionSimplify.cpp +++ b/lib/Analysis/InstructionSimplify.cpp @@ -3574,18 +3574,9 @@ static Value *SimplifyExtractElementInst(Value *Vec, Value *Idx, const Query &, // If extracting a specified index from the vector, see if we can recursively // find a previously computed scalar that was inserted into the vector. - if (auto *IdxC = dyn_cast<ConstantInt>(Idx)) { - unsigned IndexVal = IdxC->getZExtValue(); - unsigned VectorWidth = Vec->getType()->getVectorNumElements(); - - // If this is extracting an invalid index, turn this into undef, to avoid - // crashing the code below.
- if (IndexVal >= VectorWidth) - return UndefValue::get(Vec->getType()->getVectorElementType()); - - if (Value *Elt = findScalarElement(Vec, IndexVal)) + if (auto *IdxC = dyn_cast<ConstantInt>(Idx)) + if (Value *Elt = findScalarElement(Vec, IdxC->getZExtValue())) return Elt; - } return nullptr; } diff --git a/lib/Analysis/PHITransAddr.cpp b/lib/Analysis/PHITransAddr.cpp index 8d80c6028ba3..f7545ea05a39 100644 --- a/lib/Analysis/PHITransAddr.cpp +++ b/lib/Analysis/PHITransAddr.cpp @@ -374,9 +374,10 @@ InsertPHITranslatedSubExpr(Value *InVal, BasicBlock *CurBB, if (!Tmp.PHITranslateValue(CurBB, PredBB, &DT, /*MustDominate=*/true)) return Tmp.getAddr(); - // If we don't have an available version of this value, it must be an - // instruction. - Instruction *Inst = cast<Instruction>(InVal); + // We don't need to PHI translate values which aren't instructions. + auto *Inst = dyn_cast<Instruction>(InVal); + if (!Inst) + return nullptr; // Handle cast of PHI translatable value. if (CastInst *Cast = dyn_cast<CastInst>(Inst)) { diff --git a/lib/Analysis/VectorUtils.cpp b/lib/Analysis/VectorUtils.cpp index 67f68dc8391e..8c671ef0ef0e 100644 --- a/lib/Analysis/VectorUtils.cpp +++ b/lib/Analysis/VectorUtils.cpp @@ -402,8 +402,9 @@ llvm::Value *llvm::findScalarElement(llvm::Value *V, unsigned EltNo) { if (match(V, llvm::PatternMatch::m_Add(llvm::PatternMatch::m_Value(Val), llvm::PatternMatch::m_Constant(Con)))) { - if (Con->getAggregateElement(EltNo)->isNullValue()) - return findScalarElement(Val, EltNo); + if (Constant *Elt = Con->getAggregateElement(EltNo)) + if (Elt->isNullValue()) + return findScalarElement(Val, EltNo); } // Otherwise, we don't know. diff --git a/lib/CodeGen/ExecutionDepsFix.cpp b/lib/CodeGen/ExecutionDepsFix.cpp index 201f9c150083..5b09cf1a0fd7 100644 --- a/lib/CodeGen/ExecutionDepsFix.cpp +++ b/lib/CodeGen/ExecutionDepsFix.cpp @@ -733,14 +733,12 @@ bool ExeDepsFix::runOnMachineFunction(MachineFunction &mf) { // If no relevant registers are used in the function, we can skip it // completely. bool anyregs = false; - const MachineRegisterInfo &MRI = mf.getRegInfo(); for (TargetRegisterClass::const_iterator I = RC->begin(), E = RC->end(); - I != E && !anyregs; ++I) - for (MCRegAliasIterator AI(*I, TRI, true); AI.isValid(); ++AI) - if (!MRI.reg_nodbg_empty(*AI)) { - anyregs = true; - break; - } + I != E; ++I) + if (MF->getRegInfo().isPhysRegUsed(*I)) { + anyregs = true; + break; + } if (!anyregs) return false; // Initialize the AliasMap on the first use. 
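// Condensed sketch of the used-register scan ExecutionDepsFix switches to
// above (illustrative only; MRI and RC come from the MachineFunction, and
// the query is meaningful only during and after register allocation):
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/Target/TargetRegisterInfo.h"

static bool anyRegInClassUsed(const llvm::MachineRegisterInfo &MRI,
                              const llvm::TargetRegisterClass *RC) {
  for (unsigned Reg : *RC)      // physical registers in the class
    if (MRI.isPhysRegUsed(Reg)) // tests register units plus call-clobber mask
      return true;
  return false;
}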
diff --git a/lib/CodeGen/LiveRegMatrix.cpp b/lib/CodeGen/LiveRegMatrix.cpp index 000151acd735..9ea031d38d29 100644 --- a/lib/CodeGen/LiveRegMatrix.cpp +++ b/lib/CodeGen/LiveRegMatrix.cpp @@ -15,12 +15,12 @@ #include "RegisterCoalescer.h" #include "llvm/ADT/Statistic.h" #include "llvm/CodeGen/LiveIntervalAnalysis.h" +#include "llvm/CodeGen/MachineRegisterInfo.h" #include "llvm/CodeGen/VirtRegMap.h" #include "llvm/Support/Debug.h" #include "llvm/Support/Format.h" #include "llvm/Support/raw_ostream.h" #include "llvm/Target/TargetRegisterInfo.h" -#include "llvm/Target/TargetSubtargetInfo.h" using namespace llvm; @@ -49,6 +49,7 @@ void LiveRegMatrix::getAnalysisUsage(AnalysisUsage &AU) const { bool LiveRegMatrix::runOnMachineFunction(MachineFunction &MF) { TRI = MF.getSubtarget().getRegisterInfo(); + MRI = &MF.getRegInfo(); LIS = &getAnalysis<LiveIntervals>(); VRM = &getAnalysis<VirtRegMap>(); @@ -100,6 +101,7 @@ void LiveRegMatrix::assign(LiveInterval &VirtReg, unsigned PhysReg) { << " to " << PrintReg(PhysReg, TRI) << ':'); assert(!VRM->hasPhys(VirtReg.reg) && "Duplicate VirtReg assignment"); VRM->assignVirt2Phys(VirtReg.reg, PhysReg); + MRI->setPhysRegUsed(PhysReg); foreachUnit(TRI, VirtReg, PhysReg, [&](unsigned Unit, const LiveRange &Range) { diff --git a/lib/CodeGen/MachineRegisterInfo.cpp b/lib/CodeGen/MachineRegisterInfo.cpp index 5984af87a184..e883ce523134 100644 --- a/lib/CodeGen/MachineRegisterInfo.cpp +++ b/lib/CodeGen/MachineRegisterInfo.cpp @@ -29,6 +29,7 @@ MachineRegisterInfo::MachineRegisterInfo(const MachineFunction *MF) TracksSubRegLiveness(false) { VRegInfo.reserve(256); RegAllocHints.reserve(256); + UsedRegUnits.resize(getTargetRegisterInfo()->getNumRegUnits()); UsedPhysRegMask.resize(getTargetRegisterInfo()->getNumRegs()); // Create the physreg use/def lists. diff --git a/lib/CodeGen/MachineTraceMetrics.cpp b/lib/CodeGen/MachineTraceMetrics.cpp index 9404c687d410..d9a6b68462eb 100644 --- a/lib/CodeGen/MachineTraceMetrics.cpp +++ b/lib/CodeGen/MachineTraceMetrics.cpp @@ -624,6 +624,10 @@ struct DataDep { static bool getDataDeps(const MachineInstr *UseMI, SmallVectorImpl<DataDep> &Deps, const MachineRegisterInfo *MRI) { + // Debug values should not be included in any calculations. + if (UseMI->isDebugValue()) + return false; + bool HasPhysRegs = false; for (MachineInstr::const_mop_iterator I = UseMI->operands_begin(), E = UseMI->operands_end(); I != E; ++I) { diff --git a/lib/CodeGen/PrologEpilogInserter.cpp b/lib/CodeGen/PrologEpilogInserter.cpp index b2fdee6c8e4c..6ca69a124297 100644 --- a/lib/CodeGen/PrologEpilogInserter.cpp +++ b/lib/CodeGen/PrologEpilogInserter.cpp @@ -1026,8 +1026,12 @@ PEI::scavengeFrameVirtualRegs(MachineFunction &Fn) { // Replace this reference to the virtual register with the // scratch register. assert (ScratchReg && "Missing scratch register!"); + MachineRegisterInfo &MRI = Fn.getRegInfo(); Fn.getRegInfo().replaceRegWith(Reg, ScratchReg); + // Make sure MRI now accounts this register as used. + MRI.setPhysRegUsed(ScratchReg); + // Because this instruction was processed by the RS before this // register was allocated, make sure that the RS now records the // register as being used. 
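// Toy illustration (plain C++, not LLVM's real data structures) of why these
// hunks track register *units*: an aliasing pair such as XMM7/YMM7 shares a
// unit, so marking the wider register used lets a query on the narrower one
// succeed. The unit numbers below are invented for the example.
#include <bitset>
#include <vector>

struct UnitTracker {
  std::bitset<64> UsedUnits;
  void markUsed(const std::vector<int> &RegUnits) { // cf. setPhysRegUsed
    for (int U : RegUnits)
      UsedUnits.set(U);
  }
  bool isUsed(const std::vector<int> &RegUnits) const { // cf. isPhysRegUsed
    for (int U : RegUnits)
      if (UsedUnits.test(U))
        return true;
    return false;
  }
};

int main() {
  UnitTracker T;
  T.markUsed({7, 23});          // "YMM7" covers units {7, 23}
  return T.isUsed({7}) ? 0 : 1; // "XMM7" is unit {7}: reported as used
}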
diff --git a/lib/CodeGen/RegAllocFast.cpp b/lib/CodeGen/RegAllocFast.cpp index 660bb4f0e34d..fd3d4d78968b 100644 --- a/lib/CodeGen/RegAllocFast.cpp +++ b/lib/CodeGen/RegAllocFast.cpp @@ -986,6 +986,10 @@ void RAFast::AllocateBasicBlock() { } } + for (UsedInInstrSet::iterator + I = UsedInInstr.begin(), E = UsedInInstr.end(); I != E; ++I) + MRI->setRegUnitUsed(*I); + // Track registers defined by instruction - early clobbers and tied uses at // this point. UsedInInstr.clear(); @@ -1046,6 +1050,10 @@ void RAFast::AllocateBasicBlock() { killVirtReg(VirtDead[i]); VirtDead.clear(); + for (UsedInInstrSet::iterator + I = UsedInInstr.begin(), E = UsedInInstr.end(); I != E; ++I) + MRI->setRegUnitUsed(*I); + if (CopyDst && CopyDst == CopySrc && CopyDstSub == CopySrcSub) { DEBUG(dbgs() << "-- coalescing: " << *MI); Coalesced.push_back(MI); @@ -1095,6 +1103,12 @@ bool RAFast::runOnMachineFunction(MachineFunction &Fn) { AllocateBasicBlock(); } + // Add the clobber lists for all the instructions we skipped earlier. + for (const MCInstrDesc *Desc : SkippedInstrs) + if (const uint16_t *Defs = Desc->getImplicitDefs()) + while (*Defs) + MRI->setPhysRegUsed(*Defs++); + // All machine operands and other references to virtual registers have been // replaced. Remove the virtual registers. MRI->clearVirtRegs(); diff --git a/lib/CodeGen/RegisterCoalescer.cpp b/lib/CodeGen/RegisterCoalescer.cpp index 7afea2a4f6ff..c911b9b47ea2 100644 --- a/lib/CodeGen/RegisterCoalescer.cpp +++ b/lib/CodeGen/RegisterCoalescer.cpp @@ -1531,6 +1531,14 @@ bool RegisterCoalescer::joinReservedPhysReg(CoalescerPair &CP) { DEBUG(dbgs() << "\t\tInterference (read): " << *MI); return false; } + + // We must also check for clobbers caused by regmasks. + for (const auto &MO : MI->operands()) { + if (MO.isRegMask() && MO.clobbersPhysReg(DstReg)) { + DEBUG(dbgs() << "\t\tInterference (regmask clobber): " << *MI); + return false; + } + } } // We're going to remove the copy which defines a physical reserved diff --git a/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/lib/CodeGen/SelectionDAG/DAGCombiner.cpp index 52d620b1d540..3b29306bb54a 100644 --- a/lib/CodeGen/SelectionDAG/DAGCombiner.cpp +++ b/lib/CodeGen/SelectionDAG/DAGCombiner.cpp @@ -8365,12 +8365,12 @@ SDValue DAGCombiner::visitFDIV(SDNode *N) { if (N0CFP && N0CFP->isExactlyValue(1.0)) return SDValue(); - SmallVector<SDNode *, 4> Users; // Find all FDIV users of the same divisor. - for (auto *U : N1->uses()) { + // Use a set because duplicates may be present in the user list. 
+ SetVector<SDNode *> Users; + for (auto *U : N1->uses()) if (U->getOpcode() == ISD::FDIV && U->getOperand(1) == N1) - Users.push_back(U); - } + Users.insert(U); if (TLI.combineRepeatedFPDivisors(Users.size())) { SDValue FPOne = DAG.getConstantFP(1.0, DL, VT); diff --git a/lib/CodeGen/VirtRegMap.cpp b/lib/CodeGen/VirtRegMap.cpp index 02341b4d66b8..2912bdd63426 100644 --- a/lib/CodeGen/VirtRegMap.cpp +++ b/lib/CodeGen/VirtRegMap.cpp @@ -163,6 +163,7 @@ class VirtRegRewriter : public MachineFunctionPass { SlotIndexes *Indexes; LiveIntervals *LIS; VirtRegMap *VRM; + SparseSet<unsigned> PhysRegs; void rewrite(); void addMBBLiveIns(); @@ -318,15 +319,54 @@ void VirtRegRewriter::rewrite() { SmallVector<unsigned, 8> SuperDeads; SmallVector<unsigned, 8> SuperDefs; SmallVector<unsigned, 8> SuperKills; + SmallPtrSet<const MachineInstr *, 4> NoReturnInsts; + + // Here we have a SparseSet to hold which PhysRegs are actually encountered + // in the MF we are about to iterate over so that later when we call + // setPhysRegUsed, we are only doing it for physRegs that were actually found + // in the program and not for all of the possible physRegs for the given + // target architecture. If the target has a lot of physRegs, then for a small + // program there will be a significant compile time reduction here. + PhysRegs.clear(); + PhysRegs.setUniverse(TRI->getNumRegs()); + + // A function with the uwtable attribute should guarantee that the stack + // unwinder can unwind the stack to the previous frame. Thus, we can't apply + // the noreturn optimization if the caller function has the uwtable attribute. + bool HasUWTable = MF->getFunction()->hasFnAttribute(Attribute::UWTable); for (MachineFunction::iterator MBBI = MF->begin(), MBBE = MF->end(); MBBI != MBBE; ++MBBI) { DEBUG(MBBI->print(dbgs(), Indexes)); + bool IsExitBB = MBBI->succ_empty(); for (MachineBasicBlock::instr_iterator MII = MBBI->instr_begin(), MIE = MBBI->instr_end(); MII != MIE;) { MachineInstr *MI = MII; ++MII; + // Check if this instruction is a call to a noreturn function. If this + // is a call to a noreturn function and we don't need the stack unwinding + // functionality (i.e. this function does not have the uwtable attribute + // and the callee function has the nounwind attribute), then we can ignore + // the definitions set by this instruction. + if (!HasUWTable && IsExitBB && MI->isCall()) { + for (MachineInstr::mop_iterator MOI = MI->operands_begin(), + MOE = MI->operands_end(); MOI != MOE; ++MOI) { + MachineOperand &MO = *MOI; + if (!MO.isGlobal()) + continue; + const Function *Func = dyn_cast<Function>(MO.getGlobal()); + if (!Func || !Func->hasFnAttribute(Attribute::NoReturn) || + // We need to keep correct unwind information + // even if the function will not return, since the + // runtime may need it. + !Func->hasFnAttribute(Attribute::NoUnwind)) + continue; + NoReturnInsts.insert(MI); + break; + } + } + for (MachineInstr::mop_iterator MOI = MI->operands_begin(), MOE = MI->operands_end(); MOI != MOE; ++MOI) { MachineOperand &MO = *MOI; @@ -335,6 +375,15 @@ void VirtRegRewriter::rewrite() { if (MO.isRegMask()) MRI->addPhysRegsUsedFromRegMask(MO.getRegMask()); + // If we encounter a VirtReg or PhysReg then get at the PhysReg and add + // it to the physreg bitset. Later we use only the PhysRegs that were + // actually encountered in the MF to populate the MRI's used physregs. + if (MO.isReg() && MO.getReg()) + PhysRegs.insert( + TargetRegisterInfo::isVirtualRegister(MO.getReg()) ?
+ VRM->getPhys(MO.getReg()) : + MO.getReg()); + if (!MO.isReg() || !TargetRegisterInfo::isVirtualRegister(MO.getReg())) continue; unsigned VirtReg = MO.getReg(); @@ -421,5 +470,29 @@ void VirtRegRewriter::rewrite() { } } } + + // Tell MRI about physical registers in use. + if (NoReturnInsts.empty()) { + for (SparseSet<unsigned>::iterator + RegI = PhysRegs.begin(), E = PhysRegs.end(); RegI != E; ++RegI) + if (!MRI->reg_nodbg_empty(*RegI)) + MRI->setPhysRegUsed(*RegI); + } else { + for (SparseSet<unsigned>::iterator + I = PhysRegs.begin(), E = PhysRegs.end(); I != E; ++I) { + unsigned Reg = *I; + if (MRI->reg_nodbg_empty(Reg)) + continue; + // Check if this register has a use that will impact the rest of the + // code. Uses in debug and noreturn instructions do not impact the + // generated code. + for (MachineInstr &It : MRI->reg_nodbg_instructions(Reg)) { + if (!NoReturnInsts.count(&It)) { + MRI->setPhysRegUsed(Reg); + break; + } + } + } + } } diff --git a/lib/ExecutionEngine/ExecutionEngine.cpp b/lib/ExecutionEngine/ExecutionEngine.cpp index c2ff8e27af47..67a1ca67e2f3 100644 --- a/lib/ExecutionEngine/ExecutionEngine.cpp +++ b/lib/ExecutionEngine/ExecutionEngine.cpp @@ -180,10 +180,17 @@ uint64_t ExecutionEngineState::RemoveMapping(StringRef Name) { } std::string ExecutionEngine::getMangledName(const GlobalValue *GV) { + assert(GV->hasName() && "Global must have name."); + MutexGuard locked(lock); - Mangler Mang; SmallString<128> FullName; - Mang.getNameWithPrefix(FullName, GV, false); + + const DataLayout &DL = + GV->getParent()->getDataLayout().isDefault() + ? *getDataLayout() + : GV->getParent()->getDataLayout(); + + Mangler::getNameWithPrefix(FullName, GV->getName(), DL); return FullName.str(); } diff --git a/lib/ExecutionEngine/MCJIT/MCJIT.cpp b/lib/ExecutionEngine/MCJIT/MCJIT.cpp index a7d67050c7a6..f6944eea2e78 100644 --- a/lib/ExecutionEngine/MCJIT/MCJIT.cpp +++ b/lib/ExecutionEngine/MCJIT/MCJIT.cpp @@ -266,6 +266,12 @@ void MCJIT::finalizeModule(Module *M) { RuntimeDyld::SymbolInfo MCJIT::findExistingSymbol(const std::string &Name) { SmallString<128> FullName; Mangler::getNameWithPrefix(FullName, Name, *TM->getDataLayout()); + + if (void *Addr = getPointerToGlobalIfAvailable(FullName)) + return RuntimeDyld::SymbolInfo(static_cast<uint64_t>( + reinterpret_cast<uintptr_t>(Addr)), + JITSymbolFlags::Exported); + return Dyld.getSymbol(FullName); } diff --git a/lib/ExecutionEngine/RuntimeDyld/RTDyldMemoryManager.cpp b/lib/ExecutionEngine/RuntimeDyld/RTDyldMemoryManager.cpp index 044eee43c9e7..ecd99004bade 100644 --- a/lib/ExecutionEngine/RuntimeDyld/RTDyldMemoryManager.cpp +++ b/lib/ExecutionEngine/RuntimeDyld/RTDyldMemoryManager.cpp @@ -98,7 +98,7 @@ void RTDyldMemoryManager::registerEHFrames(uint8_t *Addr, uint64_t LoadAddr, size_t Size) { // On OS X, __register_frame takes a single FDE as an argument.
- // See http://lists.cs.uiuc.edu/pipermail/llvmdev/2013-April/061768.html + // See http://lists.llvm.org/pipermail/llvm-dev/2013-April/061768.html const char *P = (const char *)Addr; const char *End = P + Size; do { diff --git a/lib/IR/Type.cpp b/lib/IR/Type.cpp index b5c4e5d4c6d5..a9ca80034ca7 100644 --- a/lib/IR/Type.cpp +++ b/lib/IR/Type.cpp @@ -613,6 +613,9 @@ bool StructType::isLayoutIdentical(StructType *Other) const { if (isPacked() != Other->isPacked() || getNumElements() != Other->getNumElements()) return false; + + if (!getNumElements()) + return true; return std::equal(element_begin(), element_end(), Other->element_begin()); } diff --git a/lib/Support/MemoryBuffer.cpp b/lib/Support/MemoryBuffer.cpp index 98862e96b749..d09ef3a4c0bc 100644 --- a/lib/Support/MemoryBuffer.cpp +++ b/lib/Support/MemoryBuffer.cpp @@ -57,7 +57,8 @@ void MemoryBuffer::init(const char *BufStart, const char *BufEnd, /// CopyStringRef - Copies contents of a StringRef into a block of memory and /// null-terminates it. static void CopyStringRef(char *Memory, StringRef Data) { - memcpy(Memory, Data.data(), Data.size()); + if (!Data.empty()) + memcpy(Memory, Data.data(), Data.size()); Memory[Data.size()] = 0; // Null terminate string. } diff --git a/lib/Target/AArch64/AArch64A57FPLoadBalancing.cpp b/lib/Target/AArch64/AArch64A57FPLoadBalancing.cpp index 79a84ad8c6c5..9d6dbd641a16 100644 --- a/lib/Target/AArch64/AArch64A57FPLoadBalancing.cpp +++ b/lib/Target/AArch64/AArch64A57FPLoadBalancing.cpp @@ -593,6 +593,7 @@ bool AArch64A57FPLoadBalancing::colorChain(Chain *G, Color C, if (Change) { Substs[MO.getReg()] = Reg; MO.setReg(Reg); + MRI->setPhysRegUsed(Reg); Changed = true; } diff --git a/lib/Target/AArch64/AArch64FrameLowering.cpp b/lib/Target/AArch64/AArch64FrameLowering.cpp index a7817f4f67dd..a76473f7e539 100644 --- a/lib/Target/AArch64/AArch64FrameLowering.cpp +++ b/lib/Target/AArch64/AArch64FrameLowering.cpp @@ -354,6 +354,7 @@ void AArch64FrameLowering::emitPrologue(MachineFunction &MF, if (NumBytes && NeedsRealignment) { // Use the first callee-saved register as a scratch register. scratchSPReg = AArch64::X9; + MF.getRegInfo().setPhysRegUsed(scratchSPReg); } // If we're a leaf function, try using the red zone. diff --git a/lib/Target/AMDGPU/AMDGPU.td b/lib/Target/AMDGPU/AMDGPU.td index ef8ef6268548..68b50504ee44 100644 --- a/lib/Target/AMDGPU/AMDGPU.td +++ b/lib/Target/AMDGPU/AMDGPU.td @@ -123,6 +123,11 @@ def FeatureSGPRInitBug : SubtargetFeature<"sgpr-init-bug", "true", "VI SGPR initialization bug requiring a fixed SGPR allocation size">; +def FeatureEnableHugeScratchBuffer : SubtargetFeature<"huge-scratch-buffer", + "EnableHugeScratchBuffer", + "true", + "Enable scratch buffer sizes greater than 128 GB">; + class SubtargetFeatureFetchLimit <string Value> : SubtargetFeature <"fetch"#Value, "TexVTXClauseSize", diff --git a/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp b/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp index 37b77d778d9f..64c54ccb31ff 100644 --- a/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp +++ b/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp @@ -1029,6 +1029,10 @@ bool AMDGPUDAGToDAGISel::SelectMUBUFAddr64(SDValue Addr, SDValue &SRsrc, SDValue &SLC, SDValue &TFE) const { SDValue Ptr, Offen, Idxen, Addr64; + // addr64 bit was removed for volcanic islands.
+ if (Subtarget->getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS) + return false; + SelectMUBUF(Addr, Ptr, VAddr, SOffset, Offset, Offen, Idxen, Addr64, GLC, SLC, TFE); @@ -1095,13 +1099,16 @@ bool AMDGPUDAGToDAGISel::SelectMUBUFScratch(SDValue Addr, SDValue &Rsrc, // (add n0, c1) if (CurDAG->isBaseWithConstantOffset(Addr)) { + SDValue N0 = Addr.getOperand(0); SDValue N1 = Addr.getOperand(1); - ConstantSDNode *C1 = cast<ConstantSDNode>(N1); - - if (isLegalMUBUFImmOffset(C1)) { - VAddr = Addr.getOperand(0); - ImmOffset = CurDAG->getTargetConstant(C1->getZExtValue(), DL, MVT::i16); - return true; + // Offsets in vaddr must be positive. + if (CurDAG->SignBitIsZero(N0)) { + ConstantSDNode *C1 = cast<ConstantSDNode>(N1); + if (isLegalMUBUFImmOffset(C1)) { + VAddr = N0; + ImmOffset = CurDAG->getTargetConstant(C1->getZExtValue(), DL, MVT::i16); + return true; + } } } diff --git a/lib/Target/AMDGPU/AMDGPUSubtarget.cpp b/lib/Target/AMDGPU/AMDGPUSubtarget.cpp index bd5abc4f546e..5f32a65c9338 100644 --- a/lib/Target/AMDGPU/AMDGPUSubtarget.cpp +++ b/lib/Target/AMDGPU/AMDGPUSubtarget.cpp @@ -73,7 +73,7 @@ AMDGPUSubtarget::AMDGPUSubtarget(const Triple &TT, StringRef GPU, StringRef FS, WavefrontSize(0), CFALUBug(false), LocalMemorySize(0), EnableVGPRSpilling(false), SGPRInitBug(false), IsGCN(false), GCN1Encoding(false), GCN3Encoding(false), CIInsts(false), LDSBankCount(0), - IsaVersion(ISAVersion0_0_0), + IsaVersion(ISAVersion0_0_0), EnableHugeScratchBuffer(false), FrameLowering(TargetFrameLowering::StackGrowsUp, 64 * 16, // Maximum stack alignment (long16) 0), diff --git a/lib/Target/AMDGPU/AMDGPUSubtarget.h b/lib/Target/AMDGPU/AMDGPUSubtarget.h index 90831bfb4458..735f01dfa7c5 100644 --- a/lib/Target/AMDGPU/AMDGPUSubtarget.h +++ b/lib/Target/AMDGPU/AMDGPUSubtarget.h @@ -89,6 +89,7 @@ private: bool FeatureDisable; int LDSBankCount; unsigned IsaVersion; + bool EnableHugeScratchBuffer; AMDGPUFrameLowering FrameLowering; std::unique_ptr<AMDGPUTargetLowering> TLInfo; @@ -271,6 +272,10 @@ public: return DevName; } + bool enableHugeScratchBuffer() const { + return EnableHugeScratchBuffer; + } + bool dumpCode() const { return DumpCode; } diff --git a/lib/Target/AMDGPU/AMDILCFGStructurizer.cpp b/lib/Target/AMDGPU/AMDILCFGStructurizer.cpp index c9b25a1a0b84..d918ac3a5b3b 100644 --- a/lib/Target/AMDGPU/AMDILCFGStructurizer.cpp +++ b/lib/Target/AMDGPU/AMDILCFGStructurizer.cpp @@ -1719,7 +1719,6 @@ MachineBasicBlock * AMDGPUCFGStructurizer::normalizeInfiniteLoopExit(MachineLoop* LoopRep) { MachineBasicBlock *LoopHeader = LoopRep->getHeader(); MachineBasicBlock *LoopLatch = LoopRep->getLoopLatch(); - const TargetRegisterClass * I32RC = TRI->getCFGStructurizerRegClass(MVT::i32); if (!LoopHeader || !LoopLatch) return nullptr; @@ -1732,18 +1731,9 @@ AMDGPUCFGStructurizer::normalizeInfiniteLoopExit(MachineLoop* LoopRep) { FuncRep->push_back(DummyExitBlk); //insert to function SHOWNEWBLK(DummyExitBlk, "DummyExitBlock to normalize infiniteLoop: "); DEBUG(dbgs() << "Old branch instr: " << *BranchMI << "\n";); - MachineBasicBlock::iterator I = BranchMI; - unsigned ImmReg = FuncRep->getRegInfo().createVirtualRegister(I32RC); - llvm_unreachable("Extra register needed to handle CFG"); - MachineInstr *NewMI = insertInstrBefore(I, AMDGPU::BRANCH_COND_i32); - MachineInstrBuilder MIB(*FuncRep, NewMI); - MIB.addMBB(LoopHeader); - MIB.addReg(ImmReg, false); - SHOWNEWINSTR(NewMI); - BranchMI->eraseFromParent(); - LoopLatch->addSuccessor(DummyExitBlk); - - return DummyExitBlk; + LLVMContext &Ctx = 
LoopHeader->getParent()->getFunction()->getContext(); + Ctx.emitError("Extra register needed to handle CFG"); + return nullptr; } void AMDGPUCFGStructurizer::removeUnconditionalBranch(MachineBasicBlock *MBB) { diff --git a/lib/Target/AMDGPU/Processors.td b/lib/Target/AMDGPU/Processors.td index 69efb8b8bc43..d9a0723bedc9 100644 --- a/lib/Target/AMDGPU/Processors.td +++ b/lib/Target/AMDGPU/Processors.td @@ -138,3 +138,7 @@ def : ProcessorModel<"iceland", SIQuarterSpeedModel, def : ProcessorModel<"carrizo", SIQuarterSpeedModel, [FeatureVolcanicIslands, FeatureISAVersion8_0_1] >; + +def : ProcessorModel<"fiji", SIQuarterSpeedModel, + [FeatureVolcanicIslands, FeatureISAVersion8_0_1] +>; diff --git a/lib/Target/AMDGPU/SIISelLowering.cpp b/lib/Target/AMDGPU/SIISelLowering.cpp index dd818a9ba746..099b0b15942b 100644 --- a/lib/Target/AMDGPU/SIISelLowering.cpp +++ b/lib/Target/AMDGPU/SIISelLowering.cpp @@ -254,6 +254,12 @@ bool SITargetLowering::isShuffleMaskLegal(const SmallVectorImpl<int> &, return false; } +bool SITargetLowering::isLegalFlatAddressingMode(const AddrMode &AM) const { + // Flat instructions do not have offsets, and only have the register + // address. + return AM.BaseOffs == 0 && (AM.Scale == 0 || AM.Scale == 1); +} + bool SITargetLowering::isLegalAddressingMode(const DataLayout &DL, const AddrMode &AM, Type *Ty, unsigned AS) const { @@ -263,8 +269,21 @@ bool SITargetLowering::isLegalAddressingMode(const DataLayout &DL, switch (AS) { case AMDGPUAS::GLOBAL_ADDRESS: - case AMDGPUAS::CONSTANT_ADDRESS: // XXX - Should we assume SMRD instructions? + if (Subtarget->getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS) { + // Assume that we will use FLAT for all global memory accesses + // on VI. + // FIXME: This assumption is currently wrong. On VI we still use + // MUBUF instructions for the r + i addressing mode. As currently + // implemented, the MUBUF instructions only work on buffers < 4GB. + // It may be possible to support > 4GB buffers with MUBUF instructions, + // by setting the stride value in the resource descriptor which would + // increase the size limit to (stride * 4GB). However, this is risky, + // because it has never been validated. + return isLegalFlatAddressingMode(AM); + } + // fall-through case AMDGPUAS::PRIVATE_ADDRESS: + case AMDGPUAS::CONSTANT_ADDRESS: // XXX - Should we assume SMRD instructions? case AMDGPUAS::UNKNOWN_ADDRESS_SPACE: { // MUBUF / MTBUF instructions have a 12-bit unsigned byte offset, and // additionally can do r + r + i with addr64. 32-bit has more addressing @@ -324,11 +343,9 @@ bool SITargetLowering::isLegalAddressingMode(const DataLayout &DL, return false; } - case AMDGPUAS::FLAT_ADDRESS: { - // Flat instructions do not have offsets, and only have the register - // address. - return AM.BaseOffs == 0 && (AM.Scale == 0 || AM.Scale == 1); - } + case AMDGPUAS::FLAT_ADDRESS: + return isLegalFlatAddressingMode(AM); + default: llvm_unreachable("unhandled address space"); } @@ -812,10 +829,29 @@ static SDNode *findUser(SDValue Value, unsigned Opcode) { SDValue SITargetLowering::LowerFrameIndex(SDValue Op, SelectionDAG &DAG) const { + SDLoc SL(Op); FrameIndexSDNode *FINode = cast<FrameIndexSDNode>(Op); unsigned FrameIndex = FINode->getIndex(); - return DAG.getTargetFrameIndex(FrameIndex, MVT::i32); + // A FrameIndex node represents a 32-bit offset into scratch memory.
If + // the high bit of a frame index offset were to be set, this would mean + // that it represented an offset of ~2GB * 64 = ~128GB from the start of the + // scratch buffer, with 64 being the number of threads per wave. + // + // If we know the machine uses less than 128GB of scratch, then we can + // mark the high bit of the FrameIndex node as known zero, + // which is important, because it means in most situations we can + // prove that values derived from FrameIndex nodes are non-negative. + // This enables us to take advantage of more addressing modes when + // accessing scratch buffers, since for scratch reads/writes, the register + // offset must always be positive. + + SDValue TFI = DAG.getTargetFrameIndex(FrameIndex, MVT::i32); + if (Subtarget->enableHugeScratchBuffer()) + return TFI; + + return DAG.getNode(ISD::AssertZext, SL, MVT::i32, TFI, + DAG.getValueType(EVT::getIntegerVT(*DAG.getContext(), 31))); } /// This transforms the control flow intrinsics to get the branch destination as @@ -2034,6 +2070,13 @@ void SITargetLowering::adjustWritemask(MachineSDNode *&Node, } } +static bool isFrameIndexOp(SDValue Op) { + if (Op.getOpcode() == ISD::AssertZext) + Op = Op.getOperand(0); + + return isa<FrameIndexSDNode>(Op); +} + /// \brief Legalize target independent instructions (e.g. INSERT_SUBREG) /// with frame index operands. /// LLVM assumes that inputs to these instructions are registers. @@ -2042,7 +2085,7 @@ void SITargetLowering::legalizeTargetIndependentNode(SDNode *Node, SmallVector<SDValue, 8> Ops; for (unsigned i = 0; i < Node->getNumOperands(); ++i) { - if (!isa<FrameIndexSDNode>(Node->getOperand(i))) { + if (!isFrameIndexOp(Node->getOperand(i))) { Ops.push_back(Node->getOperand(i)); continue; } diff --git a/lib/Target/AMDGPU/SIISelLowering.h b/lib/Target/AMDGPU/SIISelLowering.h index 635b4edc89de..d84c32ec0092 100644 --- a/lib/Target/AMDGPU/SIISelLowering.h +++ b/lib/Target/AMDGPU/SIISelLowering.h @@ -56,6 +56,7 @@ class SITargetLowering : public AMDGPUTargetLowering { SDValue performMin3Max3Combine(SDNode *N, DAGCombinerInfo &DCI) const; SDValue performSetCCCombine(SDNode *N, DAGCombinerInfo &DCI) const; + bool isLegalFlatAddressingMode(const AddrMode &AM) const; public: SITargetLowering(TargetMachine &tm, const AMDGPUSubtarget &STI); diff --git a/lib/Target/AMDGPU/SIInstrInfo.td b/lib/Target/AMDGPU/SIInstrInfo.td index b39a78714640..8d8110bca4c5 100644 --- a/lib/Target/AMDGPU/SIInstrInfo.td +++ b/lib/Target/AMDGPU/SIInstrInfo.td @@ -1600,12 +1600,14 @@ multiclass VOPC_m <vopc op, dag outs, dag ins, string asm, list<dag> pattern, SIMCInstr <opName#"_e32", SISubtarget.SI> { let Defs = !if(DefExec, [EXEC], []); let hasSideEffects = DefExec; + let AssemblerPredicates = [isSICI]; } def _vi : VOPC<op.VI, ins, asm, []>, SIMCInstr <opName#"_e32", SISubtarget.VI> { let Defs = !if(DefExec, [EXEC], []); let hasSideEffects = DefExec; + let AssemblerPredicates = [isVI]; } } diff --git a/lib/Target/AMDGPU/SIInstructions.td b/lib/Target/AMDGPU/SIInstructions.td index 1ee63c675822..f78ffd72314c 100644 --- a/lib/Target/AMDGPU/SIInstructions.td +++ b/lib/Target/AMDGPU/SIInstructions.td @@ -2910,9 +2910,6 @@ defm : MUBUFLoad_Pattern <BUFFER_LOAD_SBYTE_ADDR64, i32, sextloadi8_constant>; defm : MUBUFLoad_Pattern <BUFFER_LOAD_UBYTE_ADDR64, i32, az_extloadi8_constant>; defm : MUBUFLoad_Pattern <BUFFER_LOAD_SSHORT_ADDR64, i32, sextloadi16_constant>; defm : MUBUFLoad_Pattern <BUFFER_LOAD_USHORT_ADDR64, i32, az_extloadi16_constant>; -defm : MUBUFLoad_Pattern <BUFFER_LOAD_DWORD_ADDR64, i32,
constant_load>; -defm : MUBUFLoad_Pattern <BUFFER_LOAD_DWORDX2_ADDR64, v2i32, constant_load>; -defm : MUBUFLoad_Pattern <BUFFER_LOAD_DWORDX4_ADDR64, v4i32, constant_load>; } // End Predicates = [isSICI] class MUBUFScratchLoadPat <MUBUF Instr, ValueType vt, PatFrag ld> : Pat < @@ -3273,13 +3270,13 @@ def : Pat < (f64 (fadd (f64 (VOP3Mods f64:$x, i32:$mods)), (f64 (fneg (f64 (ffloor (f64 (VOP3Mods f64:$x, i32:$mods)))))))), (V_CNDMASK_B64_PSEUDO - $x, (V_MIN_F64 SRCMODS.NONE, (V_FRACT_F64_e64 $mods, $x, DSTCLAMP.NONE, DSTOMOD.NONE), SRCMODS.NONE, (V_MOV_B64_PSEUDO 0x3fefffffffffffff), DSTCLAMP.NONE, DSTOMOD.NONE), + $x, (V_CMP_CLASS_F64_e64 SRCMODS.NONE, $x, 3/*NaN*/)) >; @@ -3291,13 +3288,13 @@ def : Pat < $x, SRCMODS.NEG, (V_CNDMASK_B64_PSEUDO - $x, (V_MIN_F64 SRCMODS.NONE, (V_FRACT_F64_e64 $mods, $x, DSTCLAMP.NONE, DSTOMOD.NONE), SRCMODS.NONE, (V_MOV_B64_PSEUDO 0x3fefffffffffffff), DSTCLAMP.NONE, DSTOMOD.NONE), + $x, (V_CMP_CLASS_F64_e64 SRCMODS.NONE, $x, 3/*NaN*/)), DSTCLAMP.NONE, DSTOMOD.NONE) >; diff --git a/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp b/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp index d23b92edef33..587ea63d6796 100644 --- a/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp +++ b/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp @@ -53,6 +53,7 @@ SIMachineFunctionInfo::SpilledReg SIMachineFunctionInfo::getSpilledReg( if (!LaneVGPRs.count(LaneVGPRIdx)) { unsigned LaneVGPR = TRI->findUnusedRegister(MRI, &AMDGPU::VGPR_32RegClass); LaneVGPRs[LaneVGPRIdx] = LaneVGPR; + MRI.setPhysRegUsed(LaneVGPR); // Add this register as live-in to all blocks to avoid machine verifier // complaining about use of an undefined physical register. diff --git a/lib/Target/AMDGPU/SIPrepareScratchRegs.cpp b/lib/Target/AMDGPU/SIPrepareScratchRegs.cpp index b086d2ed6652..0a7f684552f0 100644 --- a/lib/Target/AMDGPU/SIPrepareScratchRegs.cpp +++ b/lib/Target/AMDGPU/SIPrepareScratchRegs.cpp @@ -91,6 +91,7 @@ bool SIPrepareScratchRegs::runOnMachineFunction(MachineFunction &MF) { if (ScratchOffsetReg != AMDGPU::NoRegister) { // Found an SGPR to use + MRI.setPhysRegUsed(ScratchOffsetReg); BuildMI(*Entry, I, DL, TII->get(AMDGPU::S_MOV_B32), ScratchOffsetReg) .addReg(ScratchOffsetPreloadReg); } else { diff --git a/lib/Target/AMDGPU/SIRegisterInfo.cpp b/lib/Target/AMDGPU/SIRegisterInfo.cpp index ce4acafac9fa..54c4d549fac7 100644 --- a/lib/Target/AMDGPU/SIRegisterInfo.cpp +++ b/lib/Target/AMDGPU/SIRegisterInfo.cpp @@ -348,7 +348,8 @@ const TargetRegisterClass *SIRegisterInfo::getPhysRegClass(unsigned Reg) const { &AMDGPU::SReg_128RegClass, &AMDGPU::VReg_256RegClass, &AMDGPU::SReg_256RegClass, - &AMDGPU::VReg_512RegClass + &AMDGPU::VReg_512RegClass, + &AMDGPU::SReg_512RegClass }; for (const TargetRegisterClass *BaseClass : BaseClasses) { @@ -499,7 +500,7 @@ unsigned SIRegisterInfo::findUnusedRegister(const MachineRegisterInfo &MRI, for (TargetRegisterClass::iterator I = RC->begin(), E = RC->end(); I != E; ++I) { - if (MRI.reg_nodbg_empty(*I)) + if (!MRI.isPhysRegUsed(*I)) return *I; } return AMDGPU::NoRegister; diff --git a/lib/Target/AMDGPU/VIInstructions.td b/lib/Target/AMDGPU/VIInstructions.td index 5bf86e649ce0..aca46732adb9 100644 --- a/lib/Target/AMDGPU/VIInstructions.td +++ b/lib/Target/AMDGPU/VIInstructions.td @@ -103,4 +103,46 @@ def : Pat < (S_BUFFER_LOAD_DWORD_IMM $sbase, (as_i32imm $offset)) >; +// Patterns for global loads with no offset +class FlatLoadPat <FLAT inst, SDPatternOperator node, ValueType vt> : Pat < + (vt (node i64:$addr)), + (inst $addr, 0, 0, 0) +>; + +def : FlatLoadPat <FLAT_LOAD_UBYTE,
az_extloadi8_global, i32>; +def : FlatLoadPat <FLAT_LOAD_SBYTE, sextloadi8_global, i32>; +def : FlatLoadPat <FLAT_LOAD_USHORT, az_extloadi16_global, i32>; +def : FlatLoadPat <FLAT_LOAD_SSHORT, sextloadi16_global, i32>; +def : FlatLoadPat <FLAT_LOAD_DWORD, global_load, i32>; +def : FlatLoadPat <FLAT_LOAD_DWORDX2, global_load, v2i32>; +def : FlatLoadPat <FLAT_LOAD_DWORDX4, global_load, v4i32>; + +class FlatStorePat <FLAT inst, SDPatternOperator node, ValueType vt> : Pat < + (node vt:$data, i64:$addr), + (inst $data, $addr, 0, 0, 0) +>; + +def : FlatStorePat <FLAT_STORE_BYTE, truncstorei8_global, i32>; +def : FlatStorePat <FLAT_STORE_SHORT, truncstorei16_global, i32>; +def : FlatStorePat <FLAT_STORE_DWORD, global_store, i32>; +def : FlatStorePat <FLAT_STORE_DWORDX2, global_store, v2i32>; +def : FlatStorePat <FLAT_STORE_DWORDX4, global_store, v4i32>; + +class FlatAtomicPat <FLAT inst, SDPatternOperator node, ValueType vt> : Pat < + (vt (node i64:$addr, vt:$data)), + (inst $addr, $data, 0, 0) +>; + +def : FlatAtomicPat <FLAT_ATOMIC_ADD_RTN, atomic_add_global, i32>; +def : FlatAtomicPat <FLAT_ATOMIC_AND_RTN, atomic_and_global, i32>; +def : FlatAtomicPat <FLAT_ATOMIC_SUB_RTN, atomic_sub_global, i32>; +def : FlatAtomicPat <FLAT_ATOMIC_SMAX_RTN, atomic_max_global, i32>; +def : FlatAtomicPat <FLAT_ATOMIC_UMAX_RTN, atomic_umax_global, i32>; +def : FlatAtomicPat <FLAT_ATOMIC_SMIN_RTN, atomic_min_global, i32>; +def : FlatAtomicPat <FLAT_ATOMIC_UMIN_RTN, atomic_umin_global, i32>; +def : FlatAtomicPat <FLAT_ATOMIC_OR_RTN, atomic_or_global, i32>; +def : FlatAtomicPat <FLAT_ATOMIC_SWAP_RTN, atomic_swap_global, i32>; +def : FlatAtomicPat <FLAT_ATOMIC_XOR_RTN, atomic_xor_global, i32>; + + } // End Predicates = [isVI] diff --git a/lib/Target/ARM/ARMISelLowering.cpp b/lib/Target/ARM/ARMISelLowering.cpp index e335784f6d87..8cc06df71633 100644 --- a/lib/Target/ARM/ARMISelLowering.cpp +++ b/lib/Target/ARM/ARMISelLowering.cpp @@ -4583,6 +4583,12 @@ static SDValue LowerVSETCC(SDValue Op, SelectionDAG &DAG) { ISD::CondCode SetCCOpcode = cast<CondCodeSDNode>(CC)->get(); SDLoc dl(Op); + if (CmpVT.getVectorElementType() == MVT::i64) + // 64-bit comparisons are not legal. We've marked SETCC as non-Custom, + // but it's possible that our operands are 64-bit but our result is 32-bit. + // Bail in this case. + return SDValue(); + if (Op1.getValueType().isFloatingPoint()) { switch (SetCCOpcode) { default: llvm_unreachable("Illegal FP comparison"); diff --git a/lib/Target/ARM/ARMLoadStoreOptimizer.cpp b/lib/Target/ARM/ARMLoadStoreOptimizer.cpp index 37352810c99f..265b86f75f1d 100644 --- a/lib/Target/ARM/ARMLoadStoreOptimizer.cpp +++ b/lib/Target/ARM/ARMLoadStoreOptimizer.cpp @@ -118,7 +118,6 @@ namespace { }; SpecificBumpPtrAllocator<MergeCandidate> Allocator; SmallVector<const MergeCandidate*,4> Candidates; - SmallVector<MachineInstr*,4> MergeBaseCandidates; void moveLiveRegsBefore(const MachineBasicBlock &MBB, MachineBasicBlock::const_iterator Before); @@ -141,7 +140,6 @@ namespace { MachineBasicBlock::iterator &MBBI); bool MergeBaseUpdateLoadStore(MachineInstr *MI); bool MergeBaseUpdateLSMultiple(MachineInstr *MI); - bool MergeBaseUpdateLSDouble(MachineInstr &MI) const; bool LoadStoreMultipleOpti(MachineBasicBlock &MBB); bool MergeReturnIntoLDM(MachineBasicBlock &MBB); }; @@ -933,6 +931,11 @@ void ARMLoadStoreOpt::FormCandidates(const MemOpQueue &MemOps) { if (STI->isSwift() && !isNotVFP && (PRegNum % 2) == 1) CanMergeToLSMulti = false; + // LDRD/STRD do not allow SP/PC. 
LDM/STM either do not support them or + // deprecate their use; LDM to PC is fine but cannot happen here. + if (PReg == ARM::SP || PReg == ARM::PC) + CanMergeToLSMulti = CanMergeToLSDouble = false; + // Merge following instructions where possible. for (unsigned I = SIndex+1; I < EIndex; ++I, ++Count) { int NewOffset = MemOps[I].Offset; break; const MachineOperand &MO = getLoadStoreRegOp(*MemOps[I].MI); unsigned Reg = MO.getReg(); - unsigned RegNum = MO.isUndef() ? UINT_MAX : TRI->getEncodingValue(Reg); + if (Reg == ARM::SP || Reg == ARM::PC) + break; // See if the current load/store may be part of a multi load/store. + unsigned RegNum = MO.isUndef() ? UINT_MAX : TRI->getEncodingValue(Reg); bool PartOfLSMulti = CanMergeToLSMulti; if (PartOfLSMulti) { - // Cannot load from SP - if (Reg == ARM::SP) - PartOfLSMulti = false; // Register numbers must be in ascending order. - else if (RegNum <= PRegNum) + if (RegNum <= PRegNum) PartOfLSMulti = false; // For VFP / NEON load/store multiples, the registers must be // consecutive and within the limit on the number of registers per @@ -993,6 +995,76 @@ void ARMLoadStoreOpt::FormCandidates(const MemOpQueue &MemOps) { } while (SIndex < EIndex); } +static bool isMatchingDecrement(MachineInstr *MI, unsigned Base, + unsigned Bytes, unsigned Limit, + ARMCC::CondCodes Pred, unsigned PredReg) { + unsigned MyPredReg = 0; + if (!MI) + return false; + + bool CheckCPSRDef = false; + switch (MI->getOpcode()) { + default: return false; + case ARM::tSUBi8: + case ARM::t2SUBri: + case ARM::SUBri: + CheckCPSRDef = true; + break; + case ARM::tSUBspi: + break; + } + + // Make sure the offset fits in 8 bits. + if (Bytes == 0 || (Limit && Bytes >= Limit)) + return false; + + unsigned Scale = (MI->getOpcode() == ARM::tSUBspi || + MI->getOpcode() == ARM::tSUBi8) ? 4 : 1; // FIXME + if (!(MI->getOperand(0).getReg() == Base && + MI->getOperand(1).getReg() == Base && + (MI->getOperand(2).getImm() * Scale) == Bytes && + getInstrPredicate(MI, MyPredReg) == Pred && + MyPredReg == PredReg)) + return false; + + return CheckCPSRDef ? !definesCPSR(MI) : true; +} + +static bool isMatchingIncrement(MachineInstr *MI, unsigned Base, + unsigned Bytes, unsigned Limit, + ARMCC::CondCodes Pred, unsigned PredReg) { + unsigned MyPredReg = 0; + if (!MI) + return false; + + bool CheckCPSRDef = false; + switch (MI->getOpcode()) { + default: return false; + case ARM::tADDi8: + case ARM::t2ADDri: + case ARM::ADDri: + CheckCPSRDef = true; + break; + case ARM::tADDspi: + break; + } + + if (Bytes == 0 || (Limit && Bytes >= Limit)) + // Make sure the offset fits in 8 bits. + return false; + + unsigned Scale = (MI->getOpcode() == ARM::tADDspi || + MI->getOpcode() == ARM::tADDi8) ? 4 : 1; // FIXME + if (!(MI->getOperand(0).getReg() == Base && + MI->getOperand(1).getReg() == Base && + (MI->getOperand(2).getImm() * Scale) == Bytes && + getInstrPredicate(MI, MyPredReg) == Pred && + MyPredReg == PredReg)) + return false; + + return CheckCPSRDef ? !definesCPSR(MI) : true; +} + static unsigned getUpdatingLSMultipleOpcode(unsigned Opc, ARM_AM::AMSubMode Mode) { switch (Opc) { @@ -1060,75 +1132,6 @@ static unsigned getUpdatingLSMultipleOpcode(unsigned Opc, } } -/// Check if the given instruction increments or decrements a register and -/// return the amount it is incremented/decremented. Returns 0 if the CPSR flags -/// generated by the instruction are possibly read as well.
-static int isIncrementOrDecrement(const MachineInstr &MI, unsigned Reg, - ARMCC::CondCodes Pred, unsigned PredReg) { - bool CheckCPSRDef; - int Scale; - switch (MI.getOpcode()) { - case ARM::tADDi8: Scale = 4; CheckCPSRDef = true; break; - case ARM::tSUBi8: Scale = -4; CheckCPSRDef = true; break; - case ARM::t2SUBri: - case ARM::SUBri: Scale = -1; CheckCPSRDef = true; break; - case ARM::t2ADDri: - case ARM::ADDri: Scale = 1; CheckCPSRDef = true; break; - case ARM::tADDspi: Scale = 4; CheckCPSRDef = false; break; - case ARM::tSUBspi: Scale = -4; CheckCPSRDef = false; break; - default: return 0; - } - - unsigned MIPredReg; - if (MI.getOperand(0).getReg() != Reg || - MI.getOperand(1).getReg() != Reg || - getInstrPredicate(&MI, MIPredReg) != Pred || - MIPredReg != PredReg) - return 0; - - if (CheckCPSRDef && definesCPSR(&MI)) - return 0; - return MI.getOperand(2).getImm() * Scale; -} - -/// Searches for an increment or decrement of \p Reg before \p MBBI. -static MachineBasicBlock::iterator -findIncDecBefore(MachineBasicBlock::iterator MBBI, unsigned Reg, - ARMCC::CondCodes Pred, unsigned PredReg, int &Offset) { - Offset = 0; - MachineBasicBlock &MBB = *MBBI->getParent(); - MachineBasicBlock::iterator BeginMBBI = MBB.begin(); - MachineBasicBlock::iterator EndMBBI = MBB.end(); - if (MBBI == BeginMBBI) - return EndMBBI; - - // Skip debug values. - MachineBasicBlock::iterator PrevMBBI = std::prev(MBBI); - while (PrevMBBI->isDebugValue() && PrevMBBI != BeginMBBI) - --PrevMBBI; - - Offset = isIncrementOrDecrement(*PrevMBBI, Reg, Pred, PredReg); - return Offset == 0 ? EndMBBI : PrevMBBI; -} - -/// Searches for a increment or decrement of \p Reg after \p MBBI. -static MachineBasicBlock::iterator -findIncDecAfter(MachineBasicBlock::iterator MBBI, unsigned Reg, - ARMCC::CondCodes Pred, unsigned PredReg, int &Offset) { - Offset = 0; - MachineBasicBlock &MBB = *MBBI->getParent(); - MachineBasicBlock::iterator EndMBBI = MBB.end(); - MachineBasicBlock::iterator NextMBBI = std::next(MBBI); - // Skip debug values. - while (NextMBBI != EndMBBI && NextMBBI->isDebugValue()) - ++NextMBBI; - if (NextMBBI == EndMBBI) - return EndMBBI; - - Offset = isIncrementOrDecrement(*NextMBBI, Reg, Pred, PredReg); - return Offset == 0 ? EndMBBI : NextMBBI; -} - /// Fold proceeding/trailing inc/dec of base register into the /// LDM/STM/VLDM{D|S}/VSTM{D|S} op when possible: /// @@ -1148,6 +1151,7 @@ bool ARMLoadStoreOpt::MergeBaseUpdateLSMultiple(MachineInstr *MI) { const MachineOperand &BaseOP = MI->getOperand(0); unsigned Base = BaseOP.getReg(); bool BaseKill = BaseOP.isKill(); + unsigned Bytes = getLSMultipleTransferSize(MI); unsigned PredReg = 0; ARMCC::CondCodes Pred = getInstrPredicate(MI, PredReg); unsigned Opcode = MI->getOpcode(); @@ -1159,24 +1163,49 @@ bool ARMLoadStoreOpt::MergeBaseUpdateLSMultiple(MachineInstr *MI) { if (MI->getOperand(i).getReg() == Base) return false; - int Bytes = getLSMultipleTransferSize(MI); + bool DoMerge = false; + ARM_AM::AMSubMode Mode = getLoadStoreMultipleSubMode(Opcode); + + // Try merging with the previous instruction. 
MachineBasicBlock &MBB = *MI->getParent(); + MachineBasicBlock::iterator BeginMBBI = MBB.begin(); MachineBasicBlock::iterator MBBI(MI); - int Offset; - MachineBasicBlock::iterator MergeInstr - = findIncDecBefore(MBBI, Base, Pred, PredReg, Offset); - ARM_AM::AMSubMode Mode = getLoadStoreMultipleSubMode(Opcode); - if (Mode == ARM_AM::ia && Offset == -Bytes) { - Mode = ARM_AM::db; - } else if (Mode == ARM_AM::ib && Offset == -Bytes) { - Mode = ARM_AM::da; - } else { - MergeInstr = findIncDecAfter(MBBI, Base, Pred, PredReg, Offset); - if (((Mode != ARM_AM::ia && Mode != ARM_AM::ib) || Offset != Bytes) && - ((Mode != ARM_AM::da && Mode != ARM_AM::db) || Offset != -Bytes)) - return false; + if (MBBI != BeginMBBI) { + MachineBasicBlock::iterator PrevMBBI = std::prev(MBBI); + while (PrevMBBI != BeginMBBI && PrevMBBI->isDebugValue()) + --PrevMBBI; + if (Mode == ARM_AM::ia && + isMatchingDecrement(PrevMBBI, Base, Bytes, 0, Pred, PredReg)) { + Mode = ARM_AM::db; + DoMerge = true; + } else if (Mode == ARM_AM::ib && + isMatchingDecrement(PrevMBBI, Base, Bytes, 0, Pred, PredReg)) { + Mode = ARM_AM::da; + DoMerge = true; + } + if (DoMerge) + MBB.erase(PrevMBBI); } - MBB.erase(MergeInstr); + + // Try merging with the next instruction. + MachineBasicBlock::iterator EndMBBI = MBB.end(); + if (!DoMerge && MBBI != EndMBBI) { + MachineBasicBlock::iterator NextMBBI = std::next(MBBI); + while (NextMBBI != EndMBBI && NextMBBI->isDebugValue()) + ++NextMBBI; + if ((Mode == ARM_AM::ia || Mode == ARM_AM::ib) && + isMatchingIncrement(NextMBBI, Base, Bytes, 0, Pred, PredReg)) { + DoMerge = true; + } else if ((Mode == ARM_AM::da || Mode == ARM_AM::db) && + isMatchingDecrement(NextMBBI, Base, Bytes, 0, Pred, PredReg)) { + DoMerge = true; + } + if (DoMerge) + MBB.erase(NextMBBI); + } + + if (!DoMerge) + return false; unsigned NewOpc = getUpdatingLSMultipleOpcode(Opcode, Mode); MachineInstrBuilder MIB = BuildMI(MBB, MBBI, DL, TII->get(NewOpc)) @@ -1254,6 +1283,7 @@ bool ARMLoadStoreOpt::MergeBaseUpdateLoadStore(MachineInstr *MI) { unsigned Base = getLoadStoreBaseOp(*MI).getReg(); bool BaseKill = getLoadStoreBaseOp(*MI).isKill(); + unsigned Bytes = getLSMultipleTransferSize(MI); unsigned Opcode = MI->getOpcode(); DebugLoc DL = MI->getDebugLoc(); bool isAM5 = (Opcode == ARM::VLDRD || Opcode == ARM::VLDRS || @@ -1265,6 +1295,7 @@ bool ARMLoadStoreOpt::MergeBaseUpdateLoadStore(MachineInstr *MI) { if (isAM5 && ARM_AM::getAM5Offset(MI->getOperand(2).getImm()) != 0) return false; + bool isLd = isLoadSingle(Opcode); // Can't do the merge if the destination register is the same as the would-be // writeback register. if (MI->getOperand(0).getReg() == Base) @@ -1272,31 +1303,55 @@ bool ARMLoadStoreOpt::MergeBaseUpdateLoadStore(MachineInstr *MI) { unsigned PredReg = 0; ARMCC::CondCodes Pred = getInstrPredicate(MI, PredReg); - int Bytes = getLSMultipleTransferSize(MI); + bool DoMerge = false; + ARM_AM::AddrOpc AddSub = ARM_AM::add; + unsigned NewOpc = 0; + // AM2 - 12 bits, thumb2 - 8 bits. + unsigned Limit = isAM5 ? 0 : (isAM2 ? 0x1000 : 0x100); + + // Try merging with the previous instruction. 
MachineBasicBlock &MBB = *MI->getParent(); + MachineBasicBlock::iterator BeginMBBI = MBB.begin(); MachineBasicBlock::iterator MBBI(MI); - int Offset; - MachineBasicBlock::iterator MergeInstr - = findIncDecBefore(MBBI, Base, Pred, PredReg, Offset); - unsigned NewOpc; - if (!isAM5 && Offset == Bytes) { - NewOpc = getPreIndexedLoadStoreOpcode(Opcode, ARM_AM::add); - } else if (Offset == -Bytes) { - NewOpc = getPreIndexedLoadStoreOpcode(Opcode, ARM_AM::sub); - } else { - MergeInstr = findIncDecAfter(MBBI, Base, Pred, PredReg, Offset); - if (Offset == Bytes) { - NewOpc = getPostIndexedLoadStoreOpcode(Opcode, ARM_AM::add); - } else if (!isAM5 && Offset == -Bytes) { - NewOpc = getPostIndexedLoadStoreOpcode(Opcode, ARM_AM::sub); - } else - return false; + if (MBBI != BeginMBBI) { + MachineBasicBlock::iterator PrevMBBI = std::prev(MBBI); + while (PrevMBBI != BeginMBBI && PrevMBBI->isDebugValue()) + --PrevMBBI; + if (isMatchingDecrement(PrevMBBI, Base, Bytes, Limit, Pred, PredReg)) { + DoMerge = true; + AddSub = ARM_AM::sub; + } else if (!isAM5 && + isMatchingIncrement(PrevMBBI, Base, Bytes, Limit,Pred,PredReg)) { + DoMerge = true; + } + if (DoMerge) { + NewOpc = getPreIndexedLoadStoreOpcode(Opcode, AddSub); + MBB.erase(PrevMBBI); + } } - MBB.erase(MergeInstr); - ARM_AM::AddrOpc AddSub = Offset < 0 ? ARM_AM::sub : ARM_AM::add; + // Try merging with the next instruction. + MachineBasicBlock::iterator EndMBBI = MBB.end(); + if (!DoMerge && MBBI != EndMBBI) { + MachineBasicBlock::iterator NextMBBI = std::next(MBBI); + while (NextMBBI != EndMBBI && NextMBBI->isDebugValue()) + ++NextMBBI; + if (!isAM5 && + isMatchingDecrement(NextMBBI, Base, Bytes, Limit, Pred, PredReg)) { + DoMerge = true; + AddSub = ARM_AM::sub; + } else if (isMatchingIncrement(NextMBBI, Base, Bytes, Limit,Pred,PredReg)) { + DoMerge = true; + } + if (DoMerge) { + NewOpc = getPostIndexedLoadStoreOpcode(Opcode, AddSub); + MBB.erase(NextMBBI); + } + } + + if (!DoMerge) + return false; - bool isLd = isLoadSingle(Opcode); if (isAM5) { // VLDM[SD]_UPD, VSTM[SD]_UPD // (There are no base-updating versions of VLDR/VSTR instructions, but the @@ -1313,16 +1368,18 @@ bool ARMLoadStoreOpt::MergeBaseUpdateLoadStore(MachineInstr *MI) { if (isAM2) { // LDR_PRE, LDR_POST if (NewOpc == ARM::LDR_PRE_IMM || NewOpc == ARM::LDRB_PRE_IMM) { + int Offset = AddSub == ARM_AM::sub ? -Bytes : Bytes; BuildMI(MBB, MBBI, DL, TII->get(NewOpc), MI->getOperand(0).getReg()) .addReg(Base, RegState::Define) .addReg(Base).addImm(Offset).addImm(Pred).addReg(PredReg); } else { - int Imm = ARM_AM::getAM2Opc(AddSub, Bytes, ARM_AM::no_shift); + int Offset = ARM_AM::getAM2Opc(AddSub, Bytes, ARM_AM::no_shift); BuildMI(MBB, MBBI, DL, TII->get(NewOpc), MI->getOperand(0).getReg()) .addReg(Base, RegState::Define) - .addReg(Base).addReg(0).addImm(Imm).addImm(Pred).addReg(PredReg); + .addReg(Base).addReg(0).addImm(Offset).addImm(Pred).addReg(PredReg); } } else { + int Offset = AddSub == ARM_AM::sub ? -Bytes : Bytes; // t2LDR_PRE, t2LDR_POST BuildMI(MBB, MBBI, DL, TII->get(NewOpc), MI->getOperand(0).getReg()) .addReg(Base, RegState::Define) @@ -1334,12 +1391,13 @@ bool ARMLoadStoreOpt::MergeBaseUpdateLoadStore(MachineInstr *MI) { // the vestigal zero-reg offset register. When that's fixed, this clause // can be removed entirely. 
if (isAM2 && NewOpc == ARM::STR_POST_IMM) { - int Imm = ARM_AM::getAM2Opc(AddSub, Bytes, ARM_AM::no_shift); + int Offset = ARM_AM::getAM2Opc(AddSub, Bytes, ARM_AM::no_shift); // STR_PRE, STR_POST BuildMI(MBB, MBBI, DL, TII->get(NewOpc), Base) .addReg(MO.getReg(), getKillRegState(MO.isKill())) - .addReg(Base).addReg(0).addImm(Imm).addImm(Pred).addReg(PredReg); + .addReg(Base).addReg(0).addImm(Offset).addImm(Pred).addReg(PredReg); } else { + int Offset = AddSub == ARM_AM::sub ? -Bytes : Bytes; // t2STR_PRE, t2STR_POST BuildMI(MBB, MBBI, DL, TII->get(NewOpc), Base) .addReg(MO.getReg(), getKillRegState(MO.isKill())) @@ -1351,66 +1409,6 @@ bool ARMLoadStoreOpt::MergeBaseUpdateLoadStore(MachineInstr *MI) { return true; } -bool ARMLoadStoreOpt::MergeBaseUpdateLSDouble(MachineInstr &MI) const { - unsigned Opcode = MI.getOpcode(); - assert((Opcode == ARM::t2LDRDi8 || Opcode == ARM::t2STRDi8) && - "Must have t2STRDi8 or t2LDRDi8"); - if (MI.getOperand(3).getImm() != 0) - return false; - - // Behaviour for writeback is undefined if base register is the same as one - // of the others. - const MachineOperand &BaseOp = MI.getOperand(2); - unsigned Base = BaseOp.getReg(); - const MachineOperand &Reg0Op = MI.getOperand(0); - const MachineOperand &Reg1Op = MI.getOperand(1); - if (Reg0Op.getReg() == Base || Reg1Op.getReg() == Base) - return false; - - unsigned PredReg; - ARMCC::CondCodes Pred = getInstrPredicate(&MI, PredReg); - MachineBasicBlock::iterator MBBI(MI); - MachineBasicBlock &MBB = *MI.getParent(); - int Offset; - MachineBasicBlock::iterator MergeInstr = findIncDecBefore(MBBI, Base, Pred, - PredReg, Offset); - unsigned NewOpc; - if (Offset == 8 || Offset == -8) { - NewOpc = Opcode == ARM::t2LDRDi8 ? ARM::t2LDRD_PRE : ARM::t2STRD_PRE; - } else { - MergeInstr = findIncDecAfter(MBBI, Base, Pred, PredReg, Offset); - if (Offset == 8 || Offset == -8) { - NewOpc = Opcode == ARM::t2LDRDi8 ? ARM::t2LDRD_POST : ARM::t2STRD_POST; - } else - return false; - } - MBB.erase(MergeInstr); - - DebugLoc DL = MI.getDebugLoc(); - MachineInstrBuilder MIB = BuildMI(MBB, MBBI, DL, TII->get(NewOpc)); - if (NewOpc == ARM::t2LDRD_PRE || NewOpc == ARM::t2LDRD_POST) { - MIB.addOperand(Reg0Op).addOperand(Reg1Op) - .addReg(BaseOp.getReg(), RegState::Define); - } else { - assert(NewOpc == ARM::t2STRD_PRE || NewOpc == ARM::t2STRD_POST); - MIB.addReg(BaseOp.getReg(), RegState::Define) - .addOperand(Reg0Op).addOperand(Reg1Op); - } - MIB.addReg(BaseOp.getReg(), RegState::Kill) - .addImm(Offset).addImm(Pred).addReg(PredReg); - assert(TII->get(Opcode).getNumOperands() == 6 && - TII->get(NewOpc).getNumOperands() == 7 && - "Unexpected number of operands in Opcode specification."); - - // Transfer implicit operands. - for (const MachineOperand &MO : MI.implicit_operands()) - MIB.addOperand(MO); - MIB->setMemRefs(MI.memoperands_begin(), MI.memoperands_end()); - - MBB.erase(MBBI); - return true; -} - /// Returns true if instruction is a memory operation that this pass is capable /// of operating on. static bool isMemoryOp(const MachineInstr *MI) { @@ -1618,7 +1616,6 @@ bool ARMLoadStoreOpt::LoadStoreMultipleOpti(MachineBasicBlock &MBB) { ARMCC::CondCodes CurrPred = ARMCC::AL; unsigned Position = 0; assert(Candidates.size() == 0); - assert(MergeBaseCandidates.size() == 0); LiveRegsValid = false; for (MachineBasicBlock::iterator I = MBB.end(), MBBI; I != MBB.begin(); @@ -1697,15 +1694,8 @@ bool ARMLoadStoreOpt::LoadStoreMultipleOpti(MachineBasicBlock &MBB) { MBBI = I; --Position; // Fallthrough to look into existing chain. 
- } else if (MBBI->isDebugValue()) { + } else if (MBBI->isDebugValue()) continue; - } else if (MBBI->getOpcode() == ARM::t2LDRDi8 || - MBBI->getOpcode() == ARM::t2STRDi8) { - // ARMPreAllocLoadStoreOpt has already formed some LDRD/STRD instructions - // remember them because we may still be able to merge add/sub into them. - MergeBaseCandidates.push_back(MBBI); - } - // If we are here then the chain is broken; Extract candidates for a merge. if (MemOps.size() > 0) { @@ -1736,9 +1726,7 @@ bool ARMLoadStoreOpt::LoadStoreMultipleOpti(MachineBasicBlock &MBB) { if (Merged) { Changed = true; unsigned Opcode = Merged->getOpcode(); - if (Opcode == ARM::t2STRDi8 || Opcode == ARM::t2LDRDi8) - MergeBaseUpdateLSDouble(*Merged); - else + if (Opcode != ARM::t2STRDi8 && Opcode != ARM::t2LDRDi8) MergeBaseUpdateLSMultiple(Merged); } else { for (MachineInstr *MI : Candidate->Instrs) { @@ -1753,10 +1741,6 @@ bool ARMLoadStoreOpt::LoadStoreMultipleOpti(MachineBasicBlock &MBB) { } } Candidates.clear(); - // Try to fold add/sub into the LDRD/STRD formed by ARMPreAllocLoadStoreOpt. - for (MachineInstr *MI : MergeBaseCandidates) - MergeBaseUpdateLSDouble(*MI); - MergeBaseCandidates.clear(); return Changed; } diff --git a/lib/Target/ARM/README.txt b/lib/Target/ARM/README.txt index 57dc6cb88bed..090a003424a4 100644 --- a/lib/Target/ARM/README.txt +++ b/lib/Target/ARM/README.txt @@ -566,7 +566,7 @@ Robert Muth started working on an alternate jump table implementation that does not put the tables in-line in the text. This is more like the llvm default jump table implementation. This might be useful sometime. Several revisions of patches are on the mailing list, beginning at: -http://lists.cs.uiuc.edu/pipermail/llvmdev/2009-June/022763.html +http://lists.llvm.org/pipermail/llvm-dev/2009-June/022763.html //===---------------------------------------------------------------------===// diff --git a/lib/Target/ARM/Thumb1InstrInfo.cpp b/lib/Target/ARM/Thumb1InstrInfo.cpp index 028119c264b3..216e776932be 100644 --- a/lib/Target/ARM/Thumb1InstrInfo.cpp +++ b/lib/Target/ARM/Thumb1InstrInfo.cpp @@ -57,7 +57,7 @@ void Thumb1InstrInfo::copyPhysReg(MachineBasicBlock &MBB, // Some things to try that should be better: // * 'mov hi, $src; mov $dst, hi', with hi as either r10 or r11 // * 'movs $dst, $src' if cpsr isn't live - // See: http://lists.cs.uiuc.edu/pipermail/llvmdev/2014-August/075998.html + // See: http://lists.llvm.org/pipermail/llvm-dev/2014-August/075998.html // 'MOV lo, lo' is unpredictable on < v6, so use the stack to do it AddDefaultPred(BuildMI(MBB, I, DL, get(ARM::tPUSH))) diff --git a/lib/Target/Hexagon/HexagonFrameLowering.cpp b/lib/Target/Hexagon/HexagonFrameLowering.cpp index 29283c81877e..21a8996b1159 100644 --- a/lib/Target/Hexagon/HexagonFrameLowering.cpp +++ b/lib/Target/Hexagon/HexagonFrameLowering.cpp @@ -864,13 +864,13 @@ static bool needToReserveScavengingSpillSlots(MachineFunction &MF, // Check for an unused caller-saved register. for ( ; *CallerSavedRegs; ++CallerSavedRegs) { MCPhysReg FreeReg = *CallerSavedRegs; - if (!MRI.reg_nodbg_empty(FreeReg)) + if (MRI.isPhysRegUsed(FreeReg)) continue; // Check aliased register usage. 
bool IsCurrentRegUsed = false; for (MCRegAliasIterator AI(FreeReg, &HRI, false); AI.isValid(); ++AI) - if (!MRI.reg_nodbg_empty(*AI)) { + if (MRI.isPhysRegUsed(*AI)) { IsCurrentRegUsed = true; break; } diff --git a/lib/Target/Mips/Mips64InstrInfo.td b/lib/Target/Mips/Mips64InstrInfo.td index c37cf95cadc3..f917ecad4a53 100644 --- a/lib/Target/Mips/Mips64InstrInfo.td +++ b/lib/Target/Mips/Mips64InstrInfo.td @@ -500,14 +500,6 @@ def : MipsPat<(trunc (assertzext GPR64:$src)), def : MipsPat<(i32 (trunc GPR64:$src)), (SLL (EXTRACT_SUBREG GPR64:$src, sub_32), 0)>; -// Bypass trunc nodes for bitwise ops. -def : MipsPat<(i32 (trunc (and GPR64:$lhs, GPR64:$rhs))), - (EXTRACT_SUBREG (AND64 GPR64:$lhs, GPR64:$rhs), sub_32)>; -def : MipsPat<(i32 (trunc (or GPR64:$lhs, GPR64:$rhs))), - (EXTRACT_SUBREG (OR64 GPR64:$lhs, GPR64:$rhs), sub_32)>; -def : MipsPat<(i32 (trunc (xor GPR64:$lhs, GPR64:$rhs))), - (EXTRACT_SUBREG (XOR64 GPR64:$lhs, GPR64:$rhs), sub_32)>; - // variable shift instructions patterns def : MipsPat<(shl GPR64:$rt, (i32 (trunc GPR64:$rs))), (DSLLV GPR64:$rt, (EXTRACT_SUBREG GPR64:$rs, sub_32))>; diff --git a/lib/Target/Mips/MipsFastISel.cpp b/lib/Target/Mips/MipsFastISel.cpp index e2f6fcc17726..5152a072b3a2 100644 --- a/lib/Target/Mips/MipsFastISel.cpp +++ b/lib/Target/Mips/MipsFastISel.cpp @@ -267,6 +267,9 @@ unsigned MipsFastISel::emitLogicalOp(unsigned ISDOpc, MVT RetVT, } unsigned MipsFastISel::fastMaterializeAlloca(const AllocaInst *AI) { + if (!TargetSupported) + return 0; + assert(TLI.getValueType(DL, AI->getType(), true) == MVT::i32 && "Alloca should always return a pointer."); @@ -290,12 +293,7 @@ unsigned MipsFastISel::materializeInt(const Constant *C, MVT VT) { return 0; const TargetRegisterClass *RC = &Mips::GPR32RegClass; const ConstantInt *CI = cast<ConstantInt>(C); - int64_t Imm; - if ((VT != MVT::i1) && CI->isNegative()) - Imm = CI->getSExtValue(); - else - Imm = CI->getZExtValue(); - return materialize32BitInt(Imm, RC); + return materialize32BitInt(CI->getZExtValue(), RC); } unsigned MipsFastISel::materialize32BitInt(int64_t Imm, @@ -382,6 +380,9 @@ unsigned MipsFastISel::materializeExternalCallSym(MCSymbol *Sym) { // Materialize a constant into a register, and return the register // number (or zero if we failed to handle it). unsigned MipsFastISel::fastMaterializeConstant(const Constant *C) { + if (!TargetSupported) + return 0; + EVT CEVT = TLI.getValueType(DL, C->getType(), true); // Only handle simple types. 
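Aside on the register-usage reverts in this patch (the Hexagon scavenging-slot check just above, and the Sparc and X86 frame-lowering hunks further down): they all swap MachineRegisterInfo::reg_nodbg_empty() back to the 3.7-era isPhysRegUsed() query. A minimal standalone sketch of the free-register test the Hexagon code performs; the helper name is invented for illustration, but the calls mirror the hunk itself:

#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/MC/MCRegisterInfo.h"
#include "llvm/Target/TargetRegisterInfo.h"
using namespace llvm;

// A caller-saved register is only free for a scavenging spill slot if
// neither it nor any aliasing register is already used in the function.
static bool isRegAndAliasesUnused(const MachineRegisterInfo &MRI,
                                  const TargetRegisterInfo &TRI,
                                  MCPhysReg Reg) {
  if (MRI.isPhysRegUsed(Reg))
    return false;
  for (MCRegAliasIterator AI(Reg, &TRI, /*IncludeSelf=*/false); AI.isValid();
       ++AI)
    if (MRI.isPhysRegUsed(*AI))
      return false;
  return true;
}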
@@ -981,6 +982,13 @@ bool MipsFastISel::selectSelect(const Instruction *I) { if (!Src1Reg || !Src2Reg || !CondReg) return false; + unsigned ZExtCondReg = createResultReg(&Mips::GPR32RegClass); + if (!ZExtCondReg) + return false; + + if (!emitIntExt(MVT::i1, CondReg, MVT::i32, ZExtCondReg, true)) + return false; + unsigned ResultReg = createResultReg(RC); unsigned TempReg = createResultReg(RC); @@ -989,7 +997,7 @@ bool MipsFastISel::selectSelect(const Instruction *I) { emitInst(TargetOpcode::COPY, TempReg).addReg(Src2Reg); emitInst(CondMovOpc, ResultReg) - .addReg(Src1Reg).addReg(CondReg).addReg(TempReg); + .addReg(Src1Reg).addReg(ZExtCondReg).addReg(TempReg); updateValueMap(I, ResultReg); return true; } @@ -1232,12 +1240,19 @@ bool MipsFastISel::finishCall(CallLoweringInfo &CLI, MVT RetVT, } bool MipsFastISel::fastLowerCall(CallLoweringInfo &CLI) { + if (!TargetSupported) + return false; + CallingConv::ID CC = CLI.CallConv; bool IsTailCall = CLI.IsTailCall; bool IsVarArg = CLI.IsVarArg; const Value *Callee = CLI.Callee; MCSymbol *Symbol = CLI.Symbol; + // Do not handle FastCC. + if (CC == CallingConv::Fast) + return false; + // Allow SelectionDAG isel to handle tail calls. if (IsTailCall) return false; @@ -1312,6 +1327,9 @@ bool MipsFastISel::fastLowerCall(CallLoweringInfo &CLI) { } bool MipsFastISel::fastLowerIntrinsicCall(const IntrinsicInst *II) { + if (!TargetSupported) + return false; + switch (II->getIntrinsicID()) { default: return false; @@ -1415,6 +1433,11 @@ bool MipsFastISel::selectRet(const Instruction *I) { if (Ret->getNumOperands() > 0) { CallingConv::ID CC = F.getCallingConv(); + + // Do not handle FastCC. + if (CC == CallingConv::Fast) + return false; + SmallVector<ISD::OutputArg, 4> Outs; GetReturnInfo(F.getReturnType(), F.getAttributes(), Outs, TLI, DL); diff --git a/lib/Target/Mips/MipsISelLowering.cpp b/lib/Target/Mips/MipsISelLowering.cpp index fbebb9abb4cc..fab2fdfef8cf 100644 --- a/lib/Target/Mips/MipsISelLowering.cpp +++ b/lib/Target/Mips/MipsISelLowering.cpp @@ -27,6 +27,7 @@ #include "llvm/CodeGen/MachineInstrBuilder.h" #include "llvm/CodeGen/MachineJumpTableInfo.h" #include "llvm/CodeGen/MachineRegisterInfo.h" +#include "llvm/CodeGen/FunctionLoweringInfo.h" #include "llvm/CodeGen/SelectionDAGISel.h" #include "llvm/CodeGen/ValueTypes.h" #include "llvm/IR/CallingConv.h" @@ -53,11 +54,6 @@ NoZeroDivCheck("mno-check-zero-division", cl::Hidden, cl::desc("MIPS: Don't trap on integer division by zero."), cl::init(false)); -cl::opt<bool> -EnableMipsFastISel("mips-fast-isel", cl::Hidden, - cl::desc("Allow mips-fast-isel to be used"), - cl::init(false)); - static const MCPhysReg Mips64DPRegs[8] = { Mips::D12_64, Mips::D13_64, Mips::D14_64, Mips::D15_64, Mips::D16_64, Mips::D17_64, Mips::D18_64, Mips::D19_64 @@ -461,7 +457,7 @@ const MipsTargetLowering *MipsTargetLowering::create(const MipsTargetMachine &TM FastISel * MipsTargetLowering::createFastISel(FunctionLoweringInfo &funcInfo, const TargetLibraryInfo *libInfo) const { - if (!EnableMipsFastISel) + if (!funcInfo.MF->getTarget().Options.EnableFastISel) return TargetLowering::createFastISel(funcInfo, libInfo); return Mips::createFastISel(funcInfo, libInfo); } diff --git a/lib/Target/PowerPC/Disassembler/PPCDisassembler.cpp b/lib/Target/PowerPC/Disassembler/PPCDisassembler.cpp index 4799ea27c4b4..93a503c3758d 100644 --- a/lib/Target/PowerPC/Disassembler/PPCDisassembler.cpp +++ b/lib/Target/PowerPC/Disassembler/PPCDisassembler.cpp @@ -12,6 +12,7 @@ #include "llvm/MC/MCFixedLenDisassembler.h" #include "llvm/MC/MCInst.h" 
#include "llvm/MC/MCSubtargetInfo.h" +#include "llvm/Support/Endian.h" #include "llvm/Support/TargetRegistry.h" using namespace llvm; @@ -22,10 +23,12 @@ typedef MCDisassembler::DecodeStatus DecodeStatus; namespace { class PPCDisassembler : public MCDisassembler { + bool IsLittleEndian; + public: - PPCDisassembler(const MCSubtargetInfo &STI, MCContext &Ctx) - : MCDisassembler(STI, Ctx) {} - ~PPCDisassembler() override {} + PPCDisassembler(const MCSubtargetInfo &STI, MCContext &Ctx, + bool IsLittleEndian) + : MCDisassembler(STI, Ctx), IsLittleEndian(IsLittleEndian) {} DecodeStatus getInstruction(MCInst &Instr, uint64_t &Size, ArrayRef<uint8_t> Bytes, uint64_t Address, @@ -37,7 +40,13 @@ public: static MCDisassembler *createPPCDisassembler(const Target &T, const MCSubtargetInfo &STI, MCContext &Ctx) { - return new PPCDisassembler(STI, Ctx); + return new PPCDisassembler(STI, Ctx, /*IsLittleEndian=*/false); +} + +static MCDisassembler *createPPCLEDisassembler(const Target &T, + const MCSubtargetInfo &STI, + MCContext &Ctx) { + return new PPCDisassembler(STI, Ctx, /*IsLittleEndian=*/true); } extern "C" void LLVMInitializePowerPCDisassembler() { @@ -47,7 +56,7 @@ extern "C" void LLVMInitializePowerPCDisassembler() { TargetRegistry::RegisterMCDisassembler(ThePPC64Target, createPPCDisassembler); TargetRegistry::RegisterMCDisassembler(ThePPC64LETarget, - createPPCDisassembler); + createPPCLEDisassembler); } // FIXME: These can be generated by TableGen from the existing register @@ -383,9 +392,9 @@ DecodeStatus PPCDisassembler::getInstruction(MCInst &MI, uint64_t &Size, return MCDisassembler::Fail; } - // The instruction is big-endian encoded. - uint32_t Inst = - (Bytes[0] << 24) | (Bytes[1] << 16) | (Bytes[2] << 8) | (Bytes[3] << 0); + // Read the instruction in the proper endianness. + uint32_t Inst = IsLittleEndian ? support::endian::read32le(Bytes.data()) + : support::endian::read32be(Bytes.data()); if (STI.getFeatureBits()[PPC::FeatureQPX]) { DecodeStatus result = diff --git a/lib/Target/PowerPC/PPCAsmPrinter.cpp b/lib/Target/PowerPC/PPCAsmPrinter.cpp index 199a0debf88b..444446692c58 100644 --- a/lib/Target/PowerPC/PPCAsmPrinter.cpp +++ b/lib/Target/PowerPC/PPCAsmPrinter.cpp @@ -363,71 +363,85 @@ void PPCAsmPrinter::LowerPATCHPOINT(MCStreamer &OutStreamer, StackMaps &SM, SM.recordPatchPoint(MI); PatchPointOpers Opers(&MI); - int64_t CallTarget = Opers.getMetaOper(PatchPointOpers::TargetPos).getImm(); unsigned EncodedBytes = 0; - if (CallTarget) { - assert((CallTarget & 0xFFFFFFFFFFFF) == CallTarget && - "High 16 bits of call target should be zero."); - unsigned ScratchReg = MI.getOperand(Opers.getNextScratchIdx()).getReg(); - EncodedBytes = 0; - // Materialize the jump address: - EmitToStreamer(OutStreamer, MCInstBuilder(PPC::LI8) - .addReg(ScratchReg) - .addImm((CallTarget >> 32) & 0xFFFF)); - ++EncodedBytes; - EmitToStreamer(OutStreamer, MCInstBuilder(PPC::RLDIC) - .addReg(ScratchReg) - .addReg(ScratchReg) - .addImm(32).addImm(16)); - ++EncodedBytes; - EmitToStreamer(OutStreamer, MCInstBuilder(PPC::ORIS8) - .addReg(ScratchReg) - .addReg(ScratchReg) - .addImm((CallTarget >> 16) & 0xFFFF)); - ++EncodedBytes; - EmitToStreamer(OutStreamer, MCInstBuilder(PPC::ORI8) - .addReg(ScratchReg) - .addReg(ScratchReg) - .addImm(CallTarget & 0xFFFF)); - - // Save the current TOC pointer before the remote call. - int TOCSaveOffset = Subtarget->isELFv2ABI() ? 
24 : 40; - EmitToStreamer(OutStreamer, MCInstBuilder(PPC::STD) - .addReg(PPC::X2) - .addImm(TOCSaveOffset) - .addReg(PPC::X1)); - ++EncodedBytes; - - - // If we're on ELFv1, then we need to load the actual function pointer from - // the function descriptor. - if (!Subtarget->isELFv2ABI()) { - // Load the new TOC pointer and the function address, but not r11 - // (needing this is rare, and loading it here would prevent passing it - // via a 'nest' parameter. - EmitToStreamer(OutStreamer, MCInstBuilder(PPC::LD) + const MachineOperand &CalleeMO = + Opers.getMetaOper(PatchPointOpers::TargetPos); + + if (CalleeMO.isImm()) { + int64_t CallTarget = Opers.getMetaOper(PatchPointOpers::TargetPos).getImm(); + if (CallTarget) { + assert((CallTarget & 0xFFFFFFFFFFFF) == CallTarget && + "High 16 bits of call target should be zero."); + unsigned ScratchReg = MI.getOperand(Opers.getNextScratchIdx()).getReg(); + EncodedBytes = 0; + // Materialize the jump address: + EmitToStreamer(OutStreamer, MCInstBuilder(PPC::LI8) + .addReg(ScratchReg) + .addImm((CallTarget >> 32) & 0xFFFF)); + ++EncodedBytes; + EmitToStreamer(OutStreamer, MCInstBuilder(PPC::RLDIC) + .addReg(ScratchReg) + .addReg(ScratchReg) + .addImm(32).addImm(16)); + ++EncodedBytes; + EmitToStreamer(OutStreamer, MCInstBuilder(PPC::ORIS8) + .addReg(ScratchReg) + .addReg(ScratchReg) + .addImm((CallTarget >> 16) & 0xFFFF)); + ++EncodedBytes; + EmitToStreamer(OutStreamer, MCInstBuilder(PPC::ORI8) + .addReg(ScratchReg) + .addReg(ScratchReg) + .addImm(CallTarget & 0xFFFF)); + + // Save the current TOC pointer before the remote call. + int TOCSaveOffset = Subtarget->isELFv2ABI() ? 24 : 40; + EmitToStreamer(OutStreamer, MCInstBuilder(PPC::STD) .addReg(PPC::X2) - .addImm(8) + .addImm(TOCSaveOffset) + .addReg(PPC::X1)); + ++EncodedBytes; + + + // If we're on ELFv1, then we need to load the actual function pointer + // from the function descriptor. + if (!Subtarget->isELFv2ABI()) { + // Load the new TOC pointer and the function address, but not r11 + // (needing this is rare, and loading it here would prevent passing it + // via a 'nest' parameter. + EmitToStreamer(OutStreamer, MCInstBuilder(PPC::LD) + .addReg(PPC::X2) + .addImm(8) + .addReg(ScratchReg)); + ++EncodedBytes; + EmitToStreamer(OutStreamer, MCInstBuilder(PPC::LD) + .addReg(ScratchReg) + .addImm(0) + .addReg(ScratchReg)); + ++EncodedBytes; + } + + EmitToStreamer(OutStreamer, MCInstBuilder(PPC::MTCTR8) .addReg(ScratchReg)); ++EncodedBytes; + EmitToStreamer(OutStreamer, MCInstBuilder(PPC::BCTRL8)); + ++EncodedBytes; + + // Restore the TOC pointer after the call. EmitToStreamer(OutStreamer, MCInstBuilder(PPC::LD) - .addReg(ScratchReg) - .addImm(0) - .addReg(ScratchReg)); + .addReg(PPC::X2) + .addImm(TOCSaveOffset) + .addReg(PPC::X1)); ++EncodedBytes; } + } else if (CalleeMO.isGlobal()) { + const GlobalValue *GValue = CalleeMO.getGlobal(); + MCSymbol *MOSymbol = getSymbol(GValue); + const MCExpr *SymVar = MCSymbolRefExpr::create(MOSymbol, OutContext); - EmitToStreamer(OutStreamer, MCInstBuilder(PPC::MTCTR8).addReg(ScratchReg)); - ++EncodedBytes; - EmitToStreamer(OutStreamer, MCInstBuilder(PPC::BCTRL8)); - ++EncodedBytes; - - // Restore the TOC pointer after the call. - EmitToStreamer(OutStreamer, MCInstBuilder(PPC::LD) - .addReg(PPC::X2) - .addImm(TOCSaveOffset) - .addReg(PPC::X1)); - ++EncodedBytes; + EmitToStreamer(OutStreamer, MCInstBuilder(PPC::BL8_NOP) + .addExpr(SymVar)); + EncodedBytes += 2; } // Each instruction is 4 bytes. 
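The reworked patchpoint lowering above still materializes a 48-bit immediate call target with the li/rldic/oris/ori sequence. As a sanity check, the same arithmetic in plain C++ (a sketch only: the function name is made up, and the real code emits MCInsts rather than computing a value):

#include <cassert>
#include <cstdint>

static uint64_t patchpointCallTarget(uint64_t CallTarget) {
  assert((CallTarget & 0xFFFFFFFFFFFFULL) == CallTarget &&
         "high 16 bits of the call target must be zero");
  uint64_t R = (CallTarget >> 32) & 0xFFFF;  // li    r, hi16
  R <<= 32;                                  // rldic r, r, 32, 16
  R |= ((CallTarget >> 16) & 0xFFFF) << 16;  // oris  r, r, mid16
  R |= CallTarget & 0xFFFF;                  // ori   r, r, lo16
  return R;  // reassembles the original 48-bit target
}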
diff --git a/lib/Target/PowerPC/PPCFrameLowering.cpp b/lib/Target/PowerPC/PPCFrameLowering.cpp index 87229d80d9c1..08ae7174244a 100644 --- a/lib/Target/PowerPC/PPCFrameLowering.cpp +++ b/lib/Target/PowerPC/PPCFrameLowering.cpp @@ -306,10 +306,9 @@ static void HandleVRSaveUpdate(MachineInstr *MI, const TargetInstrInfo &TII) { const TargetRegisterInfo *TRI = MF->getSubtarget().getRegisterInfo(); DebugLoc dl = MI->getDebugLoc(); - const MachineRegisterInfo &MRI = MF->getRegInfo(); unsigned UsedRegMask = 0; for (unsigned i = 0; i != 32; ++i) - if (MRI.isPhysRegModified(VRRegNo[i])) + if (MF->getRegInfo().isPhysRegUsed(VRRegNo[i])) UsedRegMask |= 1 << (31-i); // Live in and live out values already must be in the mask, so don't bother diff --git a/lib/Target/PowerPC/PPCISelDAGToDAG.cpp b/lib/Target/PowerPC/PPCISelDAGToDAG.cpp index 01a3acb742e6..b6025bf66ef7 100644 --- a/lib/Target/PowerPC/PPCISelDAGToDAG.cpp +++ b/lib/Target/PowerPC/PPCISelDAGToDAG.cpp @@ -2305,14 +2305,15 @@ SDNode *PPCDAGToDAGISel::SelectSETCC(SDNode *N) { if (Swap) std::swap(LHS, RHS); + EVT ResVT = VecVT.changeVectorElementTypeToInteger(); if (Negate) { - SDValue VCmp(CurDAG->getMachineNode(VCmpInst, dl, VecVT, LHS, RHS), 0); + SDValue VCmp(CurDAG->getMachineNode(VCmpInst, dl, ResVT, LHS, RHS), 0); return CurDAG->SelectNodeTo(N, PPCSubTarget->hasVSX() ? PPC::XXLNOR : PPC::VNOR, - VecVT, VCmp, VCmp); + ResVT, VCmp, VCmp); } - return CurDAG->SelectNodeTo(N, VCmpInst, VecVT, LHS, RHS); + return CurDAG->SelectNodeTo(N, VCmpInst, ResVT, LHS, RHS); } if (PPCSubTarget->useCRBits()) diff --git a/lib/Target/PowerPC/PPCISelLowering.cpp b/lib/Target/PowerPC/PPCISelLowering.cpp index 0ed9b051ffed..1e28913d1fca 100644 --- a/lib/Target/PowerPC/PPCISelLowering.cpp +++ b/lib/Target/PowerPC/PPCISelLowering.cpp @@ -580,6 +580,7 @@ PPCTargetLowering::PPCTargetLowering(const PPCTargetMachine &TM, addRegisterClass(MVT::f64, &PPC::VSFRCRegClass); + addRegisterClass(MVT::v4i32, &PPC::VSRCRegClass); addRegisterClass(MVT::v4f32, &PPC::VSRCRegClass); addRegisterClass(MVT::v2f64, &PPC::VSRCRegClass); @@ -1416,7 +1417,7 @@ int PPC::isVSLDOIShuffleMask(SDNode *N, unsigned ShuffleKind, } else return -1; - if (ShuffleKind == 2 && isLE) + if (isLE) ShiftAmt = 16 - ShiftAmt; return ShiftAmt; @@ -1429,6 +1430,11 @@ bool PPC::isSplatShuffleMask(ShuffleVectorSDNode *N, unsigned EltSize) { assert(N->getValueType(0) == MVT::v16i8 && (EltSize == 1 || EltSize == 2 || EltSize == 4)); + // The consecutive indices need to specify an element, not part of two + // different elements. So abandon ship early if this isn't the case. + if (N->getMaskElt(0) % EltSize != 0) + return false; + // This is a splat operation if each element of the permute is the same, and // if the value doesn't reference the second vector. unsigned ElementBase = N->getMaskElt(0); @@ -7011,17 +7017,20 @@ SDValue PPCTargetLowering::LowerBUILD_VECTOR(SDValue Op, // t = vsplti c, result = vsldoi t, t, 1 if (SextVal == (int)(((unsigned)i << 8) | (i < 0 ? 0xFF : 0))) { SDValue T = BuildSplatI(i, SplatSize, MVT::v16i8, DAG, dl); - return BuildVSLDOI(T, T, 1, Op.getValueType(), DAG, dl); + unsigned Amt = Subtarget.isLittleEndian() ? 15 : 1; + return BuildVSLDOI(T, T, Amt, Op.getValueType(), DAG, dl); } // t = vsplti c, result = vsldoi t, t, 2 if (SextVal == (int)(((unsigned)i << 16) | (i < 0 ? 0xFFFF : 0))) { SDValue T = BuildSplatI(i, SplatSize, MVT::v16i8, DAG, dl); - return BuildVSLDOI(T, T, 2, Op.getValueType(), DAG, dl); + unsigned Amt = Subtarget.isLittleEndian() ? 
14 : 2; + return BuildVSLDOI(T, T, Amt, Op.getValueType(), DAG, dl); } // t = vsplti c, result = vsldoi t, t, 3 if (SextVal == (int)(((unsigned)i << 24) | (i < 0 ? 0xFFFFFF : 0))) { SDValue T = BuildSplatI(i, SplatSize, MVT::v16i8, DAG, dl); - return BuildVSLDOI(T, T, 3, Op.getValueType(), DAG, dl); + unsigned Amt = Subtarget.isLittleEndian() ? 13 : 3; + return BuildVSLDOI(T, T, Amt, Op.getValueType(), DAG, dl); } } @@ -9957,6 +9966,9 @@ SDValue PPCTargetLowering::combineFPToIntToFP(SDNode *N, if (Src.getValueType() == MVT::f32) { Src = DAG.getNode(ISD::FP_EXTEND, dl, MVT::f64, Src); DCI.AddToWorklist(Src.getNode()); + } else if (Src.getValueType() != MVT::f64) { + // Make sure that we don't pick up a ppc_fp128 source value. + return SDValue(); } unsigned FCTOp = diff --git a/lib/Target/README.txt b/lib/Target/README.txt index 282d9234c1a5..7e9888cc13e8 100644 --- a/lib/Target/README.txt +++ b/lib/Target/README.txt @@ -106,7 +106,7 @@ for 1,2,4,8 bytes. //===---------------------------------------------------------------------===// It would be nice to revert this patch: -http://lists.cs.uiuc.edu/pipermail/llvm-commits/Week-of-Mon-20060213/031986.html +http://lists.llvm.org/pipermail/llvm-commits/Week-of-Mon-20060213/031986.html And teach the dag combiner enough to simplify the code expanded before legalize. It seems plausible that this knowledge would let it simplify other diff --git a/lib/Target/Sparc/SparcFrameLowering.cpp b/lib/Target/Sparc/SparcFrameLowering.cpp index 8fa10dcae114..c0279daa63d9 100644 --- a/lib/Target/Sparc/SparcFrameLowering.cpp +++ b/lib/Target/Sparc/SparcFrameLowering.cpp @@ -190,11 +190,11 @@ static bool LLVM_ATTRIBUTE_UNUSED verifyLeafProcRegUse(MachineRegisterInfo *MRI) { for (unsigned reg = SP::I0; reg <= SP::I7; ++reg) - if (!MRI->reg_nodbg_empty(reg)) + if (MRI->isPhysRegUsed(reg)) return false; for (unsigned reg = SP::L0; reg <= SP::L7; ++reg) - if (!MRI->reg_nodbg_empty(reg)) + if (MRI->isPhysRegUsed(reg)) return false; return true; @@ -206,10 +206,10 @@ bool SparcFrameLowering::isLeafProc(MachineFunction &MF) const MachineRegisterInfo &MRI = MF.getRegInfo(); MachineFrameInfo *MFI = MF.getFrameInfo(); - return !(MFI->hasCalls() // has calls - || !MRI.reg_nodbg_empty(SP::L0) // Too many registers needed - || !MRI.reg_nodbg_empty(SP::O6) // %SP is used - || hasFP(MF)); // need %FP + return !(MFI->hasCalls() // has calls + || MRI.isPhysRegUsed(SP::L0) // Too many registers needed + || MRI.isPhysRegUsed(SP::O6) // %SP is used + || hasFP(MF)); // need %FP } void SparcFrameLowering::remapRegsForLeafProc(MachineFunction &MF) const { @@ -218,13 +218,16 @@ void SparcFrameLowering::remapRegsForLeafProc(MachineFunction &MF) const { // Remap %i[0-7] to %o[0-7]. for (unsigned reg = SP::I0; reg <= SP::I7; ++reg) { - if (MRI.reg_nodbg_empty(reg)) + if (!MRI.isPhysRegUsed(reg)) continue; unsigned mapped_reg = (reg - SP::I0 + SP::O0); - assert(MRI.reg_nodbg_empty(mapped_reg)); + assert(!MRI.isPhysRegUsed(mapped_reg)); // Replace I register with O register. MRI.replaceRegWith(reg, mapped_reg); + + // Mark the reg unused. + MRI.setPhysRegUnused(reg); } // Rewrite MBB's Live-ins. 
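The vsldoi hunks earlier in this patch (isVSLDOIShuffleMask and the three LowerBUILD_VECTOR cases) all apply the same little-endian fix-up: a shuffle expressed in big-endian byte order that shifts by N bytes must shift by 16 - N once the vector lanes are numbered little-endian, which is why the amounts 1, 2 and 3 become 15, 14 and 13. A short sketch of the rule, with the helper name invented for illustration:

#include <cassert>

static unsigned vsldoiAmount(unsigned BEShiftAmt, bool IsLittleEndian) {
  assert(BEShiftAmt > 0 && BEShiftAmt < 16 && "vsldoi shifts 1 to 15 bytes");
  // Mirror the byte offset for the reversed lane order: 1 -> 15, 2 -> 14, ...
  return IsLittleEndian ? 16 - BEShiftAmt : BEShiftAmt;
}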
diff --git a/lib/Target/SystemZ/SystemZCallingConv.td b/lib/Target/SystemZ/SystemZCallingConv.td index be8f00b57adb..bdd1b1598adb 100644 --- a/lib/Target/SystemZ/SystemZCallingConv.td +++ b/lib/Target/SystemZ/SystemZCallingConv.td @@ -53,10 +53,6 @@ def RetCC_SystemZ : CallingConv<[ CCIfSubtarget<"hasVector()", CCIfType<[v16i8, v8i16, v4i32, v2i64, v4f32, v2f64], CCAssignToReg<[V24, V26, V28, V30, V25, V27, V29, V31]>>> - - // ABI-compliant code returns long double by reference, but that conversion - // is left to higher-level code. Perhaps we could add an f128 definition - // here for code that doesn't care about the ABI? ]>; //===----------------------------------------------------------------------===// diff --git a/lib/Target/SystemZ/SystemZISelLowering.cpp b/lib/Target/SystemZ/SystemZISelLowering.cpp index 056ee02dcc21..9a753c897519 100644 --- a/lib/Target/SystemZ/SystemZISelLowering.cpp +++ b/lib/Target/SystemZ/SystemZISelLowering.cpp @@ -1175,6 +1175,20 @@ SystemZTargetLowering::LowerCall(CallLoweringInfo &CLI, return Chain; } +bool SystemZTargetLowering:: +CanLowerReturn(CallingConv::ID CallConv, + MachineFunction &MF, bool isVarArg, + const SmallVectorImpl<ISD::OutputArg> &Outs, + LLVMContext &Context) const { + // Detect unsupported vector return types. + if (Subtarget.hasVector()) + VerifyVectorTypes(Outs); + + SmallVector<CCValAssign, 16> RetLocs; + CCState RetCCInfo(CallConv, isVarArg, MF, RetLocs, Context); + return RetCCInfo.CheckReturn(Outs, RetCC_SystemZ); +} + SDValue SystemZTargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv, bool IsVarArg, diff --git a/lib/Target/SystemZ/SystemZISelLowering.h b/lib/Target/SystemZ/SystemZISelLowering.h index 949b67f114ea..07ff25144581 100644 --- a/lib/Target/SystemZ/SystemZISelLowering.h +++ b/lib/Target/SystemZ/SystemZISelLowering.h @@ -423,6 +423,10 @@ public: SDValue LowerCall(CallLoweringInfo &CLI, SmallVectorImpl<SDValue> &InVals) const override; + bool CanLowerReturn(CallingConv::ID CallConv, MachineFunction &MF, + bool isVarArg, + const SmallVectorImpl<ISD::OutputArg> &Outs, + LLVMContext &Context) const override; SDValue LowerReturn(SDValue Chain, CallingConv::ID CallConv, bool IsVarArg, const SmallVectorImpl<ISD::OutputArg> &Outs, const SmallVectorImpl<SDValue> &OutVals, diff --git a/lib/Target/X86/AsmParser/X86AsmParser.cpp b/lib/Target/X86/AsmParser/X86AsmParser.cpp index 418f0431e1d8..bca059d8c383 100644 --- a/lib/Target/X86/AsmParser/X86AsmParser.cpp +++ b/lib/Target/X86/AsmParser/X86AsmParser.cpp @@ -681,6 +681,9 @@ private: std::unique_ptr<X86Operand> DefaultMemSIOperand(SMLoc Loc); std::unique_ptr<X86Operand> DefaultMemDIOperand(SMLoc Loc); + void AddDefaultSrcDestOperands( + OperandVector& Operands, std::unique_ptr<llvm::MCParsedAsmOperand> &&Src, + std::unique_ptr<llvm::MCParsedAsmOperand> &&Dst); std::unique_ptr<X86Operand> ParseOperand(); std::unique_ptr<X86Operand> ParseATTOperand(); std::unique_ptr<X86Operand> ParseIntelOperand(); @@ -1014,6 +1017,19 @@ std::unique_ptr<X86Operand> X86AsmParser::DefaultMemDIOperand(SMLoc Loc) { Loc, Loc, 0); } +void X86AsmParser::AddDefaultSrcDestOperands( + OperandVector& Operands, std::unique_ptr<llvm::MCParsedAsmOperand> &&Src, + std::unique_ptr<llvm::MCParsedAsmOperand> &&Dst) { + if (isParsingIntelSyntax()) { + Operands.push_back(std::move(Dst)); + Operands.push_back(std::move(Src)); + } + else { + Operands.push_back(std::move(Src)); + Operands.push_back(std::move(Dst)); + } +} + std::unique_ptr<X86Operand> X86AsmParser::ParseOperand() { if 
(isParsingIntelSyntax()) return ParseIntelOperand(); @@ -2228,26 +2244,18 @@ bool X86AsmParser::ParseInstruction(ParseInstructionInfo &Info, StringRef Name, if (Name.startswith("ins") && Operands.size() == 1 && (Name == "insb" || Name == "insw" || Name == "insl" || Name == "insd" )) { - if (isParsingIntelSyntax()) { - Operands.push_back(X86Operand::CreateReg(X86::DX, NameLoc, NameLoc)); - Operands.push_back(DefaultMemDIOperand(NameLoc)); - } else { - Operands.push_back(X86Operand::CreateReg(X86::DX, NameLoc, NameLoc)); - Operands.push_back(DefaultMemDIOperand(NameLoc)); - } + AddDefaultSrcDestOperands(Operands, + X86Operand::CreateReg(X86::DX, NameLoc, NameLoc), + DefaultMemDIOperand(NameLoc)); } // Append default arguments to "outs[bwld]" if (Name.startswith("outs") && Operands.size() == 1 && (Name == "outsb" || Name == "outsw" || Name == "outsl" || Name == "outsd" )) { - if (isParsingIntelSyntax()) { - Operands.push_back(DefaultMemSIOperand(NameLoc)); - Operands.push_back(X86Operand::CreateReg(X86::DX, NameLoc, NameLoc)); - } else { - Operands.push_back(DefaultMemSIOperand(NameLoc)); - Operands.push_back(X86Operand::CreateReg(X86::DX, NameLoc, NameLoc)); - } + AddDefaultSrcDestOperands(Operands, + DefaultMemSIOperand(NameLoc), + X86Operand::CreateReg(X86::DX, NameLoc, NameLoc)); } // Transform "lods[bwlq]" into "lods[bwlq] ($SIREG)" for appropriate @@ -2279,13 +2287,9 @@ bool X86AsmParser::ParseInstruction(ParseInstructionInfo &Info, StringRef Name, (Name == "cmps" || Name == "cmpsb" || Name == "cmpsw" || Name == "cmpsl" || Name == "cmpsd" || Name == "cmpsq")) { if (Operands.size() == 1) { - if (isParsingIntelSyntax()) { - Operands.push_back(DefaultMemSIOperand(NameLoc)); - Operands.push_back(DefaultMemDIOperand(NameLoc)); - } else { - Operands.push_back(DefaultMemDIOperand(NameLoc)); - Operands.push_back(DefaultMemSIOperand(NameLoc)); - } + AddDefaultSrcDestOperands(Operands, + DefaultMemDIOperand(NameLoc), + DefaultMemSIOperand(NameLoc)); } else if (Operands.size() == 3) { X86Operand &Op = (X86Operand &)*Operands[1]; X86Operand &Op2 = (X86Operand &)*Operands[2]; @@ -2305,13 +2309,9 @@ bool X86AsmParser::ParseInstruction(ParseInstructionInfo &Info, StringRef Name, if (Operands.size() == 1) { if (Name == "movsd") Operands.back() = X86Operand::CreateToken("movsl", NameLoc); - if (isParsingIntelSyntax()) { - Operands.push_back(DefaultMemDIOperand(NameLoc)); - Operands.push_back(DefaultMemSIOperand(NameLoc)); - } else { - Operands.push_back(DefaultMemSIOperand(NameLoc)); - Operands.push_back(DefaultMemDIOperand(NameLoc)); - } + AddDefaultSrcDestOperands(Operands, + DefaultMemSIOperand(NameLoc), + DefaultMemDIOperand(NameLoc)); } else if (Operands.size() == 3) { X86Operand &Op = (X86Operand &)*Operands[1]; X86Operand &Op2 = (X86Operand &)*Operands[2]; diff --git a/lib/Target/X86/X86FloatingPoint.cpp b/lib/Target/X86/X86FloatingPoint.cpp index 36a8cdbab55b..40b9c8a863a3 100644 --- a/lib/Target/X86/X86FloatingPoint.cpp +++ b/lib/Target/X86/X86FloatingPoint.cpp @@ -301,9 +301,8 @@ bool FPS::runOnMachineFunction(MachineFunction &MF) { bool FPIsUsed = false; static_assert(X86::FP6 == X86::FP0+6, "Register enums aren't sorted right!"); - const MachineRegisterInfo &MRI = MF.getRegInfo(); for (unsigned i = 0; i <= 6; ++i) - if (!MRI.reg_nodbg_empty(X86::FP0 + i)) { + if (MF.getRegInfo().isPhysRegUsed(X86::FP0+i)) { FPIsUsed = true; break; } diff --git a/lib/Target/X86/X86FrameLowering.cpp b/lib/Target/X86/X86FrameLowering.cpp index 2a35c4cf31f3..3a21b57f0157 100644 --- 
a/lib/Target/X86/X86FrameLowering.cpp +++ b/lib/Target/X86/X86FrameLowering.cpp @@ -1682,6 +1682,8 @@ void X86FrameLowering::adjustForSegmentedStacks( .addImm(StackSize); BuildMI(allocMBB, DL, TII.get(MOVri), Reg11) .addImm(X86FI->getArgumentStackSize()); + MF.getRegInfo().setPhysRegUsed(Reg10); + MF.getRegInfo().setPhysRegUsed(Reg11); } else { BuildMI(allocMBB, DL, TII.get(X86::PUSHi32)) .addImm(X86FI->getArgumentStackSize()); diff --git a/lib/Target/X86/X86ISelLowering.cpp b/lib/Target/X86/X86ISelLowering.cpp index 6e22ab30057c..71ccb1ab1e55 100644 --- a/lib/Target/X86/X86ISelLowering.cpp +++ b/lib/Target/X86/X86ISelLowering.cpp @@ -12640,24 +12640,29 @@ static SDValue LowerFABSorFNEG(SDValue Op, SelectionDAG &DAG) { if (User->getOpcode() == ISD::FNEG) return Op; - SDValue Op0 = Op.getOperand(0); - bool IsFNABS = !IsFABS && (Op0.getOpcode() == ISD::FABS); - SDLoc dl(Op); MVT VT = Op.getSimpleValueType(); - // Assume scalar op for initialization; update for vector if needed. - // Note that there are no scalar bitwise logical SSE/AVX instructions, so we - // generate a 16-byte vector constant and logic op even for the scalar case. - // Using a 16-byte mask allows folding the load of the mask with - // the logic op, so it can save (~4 bytes) on code size. - MVT EltVT = VT; - unsigned NumElts = VT == MVT::f64 ? 2 : 4; + // FIXME: Use function attribute "OptimizeForSize" and/or CodeGenOpt::Level to // decide if we should generate a 16-byte constant mask when we only need 4 or // 8 bytes for the scalar case. + + MVT LogicVT; + MVT EltVT; + unsigned NumElts; + if (VT.isVector()) { + LogicVT = VT; EltVT = VT.getVectorElementType(); NumElts = VT.getVectorNumElements(); + } else { + // There are no scalar bitwise logical SSE/AVX instructions, so we + // generate a 16-byte vector constant and logic op even for the scalar case. + // Using a 16-byte mask allows folding the load of the mask with + // the logic op, so it can save (~4 bytes) on code size. + LogicVT = (VT == MVT::f64) ? MVT::v2f64 : MVT::v4f32; + EltVT = VT; + NumElts = (VT == MVT::f64) ? 2 : 4; } unsigned EltBits = EltVT.getSizeInBits(); @@ -12670,26 +12675,25 @@ static SDValue LowerFABSorFNEG(SDValue Op, SelectionDAG &DAG) { const TargetLowering &TLI = DAG.getTargetLoweringInfo(); SDValue CPIdx = DAG.getConstantPool(C, TLI.getPointerTy(DAG.getDataLayout())); unsigned Alignment = cast<ConstantPoolSDNode>(CPIdx)->getAlignment(); - SDValue Mask = DAG.getLoad(VT, dl, DAG.getEntryNode(), CPIdx, + SDValue Mask = DAG.getLoad(LogicVT, dl, DAG.getEntryNode(), CPIdx, MachinePointerInfo::getConstantPool(), false, false, false, Alignment); - if (VT.isVector()) { - // For a vector, cast operands to a vector type, perform the logic op, - // and cast the result back to the original value type. - MVT VecVT = MVT::getVectorVT(MVT::i64, VT.getSizeInBits() / 64); - SDValue MaskCasted = DAG.getBitcast(VecVT, Mask); - SDValue Operand = IsFNABS ? DAG.getBitcast(VecVT, Op0.getOperand(0)) - : DAG.getBitcast(VecVT, Op0); - unsigned BitOp = IsFABS ? ISD::AND : IsFNABS ? ISD::OR : ISD::XOR; - return DAG.getBitcast(VT, - DAG.getNode(BitOp, dl, VecVT, Operand, MaskCasted)); - } - - // If not vector, then scalar. - unsigned BitOp = IsFABS ? X86ISD::FAND : IsFNABS ? X86ISD::FOR : X86ISD::FXOR; + SDValue Op0 = Op.getOperand(0); + bool IsFNABS = !IsFABS && (Op0.getOpcode() == ISD::FABS); + unsigned LogicOp = + IsFABS ? X86ISD::FAND : IsFNABS ? X86ISD::FOR : X86ISD::FXOR; SDValue Operand = IsFNABS ? 
Op0.getOperand(0) : Op0; - return DAG.getNode(BitOp, dl, VT, Operand, Mask); + + if (VT.isVector()) + return DAG.getNode(LogicOp, dl, LogicVT, Operand, Mask); + + // For the scalar case extend to a 128-bit vector, perform the logic op, + // and extract the scalar result back out. + Operand = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, LogicVT, Operand); + SDValue LogicNode = DAG.getNode(LogicOp, dl, LogicVT, Operand, Mask); + return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, LogicNode, + DAG.getIntPtrConstant(0, dl)); } static SDValue LowerFCOPYSIGN(SDValue Op, SelectionDAG &DAG) { @@ -12729,10 +12733,16 @@ static SDValue LowerFCOPYSIGN(SDValue Op, SelectionDAG &DAG) { Constant *C = ConstantVector::get(CV); auto PtrVT = TLI.getPointerTy(DAG.getDataLayout()); SDValue CPIdx = DAG.getConstantPool(C, PtrVT, 16); - SDValue Mask1 = DAG.getLoad(SrcVT, dl, DAG.getEntryNode(), CPIdx, + + // Perform all logic operations as 16-byte vectors because there are no + // scalar FP logic instructions in SSE. This allows load folding of the + // constants into the logic instructions. + MVT LogicVT = (VT == MVT::f64) ? MVT::v2f64 : MVT::v4f32; + SDValue Mask1 = DAG.getLoad(LogicVT, dl, DAG.getEntryNode(), CPIdx, MachinePointerInfo::getConstantPool(), false, false, false, 16); - SDValue SignBit = DAG.getNode(X86ISD::FAND, dl, SrcVT, Op1, Mask1); + Op1 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, LogicVT, Op1); + SDValue SignBit = DAG.getNode(X86ISD::FAND, dl, LogicVT, Op1, Mask1); // Next, clear the sign bit from the first operand (magnitude). // If it's a constant, we can clear it here. @@ -12740,7 +12750,8 @@ static SDValue LowerFCOPYSIGN(SDValue Op, SelectionDAG &DAG) { APFloat APF = Op0CN->getValueAPF(); // If the magnitude is a positive zero, the sign bit alone is enough. if (APF.isPosZero()) - return SignBit; + return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, SrcVT, SignBit, + DAG.getIntPtrConstant(0, dl)); APF.clearSign(); CV[0] = ConstantFP::get(*Context, APF); } else { @@ -12750,15 +12761,18 @@ static SDValue LowerFCOPYSIGN(SDValue Op, SelectionDAG &DAG) { } C = ConstantVector::get(CV); CPIdx = DAG.getConstantPool(C, PtrVT, 16); - SDValue Val = DAG.getLoad(VT, dl, DAG.getEntryNode(), CPIdx, + SDValue Val = DAG.getLoad(LogicVT, dl, DAG.getEntryNode(), CPIdx, MachinePointerInfo::getConstantPool(), false, false, false, 16); // If the magnitude operand wasn't a constant, we need to AND out the sign. - if (!isa<ConstantFPSDNode>(Op0)) - Val = DAG.getNode(X86ISD::FAND, dl, VT, Op0, Val); - + if (!isa<ConstantFPSDNode>(Op0)) { + Op0 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, LogicVT, Op0); + Val = DAG.getNode(X86ISD::FAND, dl, LogicVT, Op0, Val); + } // OR the magnitude value with the sign bit. - return DAG.getNode(X86ISD::FOR, dl, VT, Val, SignBit); + Val = DAG.getNode(X86ISD::FOR, dl, LogicVT, Val, SignBit); + return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, SrcVT, Val, + DAG.getIntPtrConstant(0, dl)); } static SDValue LowerFGETSIGN(SDValue Op, SelectionDAG &DAG) { diff --git a/lib/Target/X86/X86InstrInfo.cpp b/lib/Target/X86/X86InstrInfo.cpp index 786150760b93..cf68ef053361 100644 --- a/lib/Target/X86/X86InstrInfo.cpp +++ b/lib/Target/X86/X86InstrInfo.cpp @@ -956,18 +956,9 @@ X86InstrInfo::X86InstrInfo(X86Subtarget &STI) { X86::DPPDrri, X86::DPPDrmi, TB_ALIGN_16 }, { X86::DPPSrri, X86::DPPSrmi, TB_ALIGN_16 }, - // FIXME: We should not be folding Fs* scalar loads into vector - // instructions because the vector instructions require vector-sized - // loads. 
Lowering should create vector-sized instructions (the Fv* - // variants below) to allow load folding. - { X86::FsANDNPDrr, X86::FsANDNPDrm, TB_ALIGN_16 }, - { X86::FsANDNPSrr, X86::FsANDNPSrm, TB_ALIGN_16 }, - { X86::FsANDPDrr, X86::FsANDPDrm, TB_ALIGN_16 }, - { X86::FsANDPSrr, X86::FsANDPSrm, TB_ALIGN_16 }, - { X86::FsORPDrr, X86::FsORPDrm, TB_ALIGN_16 }, - { X86::FsORPSrr, X86::FsORPSrm, TB_ALIGN_16 }, - { X86::FsXORPDrr, X86::FsXORPDrm, TB_ALIGN_16 }, - { X86::FsXORPSrr, X86::FsXORPSrm, TB_ALIGN_16 }, + // Do not fold Fs* scalar logical op loads because there are no scalar + // load variants for these instructions. When folded, the load is required + // to be 128-bits, so the load size would not match. { X86::FvANDNPDrr, X86::FvANDNPDrm, TB_ALIGN_16 }, { X86::FvANDNPSrr, X86::FvANDNPSrm, TB_ALIGN_16 }, diff --git a/lib/Target/X86/X86InstrSSE.td b/lib/Target/X86/X86InstrSSE.td index a5ff9edf05a3..99386b0658ad 100644 --- a/lib/Target/X86/X86InstrSSE.td +++ b/lib/Target/X86/X86InstrSSE.td @@ -2919,6 +2919,14 @@ multiclass sse12_fp_packed_vector_logical_alias< defm V#NAME#PD : sse12_fp_packed<opc, !strconcat(OpcodeStr, "pd"), OpNode, VR128, v2f64, f128mem, loadv2f64, SSEPackedDouble, itins, 0>, PD, VEX_4V; + + defm V#NAME#PSY : sse12_fp_packed<opc, !strconcat(OpcodeStr, "ps"), OpNode, + VR256, v8f32, f256mem, loadv8f32, SSEPackedSingle, itins, 0>, + PS, VEX_4V, VEX_L; + + defm V#NAME#PDY : sse12_fp_packed<opc, !strconcat(OpcodeStr, "pd"), OpNode, + VR256, v4f64, f256mem, loadv4f64, SSEPackedDouble, itins, 0>, + PD, VEX_4V, VEX_L; } let Constraints = "$src1 = $dst" in { diff --git a/lib/Transforms/InstCombine/InstCombineAndOrXor.cpp b/lib/Transforms/InstCombine/InstCombineAndOrXor.cpp index ee21c81fa267..15e0889b51b7 100644 --- a/lib/Transforms/InstCombine/InstCombineAndOrXor.cpp +++ b/lib/Transforms/InstCombine/InstCombineAndOrXor.cpp @@ -93,7 +93,8 @@ static Value *getFCmpValue(bool isordered, unsigned code, case 5: Pred = isordered ? FCmpInst::FCMP_ONE : FCmpInst::FCMP_UNE; break; case 6: Pred = isordered ? FCmpInst::FCMP_OLE : FCmpInst::FCMP_ULE; break; case 7: - if (!isordered) return ConstantInt::getTrue(LHS->getContext()); + if (!isordered) + return ConstantInt::get(CmpInst::makeCmpResultType(LHS->getType()), 1); Pred = FCmpInst::FCMP_ORD; break; } return Builder->CreateFCmp(Pred, LHS, RHS); diff --git a/lib/Transforms/InstCombine/InstCombineCompares.cpp b/lib/Transforms/InstCombine/InstCombineCompares.cpp index 0bd6fd2f226d..95bba3c7af7d 100644 --- a/lib/Transforms/InstCombine/InstCombineCompares.cpp +++ b/lib/Transforms/InstCombine/InstCombineCompares.cpp @@ -2112,9 +2112,8 @@ static Instruction *ProcessUGT_ADDCST_ADD(ICmpInst &I, Value *A, Value *B, bool InstCombiner::OptimizeOverflowCheck(OverflowCheckFlavor OCF, Value *LHS, Value *RHS, Instruction &OrigI, Value *&Result, Constant *&Overflow) { - assert((!OrigI.isCommutative() || - !(isa<Constant>(LHS) && !isa<Constant>(RHS))) && - "call with a constant RHS if possible!"); + if (OrigI.isCommutative() && isa<Constant>(LHS) && !isa<Constant>(RHS)) + std::swap(LHS, RHS); auto SetResult = [&](Value *OpResult, Constant *OverflowVal, bool ReuseName) { Result = OpResult; diff --git a/lib/Transforms/Scalar/EarlyCSE.cpp b/lib/Transforms/Scalar/EarlyCSE.cpp index d536a937dce1..029b44c2ea80 100644 --- a/lib/Transforms/Scalar/EarlyCSE.cpp +++ b/lib/Transforms/Scalar/EarlyCSE.cpp @@ -658,7 +658,7 @@ bool EarlyCSE::run() { // gains over vector when the container becomes very large due to the // specific access patterns. 
For more information see the mailing list // discussion on this: - // http://lists.cs.uiuc.edu/pipermail/llvm-commits/Week-of-Mon-20120116/135228.html + // http://lists.llvm.org/pipermail/llvm-commits/Week-of-Mon-20120116/135228.html std::deque<StackNode *> nodesToProcess; bool Changed = false; diff --git a/lib/Transforms/Scalar/SROA.cpp b/lib/Transforms/Scalar/SROA.cpp index d1a0a82b9b08..947513a36572 100644 --- a/lib/Transforms/Scalar/SROA.cpp +++ b/lib/Transforms/Scalar/SROA.cpp @@ -1847,10 +1847,17 @@ static unsigned getAdjustedAlignment(Instruction *I, uint64_t Offset, static bool canConvertValue(const DataLayout &DL, Type *OldTy, Type *NewTy) { if (OldTy == NewTy) return true; - if (IntegerType *OldITy = dyn_cast<IntegerType>(OldTy)) - if (IntegerType *NewITy = dyn_cast<IntegerType>(NewTy)) - if (NewITy->getBitWidth() >= OldITy->getBitWidth()) - return true; + + // For integer types, we can't handle any bit-width differences. This would + // break both vector conversions with extension and introduce endianness + // issues when in conjunction with loads and stores. + if (isa<IntegerType>(OldTy) && isa<IntegerType>(NewTy)) { + assert(cast<IntegerType>(OldTy)->getBitWidth() != + cast<IntegerType>(NewTy)->getBitWidth() && + "We can't have the same bitwidth for different int types"); + return false; + } + if (DL.getTypeSizeInBits(NewTy) != DL.getTypeSizeInBits(OldTy)) return false; if (!NewTy->isSingleValueType() || !OldTy->isSingleValueType()) @@ -1885,10 +1892,8 @@ static Value *convertValue(const DataLayout &DL, IRBuilderTy &IRB, Value *V, if (OldTy == NewTy) return V; - if (IntegerType *OldITy = dyn_cast<IntegerType>(OldTy)) - if (IntegerType *NewITy = dyn_cast<IntegerType>(NewTy)) - if (NewITy->getBitWidth() > OldITy->getBitWidth()) - return IRB.CreateZExt(V, NewITy); + assert(!(isa<IntegerType>(OldTy) && isa<IntegerType>(NewTy)) && + "Integer types must be the exact same to convert."); // See if we need inttoptr for this type pair. A cast involving both scalars // and vectors requires and additional bitcast. @@ -2134,6 +2139,9 @@ static bool isIntegerWideningViableForSlice(const Slice &S, if (LoadInst *LI = dyn_cast<LoadInst>(U->getUser())) { if (LI->isVolatile()) return false; + // We can't handle loads that extend past the allocated memory. + if (DL.getTypeStoreSize(LI->getType()) > Size) + return false; // Note that we don't count vector loads or stores as whole-alloca // operations which enable integer widening because we would prefer to use // vector widening instead. @@ -2152,6 +2160,9 @@ static bool isIntegerWideningViableForSlice(const Slice &S, Type *ValueTy = SI->getValueOperand()->getType(); if (SI->isVolatile()) return false; + // We can't handle stores that extend past the allocated memory. + if (DL.getTypeStoreSize(ValueTy) > Size) + return false; // Note that we don't count vector loads or stores as whole-alloca // operations which enable integer widening because we would prefer to use // vector widening instead. @@ -2585,6 +2596,7 @@ private: Type *TargetTy = IsSplit ? 
Type::getIntNTy(LI.getContext(), SliceSize * 8) : LI.getType(); + const bool IsLoadPastEnd = DL.getTypeStoreSize(TargetTy) > SliceSize; bool IsPtrAdjusted = false; Value *V; if (VecTy) { @@ -2592,13 +2604,27 @@ private: } else if (IntTy && LI.getType()->isIntegerTy()) { V = rewriteIntegerLoad(LI); } else if (NewBeginOffset == NewAllocaBeginOffset && - canConvertValue(DL, NewAllocaTy, LI.getType())) { + NewEndOffset == NewAllocaEndOffset && + (canConvertValue(DL, NewAllocaTy, TargetTy) || + (IsLoadPastEnd && NewAllocaTy->isIntegerTy() && + TargetTy->isIntegerTy()))) { LoadInst *NewLI = IRB.CreateAlignedLoad(&NewAI, NewAI.getAlignment(), LI.isVolatile(), LI.getName()); if (LI.isVolatile()) NewLI->setAtomic(LI.getOrdering(), LI.getSynchScope()); - V = NewLI; + + // If this is an integer load past the end of the slice (which means the + // bytes outside the slice are undef or this load is dead) just forcibly + // fix the integer size with correct handling of endianness. + if (auto *AITy = dyn_cast<IntegerType>(NewAllocaTy)) + if (auto *TITy = dyn_cast<IntegerType>(TargetTy)) + if (AITy->getBitWidth() < TITy->getBitWidth()) { + V = IRB.CreateZExt(V, TITy, "load.ext"); + if (DL.isBigEndian()) + V = IRB.CreateShl(V, TITy->getBitWidth() - AITy->getBitWidth(), + "endian_shift"); + } } else { Type *LTy = TargetTy->getPointerTo(); LoadInst *NewLI = IRB.CreateAlignedLoad(getNewAllocaSlicePtr(IRB, LTy), @@ -2718,10 +2744,25 @@ private: if (IntTy && V->getType()->isIntegerTy()) return rewriteIntegerStore(V, SI); + const bool IsStorePastEnd = DL.getTypeStoreSize(V->getType()) > SliceSize; StoreInst *NewSI; if (NewBeginOffset == NewAllocaBeginOffset && NewEndOffset == NewAllocaEndOffset && - canConvertValue(DL, V->getType(), NewAllocaTy)) { + (canConvertValue(DL, V->getType(), NewAllocaTy) || + (IsStorePastEnd && NewAllocaTy->isIntegerTy() && + V->getType()->isIntegerTy()))) { + // If this is an integer store past the end of slice (and thus the bytes + // past that point are irrelevant or this is unreachable), truncate the + // value prior to storing. + if (auto *VITy = dyn_cast<IntegerType>(V->getType())) + if (auto *AITy = dyn_cast<IntegerType>(NewAllocaTy)) + if (VITy->getBitWidth() > AITy->getBitWidth()) { + if (DL.isBigEndian()) + V = IRB.CreateLShr(V, VITy->getBitWidth() - AITy->getBitWidth(), + "endian_shift"); + V = IRB.CreateTrunc(V, AITy, "load.trunc"); + } + V = convertValue(DL, IRB, V, NewAllocaTy); NewSI = IRB.CreateAlignedStore(V, &NewAI, NewAI.getAlignment(), SI.isVolatile()); diff --git a/lib/Transforms/Scalar/Scalarizer.cpp b/lib/Transforms/Scalar/Scalarizer.cpp index d55dc6a20a06..049300350857 100644 --- a/lib/Transforms/Scalar/Scalarizer.cpp +++ b/lib/Transforms/Scalar/Scalarizer.cpp @@ -227,10 +227,16 @@ Value *Scatterer::operator[](unsigned I) { if (!Idx) break; unsigned J = Idx->getZExtValue(); - CV[J] = Insert->getOperand(1); V = Insert->getOperand(0); - if (I == J) + if (I == J) { + CV[J] = Insert->getOperand(1); return CV[J]; + } else if (!CV[J]) { + // Only cache the first entry we find for each index we're not actively + // searching for. This prevents us from going too far up the chain and + // caching incorrect entries. 
+ CV[J] = Insert->getOperand(1); + } } CV[I] = Builder.CreateExtractElement(V, Builder.getInt32(I), V->getName() + ".i" + Twine(I)); diff --git a/test/Analysis/BasicAA/gep-alias.ll b/test/Analysis/BasicAA/gep-alias.ll index f686010f9ead..1e435af2f12f 100644 --- a/test/Analysis/BasicAA/gep-alias.ll +++ b/test/Analysis/BasicAA/gep-alias.ll @@ -228,3 +228,51 @@ define i32 @test12(i32 %x, i32 %y, i8* %p) nounwind { ; CHECK-LABEL: @test12( ; CHECK: ret i32 %r } + +@P = internal global i32 715827882, align 4 +@Q = internal global i32 715827883, align 4 +@.str = private unnamed_addr constant [7 x i8] c"%u %u\0A\00", align 1 + +; Make sure we recognize that u[0] and u[Global + Cst] may alias +; when the addition has wrapping semantic. +; PR24468. +; CHECK-LABEL: @test13( +; Make sure the stores appear before the related loads. +; CHECK: store i8 42, +; CHECK: store i8 99, +; Find the loads and make sure they are used in the arguments to the printf. +; CHECK: [[T0ADDR:%[a-zA-Z0-9_]+]] = getelementptr inbounds [3 x i8], [3 x i8]* %t, i32 0, i32 0 +; CHECK: [[T0:%[a-zA-Z0-9_]+]] = load i8, i8* [[T0ADDR]], align 1 +; CHECK: [[T0ARG:%[a-zA-Z0-9_]+]] = zext i8 [[T0]] to i32 +; CHECK: [[U0ADDR:%[a-zA-Z0-9_]+]] = getelementptr inbounds [3 x i8], [3 x i8]* %u, i32 0, i32 0 +; CHECK: [[U0:%[a-zA-Z0-9_]+]] = load i8, i8* [[U0ADDR]], align 1 +; CHECK: [[U0ARG:%[a-zA-Z0-9_]+]] = zext i8 [[U0]] to i32 +; CHECK: call i32 (i8*, ...) @printf(i8* getelementptr inbounds ([7 x i8], [7 x i8]* @.str, i32 0, i32 0), i32 [[T0ARG]], i32 [[U0ARG]]) +; CHECK: ret +define void @test13() { +entry: + %t = alloca [3 x i8], align 1 + %u = alloca [3 x i8], align 1 + %tmp = load i32, i32* @P, align 4 + %tmp1 = mul i32 %tmp, 3 + %mul = add i32 %tmp1, -2147483646 + %idxprom = zext i32 %mul to i64 + %arrayidx = getelementptr inbounds [3 x i8], [3 x i8]* %t, i64 0, i64 %idxprom + store i8 42, i8* %arrayidx, align 1 + %tmp2 = load i32, i32* @Q, align 4 + %tmp3 = mul i32 %tmp2, 3 + %mul2 = add i32 %tmp3, 2147483647 + %idxprom3 = zext i32 %mul2 to i64 + %arrayidx4 = getelementptr inbounds [3 x i8], [3 x i8]* %u, i64 0, i64 %idxprom3 + store i8 99, i8* %arrayidx4, align 1 + %arrayidx5 = getelementptr inbounds [3 x i8], [3 x i8]* %t, i64 0, i64 0 + %tmp4 = load i8, i8* %arrayidx5, align 1 + %conv = zext i8 %tmp4 to i32 + %arrayidx6 = getelementptr inbounds [3 x i8], [3 x i8]* %u, i64 0, i64 0 + %tmp5 = load i8, i8* %arrayidx6, align 1 + %conv7 = zext i8 %tmp5 to i32 + %call = call i32 (i8*, ...) @printf(i8* getelementptr inbounds ([7 x i8], [7 x i8]* @.str, i64 0, i64 0), i32 %conv, i32 %conv7) + ret void +} + +declare i32 @printf(i8*, ...) 
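The new test13 above depends on the i32 index arithmetic wrapping so that both computed indices collapse to element 0, which is what forces BasicAA to treat u[0] and u[Global + Cst] as may-alias. A quick standalone check of the constants (values copied from @P and @Q in the test; not part of the patch):

#include <cstdint>
#include <cstdio>

int main() {
  uint32_t P = 715827882u, Q = 715827883u;
  uint32_t mul  = P * 3u - 2147483646u;  // the IR's add of -2147483646
  uint32_t mul2 = Q * 3u + 2147483647u;  // wraps past 2^32
  std::printf("%u %u\n", mul, mul2);     // prints "0 0": both stores hit [0]
  return 0;
}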
diff --git a/test/Analysis/BasicAA/phi-aa.ll b/test/Analysis/BasicAA/phi-aa.ll index 3944e9e43566..a72778277bb2 100644 --- a/test/Analysis/BasicAA/phi-aa.ll +++ b/test/Analysis/BasicAA/phi-aa.ll @@ -39,7 +39,6 @@ return: ; CHECK-LABEL: pr18068 ; CHECK: MayAlias: i32* %0, i32* %arrayidx5 -; CHECK: NoAlias: i32* %arrayidx13, i32* %arrayidx5 define i32 @pr18068(i32* %jj7, i32* %j) { entry: diff --git a/test/Analysis/BasicAA/zext.ll b/test/Analysis/BasicAA/zext.ll deleted file mode 100644 index ed3565640251..000000000000 --- a/test/Analysis/BasicAA/zext.ll +++ /dev/null @@ -1,209 +0,0 @@ -; RUN: opt < %s -basicaa -aa-eval -print-all-alias-modref-info -disable-output 2>&1 | FileCheck %s -target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128" -target triple = "x86_64-unknown-linux-gnu" - -; CHECK-LABEL: test_with_zext -; CHECK: NoAlias: i8* %a, i8* %b - -define void @test_with_zext() { - %1 = tail call i8* @malloc(i64 120) - %a = getelementptr inbounds i8, i8* %1, i64 8 - %2 = getelementptr inbounds i8, i8* %1, i64 16 - %3 = zext i32 3 to i64 - %b = getelementptr inbounds i8, i8* %2, i64 %3 - ret void -} - -; CHECK-LABEL: test_with_lshr -; CHECK: NoAlias: i8* %a, i8* %b - -define void @test_with_lshr(i64 %i) { - %1 = tail call i8* @malloc(i64 120) - %a = getelementptr inbounds i8, i8* %1, i64 8 - %2 = getelementptr inbounds i8, i8* %1, i64 16 - %3 = lshr i64 %i, 2 - %b = getelementptr inbounds i8, i8* %2, i64 %3 - ret void -} - -; CHECK-LABEL: test_with_a_loop -; CHECK: NoAlias: i8* %a, i8* %b - -define void @test_with_a_loop(i8* %mem) { - br label %for.loop - -for.loop: - %i = phi i32 [ 0, %0 ], [ %i.plus1, %for.loop ] - %a = getelementptr inbounds i8, i8* %mem, i64 8 - %a.plus1 = getelementptr inbounds i8, i8* %mem, i64 16 - %i.64 = zext i32 %i to i64 - %b = getelementptr inbounds i8, i8* %a.plus1, i64 %i.64 - %i.plus1 = add nuw nsw i32 %i, 1 - %cmp = icmp eq i32 %i.plus1, 10 - br i1 %cmp, label %for.loop.exit, label %for.loop - -for.loop.exit: - ret void -} - -; CHECK-LABEL: test_with_varying_base_pointer_in_loop -; CHECK: NoAlias: i8* %a, i8* %b - -define void @test_with_varying_base_pointer_in_loop(i8* %mem.orig) { - br label %for.loop - -for.loop: - %mem = phi i8* [ %mem.orig, %0 ], [ %mem.plus1, %for.loop ] - %i = phi i32 [ 0, %0 ], [ %i.plus1, %for.loop ] - %a = getelementptr inbounds i8, i8* %mem, i64 8 - %a.plus1 = getelementptr inbounds i8, i8* %mem, i64 16 - %i.64 = zext i32 %i to i64 - %b = getelementptr inbounds i8, i8* %a.plus1, i64 %i.64 - %i.plus1 = add nuw nsw i32 %i, 1 - %mem.plus1 = getelementptr inbounds i8, i8* %mem, i64 8 - %cmp = icmp eq i32 %i.plus1, 10 - br i1 %cmp, label %for.loop.exit, label %for.loop - -for.loop.exit: - ret void -} - -; CHECK-LABEL: test_sign_extension -; CHECK: PartialAlias: i64* %b.i64, i8* %a - -define void @test_sign_extension(i32 %p) { - %1 = tail call i8* @malloc(i64 120) - %p.64 = zext i32 %p to i64 - %a = getelementptr inbounds i8, i8* %1, i64 %p.64 - %p.minus1 = add i32 %p, -1 - %p.minus1.64 = zext i32 %p.minus1 to i64 - %b.i8 = getelementptr inbounds i8, i8* %1, i64 %p.minus1.64 - %b.i64 = bitcast i8* %b.i8 to i64* - ret void -} - -; CHECK-LABEL: test_fe_tools -; CHECK: PartialAlias: i32* %a, i32* %b - -define void @test_fe_tools([8 x i32]* %values) { - br label %reorder - -for.loop: - %i = phi i32 [ 0, %reorder ], [ %i.next, %for.loop ] - %idxprom = zext i32 %i to i64 - %b = getelementptr inbounds [8 x i32], [8 x i32]* %values, 
i64 0, i64 %idxprom - %i.next = add nuw nsw i32 %i, 1 - %1 = icmp eq i32 %i.next, 10 - br i1 %1, label %for.loop.exit, label %for.loop - -reorder: - %a = getelementptr inbounds [8 x i32], [8 x i32]* %values, i64 0, i64 1 - br label %for.loop - -for.loop.exit: - ret void -} - -@b = global i32 0, align 4 -@d = global i32 0, align 4 - -; CHECK-LABEL: test_spec2006 -; CHECK: PartialAlias: i32** %x, i32** %y - -define void @test_spec2006() { - %h = alloca [1 x [2 x i32*]], align 16 - %d.val = load i32, i32* @d, align 4 - %d.promoted = sext i32 %d.val to i64 - %1 = icmp slt i32 %d.val, 2 - br i1 %1, label %.lr.ph, label %3 - -.lr.ph: ; preds = %0 - br label %2 - -; <label>:2 ; preds = %.lr.ph, %2 - %i = phi i32 [ %d.val, %.lr.ph ], [ %i.plus1, %2 ] - %i.promoted = sext i32 %i to i64 - %x = getelementptr inbounds [1 x [2 x i32*]], [1 x [2 x i32*]]* %h, i64 0, i64 %d.promoted, i64 %i.promoted - %i.plus1 = add nsw i32 %i, 1 - %cmp = icmp slt i32 %i.plus1, 2 - br i1 %cmp, label %2, label %3 - -; <label>:3 ; preds = %._crit_edge, %0 - %y = getelementptr inbounds [1 x [2 x i32*]], [1 x [2 x i32*]]* %h, i64 0, i64 0, i64 1 - ret void -} - -; CHECK-LABEL: test_modulo_analysis_easy_case -; CHECK: NoAlias: i32** %x, i32** %y - -define void @test_modulo_analysis_easy_case(i64 %i) { - %h = alloca [1 x [2 x i32*]], align 16 - %x = getelementptr inbounds [1 x [2 x i32*]], [1 x [2 x i32*]]* %h, i64 0, i64 %i, i64 0 - %y = getelementptr inbounds [1 x [2 x i32*]], [1 x [2 x i32*]]* %h, i64 0, i64 0, i64 1 - ret void -} - -; CHECK-LABEL: test_modulo_analysis_in_loop -; CHECK: NoAlias: i32** %x, i32** %y - -define void @test_modulo_analysis_in_loop() { - %h = alloca [1 x [2 x i32*]], align 16 - br label %for.loop - -for.loop: - %i = phi i32 [ 0, %0 ], [ %i.plus1, %for.loop ] - %i.promoted = sext i32 %i to i64 - %x = getelementptr inbounds [1 x [2 x i32*]], [1 x [2 x i32*]]* %h, i64 0, i64 %i.promoted, i64 0 - %y = getelementptr inbounds [1 x [2 x i32*]], [1 x [2 x i32*]]* %h, i64 0, i64 0, i64 1 - %i.plus1 = add nsw i32 %i, 1 - %cmp = icmp slt i32 %i.plus1, 2 - br i1 %cmp, label %for.loop, label %for.loop.exit - -for.loop.exit: - ret void -} - -; CHECK-LABEL: test_modulo_analysis_with_global -; CHECK: PartialAlias: i32** %x, i32** %y - -define void @test_modulo_analysis_with_global() { - %h = alloca [1 x [2 x i32*]], align 16 - %b = load i32, i32* @b, align 4 - %b.promoted = sext i32 %b to i64 - br label %for.loop - -for.loop: - %i = phi i32 [ 0, %0 ], [ %i.plus1, %for.loop ] - %i.promoted = sext i32 %i to i64 - %x = getelementptr inbounds [1 x [2 x i32*]], [1 x [2 x i32*]]* %h, i64 0, i64 %i.promoted, i64 %b.promoted - %y = getelementptr inbounds [1 x [2 x i32*]], [1 x [2 x i32*]]* %h, i64 0, i64 0, i64 1 - %i.plus1 = add nsw i32 %i, 1 - %cmp = icmp slt i32 %i.plus1, 2 - br i1 %cmp, label %for.loop, label %for.loop.exit - -for.loop.exit: - ret void -} - -; CHECK-LABEL: test_const_eval -; CHECK: NoAlias: i8* %a, i8* %b -define void @test_const_eval(i8* %ptr, i64 %offset) { - %a = getelementptr inbounds i8, i8* %ptr, i64 %offset - %a.dup = getelementptr inbounds i8, i8* %ptr, i64 %offset - %three = zext i32 3 to i64 - %b = getelementptr inbounds i8, i8* %a.dup, i64 %three - ret void -} - -; CHECK-LABEL: test_const_eval_scaled -; CHECK: MustAlias: i8* %a, i8* %b -define void @test_const_eval_scaled(i8* %ptr) { - %three = zext i32 3 to i64 - %six = mul i64 %three, 2 - %a = getelementptr inbounds i8, i8* %ptr, i64 %six - %b = getelementptr inbounds i8, i8* %ptr, i64 6 - ret void -} - -; Function Attrs: nounwind 
-declare noalias i8* @malloc(i64) diff --git a/test/CodeGen/AMDGPU/cgp-addressing-modes.ll b/test/CodeGen/AMDGPU/cgp-addressing-modes.ll index 77f7bd01b7f0..a68d110fdc96 100644 --- a/test/CodeGen/AMDGPU/cgp-addressing-modes.ll +++ b/test/CodeGen/AMDGPU/cgp-addressing-modes.ll @@ -1,12 +1,15 @@ -; RUN: opt -S -codegenprepare -mtriple=amdgcn-unknown-unknown < %s | FileCheck -check-prefix=OPT %s -; RUN: llc -march=amdgcn -mcpu=bonaire -mattr=-promote-alloca < %s | FileCheck -check-prefix=GCN %s +; RUN: opt -S -codegenprepare -mtriple=amdgcn-unknown-unknown -mcpu=bonaire < %s | FileCheck -check-prefix=OPT -check-prefix=OPT-CI %s +; RUN: opt -S -codegenprepare -mtriple=amdgcn-unknown-unknown -mcpu=tonga < %s | FileCheck -check-prefix=OPT -check-prefix=OPT-VI %s +; RUN: llc -march=amdgcn -mcpu=bonaire -mattr=-promote-alloca < %s | FileCheck -check-prefix=GCN -check-prefix=CI %s +; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-promote-alloca < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s declare i32 @llvm.r600.read.tidig.x() #0 ; OPT-LABEL: @test_sink_global_small_offset_i32( -; OPT-NOT: getelementptr i32, i32 addrspace(1)* %in +; OPT-CI-NOT: getelementptr i32, i32 addrspace(1)* %in +; OPT-VI: getelementptr i32, i32 addrspace(1)* %in ; OPT: br i1 -; OPT: ptrtoint +; OPT-CI: ptrtoint ; GCN-LABEL: {{^}}test_sink_global_small_offset_i32: ; GCN: {{^}}BB0_2: @@ -214,8 +217,11 @@ done: } ; GCN-LABEL: {{^}}test_sink_global_vreg_sreg_i32: +; VI-DAG: s_movk_i32 flat_scratch_lo, 0x0 +; VI-DAG: s_movk_i32 flat_scratch_hi, 0x0 ; GCN: s_and_saveexec_b64 -; GCN: buffer_load_dword {{v[0-9]+}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}} +; CI: buffer_load_dword {{v[0-9]+}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}} +; VI: flat_load_dword v{{[0-9]+}}, v[{{[0-9]+:[0-9]+}}] ; GCN: {{^}}BB7_2: define void @test_sink_global_vreg_sreg_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in, i32 %offset, i32 %cond) { entry: diff --git a/test/CodeGen/AMDGPU/global_atomics.ll b/test/CodeGen/AMDGPU/global_atomics.ll index 847950f6376e..146f0a5fbf26 100644 --- a/test/CodeGen/AMDGPU/global_atomics.ll +++ b/test/CodeGen/AMDGPU/global_atomics.ll @@ -1,7 +1,9 @@ -; RUN: llc < %s -march=amdgcn -mcpu=SI -verify-machineinstrs | FileCheck --check-prefix=SI --check-prefix=FUNC %s +; RUN: llc < %s -march=amdgcn -mcpu=SI -verify-machineinstrs | FileCheck --check-prefix=GCN --check-prefix=SI --check-prefix=FUNC %s +; RUN: llc < %s -march=amdgcn -mcpu=tonga -verify-machineinstrs | FileCheck --check-prefix=GCN --check-prefix=VI --check-prefix=FUNC %s + ; FUNC-LABEL: {{^}}atomic_add_i32_offset: -; SI: buffer_atomic_add v{{[0-9]+}}, s[{{[0-9]+}}:{{[0-9]+}}], 0 offset:16{{$}} +; GCN: buffer_atomic_add v{{[0-9]+}}, s[{{[0-9]+}}:{{[0-9]+}}], 0 offset:16{{$}} define void @atomic_add_i32_offset(i32 addrspace(1)* %out, i32 %in) { entry: %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 @@ -10,8 +12,8 @@ entry: } ; FUNC-LABEL: {{^}}atomic_add_i32_ret_offset: -; SI: buffer_atomic_add [[RET:v[0-9]+]], s[{{[0-9]+}}:{{[0-9]+}}], 0 offset:16 glc {{$}} -; SI: buffer_store_dword [[RET]] +; GCN: buffer_atomic_add [[RET:v[0-9]+]], s[{{[0-9]+}}:{{[0-9]+}}], 0 offset:16 glc {{$}} +; GCN: buffer_store_dword [[RET]] define void @atomic_add_i32_ret_offset(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in) { entry: %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 @@ -22,6 +24,10 @@ entry: ; FUNC-LABEL: {{^}}atomic_add_i32_addr64_offset: ; SI: buffer_atomic_add v{{[0-9]+}}, v[{{[0-9]+}}:{{[0-9]+}}], 
s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64 offset:16{{$}} +; VI: s_movk_i32 flat_scratch_lo, 0x0 +; VI: s_movk_i32 flat_scratch_hi, 0x0 +; VI: flat_atomic_add v[{{[0-9]+:[0-9]+}}], v{{[0-9]+$}} + define void @atomic_add_i32_addr64_offset(i32 addrspace(1)* %out, i32 %in, i64 %index) { entry: %ptr = getelementptr i32, i32 addrspace(1)* %out, i64 %index @@ -32,7 +38,10 @@ entry: ; FUNC-LABEL: {{^}}atomic_add_i32_ret_addr64_offset: ; SI: buffer_atomic_add [[RET:v[0-9]+]], v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64 offset:16 glc{{$}} -; SI: buffer_store_dword [[RET]] +; VI: s_movk_i32 flat_scratch_lo, 0x0 +; VI: s_movk_i32 flat_scratch_hi, 0x0 +; VI: flat_atomic_add [[RET:v[0-9]+]], v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}} glc{{$}} +; GCN: buffer_store_dword [[RET]] define void @atomic_add_i32_ret_addr64_offset(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in, i64 %index) { entry: %ptr = getelementptr i32, i32 addrspace(1)* %out, i64 %index @@ -43,7 +52,7 @@ entry: } ; FUNC-LABEL: {{^}}atomic_add_i32: -; SI: buffer_atomic_add v{{[0-9]+}}, s[{{[0-9]+}}:{{[0-9]+}}], 0{{$}} +; GCN: buffer_atomic_add v{{[0-9]+}}, s[{{[0-9]+}}:{{[0-9]+}}], 0{{$}} define void @atomic_add_i32(i32 addrspace(1)* %out, i32 %in) { entry: %0 = atomicrmw volatile add i32 addrspace(1)* %out, i32 %in seq_cst @@ -51,8 +60,8 @@ entry: } ; FUNC-LABEL: {{^}}atomic_add_i32_ret: -; SI: buffer_atomic_add [[RET:v[0-9]+]], s[{{[0-9]+}}:{{[0-9]+}}], 0 glc -; SI: buffer_store_dword [[RET]] +; GCN: buffer_atomic_add [[RET:v[0-9]+]], s[{{[0-9]+}}:{{[0-9]+}}], 0 glc +; GCN: buffer_store_dword [[RET]] define void @atomic_add_i32_ret(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in) { entry: %0 = atomicrmw volatile add i32 addrspace(1)* %out, i32 %in seq_cst @@ -62,6 +71,9 @@ entry: ; FUNC-LABEL: {{^}}atomic_add_i32_addr64: ; SI: buffer_atomic_add v{{[0-9]+}}, v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64{{$}} +; VI: s_movk_i32 flat_scratch_lo, 0x0 +; VI: s_movk_i32 flat_scratch_hi, 0x0 +; VI: flat_atomic_add v[{{[0-9]+:[0-9]+}}], v{{[0-9]+$}} define void @atomic_add_i32_addr64(i32 addrspace(1)* %out, i32 %in, i64 %index) { entry: %ptr = getelementptr i32, i32 addrspace(1)* %out, i64 %index @@ -71,7 +83,10 @@ entry: ; FUNC-LABEL: {{^}}atomic_add_i32_ret_addr64: ; SI: buffer_atomic_add [[RET:v[0-9]+]], v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64 glc{{$}} -; SI: buffer_store_dword [[RET]] +; VI: s_movk_i32 flat_scratch_lo, 0x0 +; VI: s_movk_i32 flat_scratch_hi, 0x0 +; VI: flat_atomic_add [[RET:v[0-9]+]], v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}} glc{{$}} +; GCN: buffer_store_dword [[RET]] define void @atomic_add_i32_ret_addr64(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in, i64 %index) { entry: %ptr = getelementptr i32, i32 addrspace(1)* %out, i64 %index @@ -81,7 +96,7 @@ entry: } ; FUNC-LABEL: {{^}}atomic_and_i32_offset: -; SI: buffer_atomic_and v{{[0-9]+}}, s[{{[0-9]+}}:{{[0-9]+}}], 0 offset:16{{$}} +; GCN: buffer_atomic_and v{{[0-9]+}}, s[{{[0-9]+}}:{{[0-9]+}}], 0 offset:16{{$}} define void @atomic_and_i32_offset(i32 addrspace(1)* %out, i32 %in) { entry: %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 @@ -90,8 +105,8 @@ entry: } ; FUNC-LABEL: {{^}}atomic_and_i32_ret_offset: -; SI: buffer_atomic_and [[RET:v[0-9]+]], s[{{[0-9]+}}:{{[0-9]+}}], 0 offset:16 glc {{$}} -; SI: buffer_store_dword [[RET]] +; GCN: buffer_atomic_and [[RET:v[0-9]+]], s[{{[0-9]+}}:{{[0-9]+}}], 0 offset:16 glc {{$}} +; GCN: buffer_store_dword [[RET]] define void @atomic_and_i32_ret_offset(i32 addrspace(1)* %out, 
i32 addrspace(1)* %out2, i32 %in) { entry: %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 @@ -102,6 +117,9 @@ entry: ; FUNC-LABEL: {{^}}atomic_and_i32_addr64_offset: ; SI: buffer_atomic_and v{{[0-9]+}}, v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64 offset:16{{$}} +; VI: s_movk_i32 flat_scratch_lo, 0x0 +; VI: s_movk_i32 flat_scratch_hi, 0x0 +; VI: flat_atomic_and v[{{[0-9]+:[0-9]+}}], v{{[0-9]+$}} define void @atomic_and_i32_addr64_offset(i32 addrspace(1)* %out, i32 %in, i64 %index) { entry: %ptr = getelementptr i32, i32 addrspace(1)* %out, i64 %index @@ -112,7 +130,10 @@ entry: ; FUNC-LABEL: {{^}}atomic_and_i32_ret_addr64_offset: ; SI: buffer_atomic_and [[RET:v[0-9]+]], v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64 offset:16 glc{{$}} -; SI: buffer_store_dword [[RET]] +; VI: s_movk_i32 flat_scratch_lo, 0x0 +; VI: s_movk_i32 flat_scratch_hi, 0x0 +; VI: flat_atomic_and [[RET:v[0-9]+]], v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}} glc{{$}} +; GCN: buffer_store_dword [[RET]] define void @atomic_and_i32_ret_addr64_offset(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in, i64 %index) { entry: %ptr = getelementptr i32, i32 addrspace(1)* %out, i64 %index @@ -123,7 +144,7 @@ entry: } ; FUNC-LABEL: {{^}}atomic_and_i32: -; SI: buffer_atomic_and v{{[0-9]+}}, s[{{[0-9]+}}:{{[0-9]+}}], 0{{$}} +; GCN: buffer_atomic_and v{{[0-9]+}}, s[{{[0-9]+}}:{{[0-9]+}}], 0{{$}} define void @atomic_and_i32(i32 addrspace(1)* %out, i32 %in) { entry: %0 = atomicrmw volatile and i32 addrspace(1)* %out, i32 %in seq_cst @@ -131,8 +152,8 @@ entry: } ; FUNC-LABEL: {{^}}atomic_and_i32_ret: -; SI: buffer_atomic_and [[RET:v[0-9]+]], s[{{[0-9]+}}:{{[0-9]+}}], 0 glc -; SI: buffer_store_dword [[RET]] +; GCN: buffer_atomic_and [[RET:v[0-9]+]], s[{{[0-9]+}}:{{[0-9]+}}], 0 glc +; GCN: buffer_store_dword [[RET]] define void @atomic_and_i32_ret(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in) { entry: %0 = atomicrmw volatile and i32 addrspace(1)* %out, i32 %in seq_cst @@ -142,6 +163,9 @@ entry: ; FUNC-LABEL: {{^}}atomic_and_i32_addr64: ; SI: buffer_atomic_and v{{[0-9]+}}, v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64{{$}} +; VI: s_movk_i32 flat_scratch_lo, 0x0 +; VI: s_movk_i32 flat_scratch_hi, 0x0 +; VI: flat_atomic_and v[{{[0-9]+:[0-9]+}}], v{{[0-9]+$}} define void @atomic_and_i32_addr64(i32 addrspace(1)* %out, i32 %in, i64 %index) { entry: %ptr = getelementptr i32, i32 addrspace(1)* %out, i64 %index @@ -151,7 +175,10 @@ entry: ; FUNC-LABEL: {{^}}atomic_and_i32_ret_addr64: ; SI: buffer_atomic_and [[RET:v[0-9]+]], v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64 glc{{$}} -; SI: buffer_store_dword [[RET]] +; VI: s_movk_i32 flat_scratch_lo, 0x0 +; VI: s_movk_i32 flat_scratch_hi, 0x0 +; VI: flat_atomic_and [[RET:v[0-9]+]], v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}} glc{{$}} +; GCN: buffer_store_dword [[RET]] define void @atomic_and_i32_ret_addr64(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in, i64 %index) { entry: %ptr = getelementptr i32, i32 addrspace(1)* %out, i64 %index @@ -161,7 +188,7 @@ entry: } ; FUNC-LABEL: {{^}}atomic_sub_i32_offset: -; SI: buffer_atomic_sub v{{[0-9]+}}, s[{{[0-9]+}}:{{[0-9]+}}], 0 offset:16{{$}} +; GCN: buffer_atomic_sub v{{[0-9]+}}, s[{{[0-9]+}}:{{[0-9]+}}], 0 offset:16{{$}} define void @atomic_sub_i32_offset(i32 addrspace(1)* %out, i32 %in) { entry: %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 @@ -170,8 +197,8 @@ entry: } ; FUNC-LABEL: {{^}}atomic_sub_i32_ret_offset: -; SI: buffer_atomic_sub [[RET:v[0-9]+]],
s[{{[0-9]+}}:{{[0-9]+}}], 0 offset:16 glc {{$}} -; SI: buffer_store_dword [[RET]] +; GCN: buffer_atomic_sub [[RET:v[0-9]+]], s[{{[0-9]+}}:{{[0-9]+}}], 0 offset:16 glc {{$}} +; GCN: buffer_store_dword [[RET]] define void @atomic_sub_i32_ret_offset(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in) { entry: %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 @@ -182,6 +209,9 @@ entry: ; FUNC-LABEL: {{^}}atomic_sub_i32_addr64_offset: ; SI: buffer_atomic_sub v{{[0-9]+}}, v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64 offset:16{{$}} +; VI: s_movk_i32 flat_scratch_lo, 0x0 +; VI: s_movk_i32 flat_scratch_hi, 0x0 +; VI: flat_atomic_sub v[{{[0-9]+:[0-9]+}}], v{{[0-9]+$}} define void @atomic_sub_i32_addr64_offset(i32 addrspace(1)* %out, i32 %in, i64 %index) { entry: %ptr = getelementptr i32, i32 addrspace(1)* %out, i64 %index @@ -192,7 +222,10 @@ entry: ; FUNC-LABEL: {{^}}atomic_sub_i32_ret_addr64_offset: ; SI: buffer_atomic_sub [[RET:v[0-9]+]], v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64 offset:16 glc{{$}} -; SI: buffer_store_dword [[RET]] +; VI: s_movk_i32 flat_scratch_lo, 0x0 +; VI: s_movk_i32 flat_scratch_hi, 0x0 +; VI: flat_atomic_sub [[RET:v[0-9]+]], v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}} glc{{$}} +; GCN: buffer_store_dword [[RET]] define void @atomic_sub_i32_ret_addr64_offset(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in, i64 %index) { entry: %ptr = getelementptr i32, i32 addrspace(1)* %out, i64 %index @@ -203,7 +236,7 @@ entry: } ; FUNC-LABEL: {{^}}atomic_sub_i32: -; SI: buffer_atomic_sub v{{[0-9]+}}, s[{{[0-9]+}}:{{[0-9]+}}], 0{{$}} +; GCN: buffer_atomic_sub v{{[0-9]+}}, s[{{[0-9]+}}:{{[0-9]+}}], 0{{$}} define void @atomic_sub_i32(i32 addrspace(1)* %out, i32 %in) { entry: %0 = atomicrmw volatile sub i32 addrspace(1)* %out, i32 %in seq_cst @@ -211,8 +244,8 @@ entry: } ; FUNC-LABEL: {{^}}atomic_sub_i32_ret: -; SI: buffer_atomic_sub [[RET:v[0-9]+]], s[{{[0-9]+}}:{{[0-9]+}}], 0 glc -; SI: buffer_store_dword [[RET]] +; GCN: buffer_atomic_sub [[RET:v[0-9]+]], s[{{[0-9]+}}:{{[0-9]+}}], 0 glc +; GCN: buffer_store_dword [[RET]] define void @atomic_sub_i32_ret(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in) { entry: %0 = atomicrmw volatile sub i32 addrspace(1)* %out, i32 %in seq_cst @@ -222,6 +255,9 @@ entry: ; FUNC-LABEL: {{^}}atomic_sub_i32_addr64: ; SI: buffer_atomic_sub v{{[0-9]+}}, v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64{{$}} +; VI: s_movk_i32 flat_scratch_lo, 0x0 +; VI: s_movk_i32 flat_scratch_hi, 0x0 +; VI: flat_atomic_sub v[{{[0-9]+:[0-9]+}}], v{{[0-9]+$}} define void @atomic_sub_i32_addr64(i32 addrspace(1)* %out, i32 %in, i64 %index) { entry: %ptr = getelementptr i32, i32 addrspace(1)* %out, i64 %index @@ -231,7 +267,10 @@ entry: ; FUNC-LABEL: {{^}}atomic_sub_i32_ret_addr64: ; SI: buffer_atomic_sub [[RET:v[0-9]+]], v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64 glc{{$}} -; SI: buffer_store_dword [[RET]] +; VI: s_movk_i32 flat_scratch_lo, 0x0 +; VI: s_movk_i32 flat_scratch_hi, 0x0 +; VI: flat_atomic_sub [[RET:v[0-9]+]], v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}} glc{{$}} +; GCN: buffer_store_dword [[RET]] define void @atomic_sub_i32_ret_addr64(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in, i64 %index) { entry: %ptr = getelementptr i32, i32 addrspace(1)* %out, i64 %index @@ -241,7 +280,7 @@ entry: } ; FUNC-LABEL: {{^}}atomic_max_i32_offset: -; SI: buffer_atomic_smax v{{[0-9]+}}, s[{{[0-9]+}}:{{[0-9]+}}], 0 offset:16{{$}} +; GCN: buffer_atomic_smax v{{[0-9]+}}, s[{{[0-9]+}}:{{[0-9]+}}], 0 
offset:16{{$}} define void @atomic_max_i32_offset(i32 addrspace(1)* %out, i32 %in) { entry: %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 @@ -250,8 +289,8 @@ entry: } ; FUNC-LABEL: {{^}}atomic_max_i32_ret_offset: -; SI: buffer_atomic_smax [[RET:v[0-9]+]], s[{{[0-9]+}}:{{[0-9]+}}], 0 offset:16 glc {{$}} -; SI: buffer_store_dword [[RET]] +; GCN: buffer_atomic_smax [[RET:v[0-9]+]], s[{{[0-9]+}}:{{[0-9]+}}], 0 offset:16 glc {{$}} +; GCN: buffer_store_dword [[RET]] define void @atomic_max_i32_ret_offset(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in) { entry: %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 @@ -262,6 +301,9 @@ entry: ; FUNC-LABEL: {{^}}atomic_max_i32_addr64_offset: ; SI: buffer_atomic_smax v{{[0-9]+}}, v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64 offset:16{{$}} +; VI: s_movk_i32 flat_scratch_lo, 0x0 +; VI: s_movk_i32 flat_scratch_hi, 0x0 +; VI: flat_atomic_smax v[{{[0-9]+:[0-9]+}}], v{{[0-9]+$}} define void @atomic_max_i32_addr64_offset(i32 addrspace(1)* %out, i32 %in, i64 %index) { entry: %ptr = getelementptr i32, i32 addrspace(1)* %out, i64 %index @@ -272,7 +314,10 @@ entry: ; FUNC-LABEL: {{^}}atomic_max_i32_ret_addr64_offset: ; SI: buffer_atomic_smax [[RET:v[0-9]+]], v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64 offset:16 glc{{$}} -; SI: buffer_store_dword [[RET]] +; VI: s_movk_i32 flat_scratch_lo, 0x0 +; VI: s_movk_i32 flat_scratch_hi, 0x0 +; VI: flat_atomic_smax [[RET:v[0-9]+]], v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}} glc{{$}} +; GCN: buffer_store_dword [[RET]] define void @atomic_max_i32_ret_addr64_offset(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in, i64 %index) { entry: %ptr = getelementptr i32, i32 addrspace(1)* %out, i64 %index @@ -283,7 +328,7 @@ entry: } ; FUNC-LABEL: {{^}}atomic_max_i32: -; SI: buffer_atomic_smax v{{[0-9]+}}, s[{{[0-9]+}}:{{[0-9]+}}], 0{{$}} +; GCN: buffer_atomic_smax v{{[0-9]+}}, s[{{[0-9]+}}:{{[0-9]+}}], 0{{$}} define void @atomic_max_i32(i32 addrspace(1)* %out, i32 %in) { entry: %0 = atomicrmw volatile max i32 addrspace(1)* %out, i32 %in seq_cst @@ -291,8 +336,8 @@ entry: } ; FUNC-LABEL: {{^}}atomic_max_i32_ret: -; SI: buffer_atomic_smax [[RET:v[0-9]+]], s[{{[0-9]+}}:{{[0-9]+}}], 0 glc -; SI: buffer_store_dword [[RET]] +; GCN: buffer_atomic_smax [[RET:v[0-9]+]], s[{{[0-9]+}}:{{[0-9]+}}], 0 glc +; GCN: buffer_store_dword [[RET]] define void @atomic_max_i32_ret(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in) { entry: %0 = atomicrmw volatile max i32 addrspace(1)* %out, i32 %in seq_cst @@ -302,6 +347,9 @@ entry: ; FUNC-LABEL: {{^}}atomic_max_i32_addr64: ; SI: buffer_atomic_smax v{{[0-9]+}}, v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64{{$}} +; VI: s_movk_i32 flat_scratch_lo, 0x0 +; VI: s_movk_i32 flat_scratch_hi, 0x0 +; VI: flat_atomic_smax v[{{[0-9]+:[0-9]+}}], v{{[0-9]+$}} define void @atomic_max_i32_addr64(i32 addrspace(1)* %out, i32 %in, i64 %index) { entry: %ptr = getelementptr i32, i32 addrspace(1)* %out, i64 %index @@ -311,7 +359,10 @@ entry: ; FUNC-LABEL: {{^}}atomic_max_i32_ret_addr64: ; SI: buffer_atomic_smax [[RET:v[0-9]+]], v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64 glc{{$}} -; SI: buffer_store_dword [[RET]] +; VI: s_movk_i32 flat_scratch_lo, 0x0 +; VI: s_movk_i32 flat_scratch_hi, 0x0 +; VI: flat_atomic_smax [[RET:v[0-9]+]], v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}} glc{{$}} +; GCN: buffer_store_dword [[RET]] define void @atomic_max_i32_ret_addr64(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in, i64 %index) { entry: %ptr = 
getelementptr i32, i32 addrspace(1)* %out, i64 %index @@ -321,7 +372,7 @@ entry: } ; FUNC-LABEL: {{^}}atomic_umax_i32_offset: -; SI: buffer_atomic_umax v{{[0-9]+}}, s[{{[0-9]+}}:{{[0-9]+}}], 0 offset:16{{$}} +; GCN: buffer_atomic_umax v{{[0-9]+}}, s[{{[0-9]+}}:{{[0-9]+}}], 0 offset:16{{$}} define void @atomic_umax_i32_offset(i32 addrspace(1)* %out, i32 %in) { entry: %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 @@ -330,8 +381,8 @@ entry: } ; FUNC-LABEL: {{^}}atomic_umax_i32_ret_offset: -; SI: buffer_atomic_umax [[RET:v[0-9]+]], s[{{[0-9]+}}:{{[0-9]+}}], 0 offset:16 glc {{$}} -; SI: buffer_store_dword [[RET]] +; GCN: buffer_atomic_umax [[RET:v[0-9]+]], s[{{[0-9]+}}:{{[0-9]+}}], 0 offset:16 glc {{$}} +; GCN: buffer_store_dword [[RET]] define void @atomic_umax_i32_ret_offset(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in) { entry: %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 @@ -342,6 +393,9 @@ entry: ; FUNC-LABEL: {{^}}atomic_umax_i32_addr64_offset: ; SI: buffer_atomic_umax v{{[0-9]+}}, v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64 offset:16{{$}} +; VI: s_movk_i32 flat_scratch_lo, 0x0 +; VI: s_movk_i32 flat_scratch_hi, 0x0 +; VI: flat_atomic_umax v[{{[0-9]+:[0-9]+}}], v{{[0-9]+$}} define void @atomic_umax_i32_addr64_offset(i32 addrspace(1)* %out, i32 %in, i64 %index) { entry: %ptr = getelementptr i32, i32 addrspace(1)* %out, i64 %index @@ -352,7 +406,10 @@ entry: ; FUNC-LABEL: {{^}}atomic_umax_i32_ret_addr64_offset: ; SI: buffer_atomic_umax [[RET:v[0-9]+]], v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64 offset:16 glc{{$}} -; SI: buffer_store_dword [[RET]] +; VI: s_movk_i32 flat_scratch_lo, 0x0 +; VI: s_movk_i32 flat_scratch_hi, 0x0 +; VI: flat_atomic_umax [[RET:v[0-9]+]], v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}} glc{{$}} +; GCN: buffer_store_dword [[RET]] define void @atomic_umax_i32_ret_addr64_offset(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in, i64 %index) { entry: %ptr = getelementptr i32, i32 addrspace(1)* %out, i64 %index @@ -363,7 +420,7 @@ entry: } ; FUNC-LABEL: {{^}}atomic_umax_i32: -; SI: buffer_atomic_umax v{{[0-9]+}}, s[{{[0-9]+}}:{{[0-9]+}}], 0{{$}} +; GCN: buffer_atomic_umax v{{[0-9]+}}, s[{{[0-9]+}}:{{[0-9]+}}], 0{{$}} define void @atomic_umax_i32(i32 addrspace(1)* %out, i32 %in) { entry: %0 = atomicrmw volatile umax i32 addrspace(1)* %out, i32 %in seq_cst @@ -371,8 +428,8 @@ entry: } ; FUNC-LABEL: {{^}}atomic_umax_i32_ret: -; SI: buffer_atomic_umax [[RET:v[0-9]+]], s[{{[0-9]+}}:{{[0-9]+}}], 0 glc -; SI: buffer_store_dword [[RET]] +; GCN: buffer_atomic_umax [[RET:v[0-9]+]], s[{{[0-9]+}}:{{[0-9]+}}], 0 glc +; GCN: buffer_store_dword [[RET]] define void @atomic_umax_i32_ret(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in) { entry: %0 = atomicrmw volatile umax i32 addrspace(1)* %out, i32 %in seq_cst @@ -382,6 +439,9 @@ entry: ; FUNC-LABEL: {{^}}atomic_umax_i32_addr64: ; SI: buffer_atomic_umax v{{[0-9]+}}, v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64{{$}} +; VI: s_movk_i32 flat_scratch_lo, 0x0 +; VI: s_movk_i32 flat_scratch_hi, 0x0 +; VI: flat_atomic_umax v[{{[0-9]+:[0-9]+}}], v{{[0-9]+$}} define void @atomic_umax_i32_addr64(i32 addrspace(1)* %out, i32 %in, i64 %index) { entry: %ptr = getelementptr i32, i32 addrspace(1)* %out, i64 %index @@ -391,7 +451,10 @@ entry: ; FUNC-LABEL: {{^}}atomic_umax_i32_ret_addr64: ; SI: buffer_atomic_umax [[RET:v[0-9]+]], v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64 glc{{$}} -; SI: buffer_store_dword [[RET]] +; VI: s_movk_i32 flat_scratch_lo, 0x0 
+; VI: s_movk_i32 flat_scratch_hi, 0x0 +; VI: flat_atomic_umax [[RET:v[0-9]+]], v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}} glc{{$}} +; GCN: buffer_store_dword [[RET]] define void @atomic_umax_i32_ret_addr64(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in, i64 %index) { entry: %ptr = getelementptr i32, i32 addrspace(1)* %out, i64 %index @@ -401,7 +464,7 @@ entry: } ; FUNC-LABEL: {{^}}atomic_min_i32_offset: -; SI: buffer_atomic_smin v{{[0-9]+}}, s[{{[0-9]+}}:{{[0-9]+}}], 0 offset:16{{$}} +; GCN: buffer_atomic_smin v{{[0-9]+}}, s[{{[0-9]+}}:{{[0-9]+}}], 0 offset:16{{$}} define void @atomic_min_i32_offset(i32 addrspace(1)* %out, i32 %in) { entry: %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 @@ -410,8 +473,8 @@ entry: } ; FUNC-LABEL: {{^}}atomic_min_i32_ret_offset: -; SI: buffer_atomic_smin [[RET:v[0-9]+]], s[{{[0-9]+}}:{{[0-9]+}}], 0 offset:16 glc {{$}} -; SI: buffer_store_dword [[RET]] +; GCN: buffer_atomic_smin [[RET:v[0-9]+]], s[{{[0-9]+}}:{{[0-9]+}}], 0 offset:16 glc {{$}} +; GCN: buffer_store_dword [[RET]] define void @atomic_min_i32_ret_offset(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in) { entry: %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 @@ -422,6 +485,9 @@ entry: ; FUNC-LABEL: {{^}}atomic_min_i32_addr64_offset: ; SI: buffer_atomic_smin v{{[0-9]+}}, v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64 offset:16{{$}} +; VI: s_movk_i32 flat_scratch_lo, 0x0 +; VI: s_movk_i32 flat_scratch_hi, 0x0 +; VI: flat_atomic_smin v[{{[0-9]+:[0-9]+}}], v{{[0-9]+$}} define void @atomic_min_i32_addr64_offset(i32 addrspace(1)* %out, i32 %in, i64 %index) { entry: %ptr = getelementptr i32, i32 addrspace(1)* %out, i64 %index @@ -432,7 +498,10 @@ entry: ; FUNC-LABEL: {{^}}atomic_min_i32_ret_addr64_offset: ; SI: buffer_atomic_smin [[RET:v[0-9]+]], v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64 offset:16 glc{{$}} -; SI: buffer_store_dword [[RET]] +; VI: s_movk_i32 flat_scratch_lo, 0x0 +; VI: s_movk_i32 flat_scratch_hi, 0x0 +; VI: flat_atomic_smin [[RET:v[0-9]+]], v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}} glc{{$}} +; GCN: buffer_store_dword [[RET]] define void @atomic_min_i32_ret_addr64_offset(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in, i64 %index) { entry: %ptr = getelementptr i32, i32 addrspace(1)* %out, i64 %index @@ -443,7 +512,7 @@ entry: } ; FUNC-LABEL: {{^}}atomic_min_i32: -; SI: buffer_atomic_smin v{{[0-9]+}}, s[{{[0-9]+}}:{{[0-9]+}}], 0{{$}} +; GCN: buffer_atomic_smin v{{[0-9]+}}, s[{{[0-9]+}}:{{[0-9]+}}], 0{{$}} define void @atomic_min_i32(i32 addrspace(1)* %out, i32 %in) { entry: %0 = atomicrmw volatile min i32 addrspace(1)* %out, i32 %in seq_cst @@ -451,8 +520,8 @@ entry: } ; FUNC-LABEL: {{^}}atomic_min_i32_ret: -; SI: buffer_atomic_smin [[RET:v[0-9]+]], s[{{[0-9]+}}:{{[0-9]+}}], 0 glc -; SI: buffer_store_dword [[RET]] +; GCN: buffer_atomic_smin [[RET:v[0-9]+]], s[{{[0-9]+}}:{{[0-9]+}}], 0 glc +; GCN: buffer_store_dword [[RET]] define void @atomic_min_i32_ret(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in) { entry: %0 = atomicrmw volatile min i32 addrspace(1)* %out, i32 %in seq_cst @@ -462,6 +531,9 @@ entry: ; FUNC-LABEL: {{^}}atomic_min_i32_addr64: ; SI: buffer_atomic_smin v{{[0-9]+}}, v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64{{$}} +; VI: s_movk_i32 flat_scratch_lo, 0x0 +; VI: s_movk_i32 flat_scratch_hi, 0x0 +; VI: flat_atomic_smin v[{{[0-9]+:[0-9]+}}], v{{[0-9]+$}} define void @atomic_min_i32_addr64(i32 addrspace(1)* %out, i32 %in, i64 %index) { entry: %ptr = getelementptr i32, i32 addrspace(1)* %out, i64 
%index @@ -471,7 +543,10 @@ entry: ; FUNC-LABEL: {{^}}atomic_min_i32_ret_addr64: ; SI: buffer_atomic_smin [[RET:v[0-9]+]], v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64 glc{{$}} -; SI: buffer_store_dword [[RET]] +; VI: s_movk_i32 flat_scratch_lo, 0x0 +; VI: s_movk_i32 flat_scratch_hi, 0x0 +; VI: flat_atomic_smin [[RET:v[0-9]+]], v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}} glc{{$}} +; GCN: buffer_store_dword [[RET]] define void @atomic_min_i32_ret_addr64(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in, i64 %index) { entry: %ptr = getelementptr i32, i32 addrspace(1)* %out, i64 %index @@ -481,7 +556,7 @@ entry: } ; FUNC-LABEL: {{^}}atomic_umin_i32_offset: -; SI: buffer_atomic_umin v{{[0-9]+}}, s[{{[0-9]+}}:{{[0-9]+}}], 0 offset:16{{$}} +; GCN: buffer_atomic_umin v{{[0-9]+}}, s[{{[0-9]+}}:{{[0-9]+}}], 0 offset:16{{$}} define void @atomic_umin_i32_offset(i32 addrspace(1)* %out, i32 %in) { entry: %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 @@ -490,8 +565,8 @@ entry: } ; FUNC-LABEL: {{^}}atomic_umin_i32_ret_offset: -; SI: buffer_atomic_umin [[RET:v[0-9]+]], s[{{[0-9]+}}:{{[0-9]+}}], 0 offset:16 glc {{$}} -; SI: buffer_store_dword [[RET]] +; GCN: buffer_atomic_umin [[RET:v[0-9]+]], s[{{[0-9]+}}:{{[0-9]+}}], 0 offset:16 glc {{$}} +; GCN: buffer_store_dword [[RET]] define void @atomic_umin_i32_ret_offset(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in) { entry: %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 @@ -502,6 +577,9 @@ entry: ; FUNC-LABEL: {{^}}atomic_umin_i32_addr64_offset: ; SI: buffer_atomic_umin v{{[0-9]+}}, v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64 offset:16{{$}} +; VI: s_movk_i32 flat_scratch_lo, 0x0 +; VI: s_movk_i32 flat_scratch_hi, 0x0 +; VI: flat_atomic_umin v[{{[0-9]+:[0-9]+}}], v{{[0-9]+$}} define void @atomic_umin_i32_addr64_offset(i32 addrspace(1)* %out, i32 %in, i64 %index) { entry: %ptr = getelementptr i32, i32 addrspace(1)* %out, i64 %index @@ -512,7 +590,10 @@ entry: ; FUNC-LABEL: {{^}}atomic_umin_i32_ret_addr64_offset: ; SI: buffer_atomic_umin [[RET:v[0-9]+]], v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64 offset:16 glc{{$}} -; SI: buffer_store_dword [[RET]] +; VI: s_movk_i32 flat_scratch_lo, 0x0 +; VI: s_movk_i32 flat_scratch_hi, 0x0 +; VI: flat_atomic_umin [[RET:v[0-9]+]], v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}} glc{{$}} +; GCN: buffer_store_dword [[RET]] define void @atomic_umin_i32_ret_addr64_offset(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in, i64 %index) { entry: %ptr = getelementptr i32, i32 addrspace(1)* %out, i64 %index @@ -523,7 +604,7 @@ entry: } ; FUNC-LABEL: {{^}}atomic_umin_i32: -; SI: buffer_atomic_umin v{{[0-9]+}}, s[{{[0-9]+}}:{{[0-9]+}}], 0{{$}} +; GCN: buffer_atomic_umin v{{[0-9]+}}, s[{{[0-9]+}}:{{[0-9]+}}], 0{{$}} define void @atomic_umin_i32(i32 addrspace(1)* %out, i32 %in) { entry: %0 = atomicrmw volatile umin i32 addrspace(1)* %out, i32 %in seq_cst @@ -532,7 +613,7 @@ entry: ; FUNC-LABEL: {{^}}atomic_umin_i32_ret: ; SI: buffer_atomic_umin [[RET:v[0-9]+]], s[{{[0-9]+}}:{{[0-9]+}}], 0 glc -; SI: buffer_store_dword [[RET]] +; GCN: buffer_store_dword [[RET]] define void @atomic_umin_i32_ret(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in) { entry: %0 = atomicrmw volatile umin i32 addrspace(1)* %out, i32 %in seq_cst @@ -542,6 +623,9 @@ entry: ; FUNC-LABEL: {{^}}atomic_umin_i32_addr64: ; SI: buffer_atomic_umin v{{[0-9]+}}, v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64{{$}} +; VI: s_movk_i32 flat_scratch_lo, 0x0 +; VI: s_movk_i32 flat_scratch_hi, 0x0 
+; VI: flat_atomic_umin v[{{[0-9]+:[0-9]+}}], v{{[0-9]+$}} define void @atomic_umin_i32_addr64(i32 addrspace(1)* %out, i32 %in, i64 %index) { entry: %ptr = getelementptr i32, i32 addrspace(1)* %out, i64 %index @@ -551,7 +635,10 @@ entry: ; FUNC-LABEL: {{^}}atomic_umin_i32_ret_addr64: ; SI: buffer_atomic_umin [[RET:v[0-9]+]], v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64 glc{{$}} -; SI: buffer_store_dword [[RET]] +; VI: s_movk_i32 flat_scratch_lo, 0x0 +; VI: s_movk_i32 flat_scratch_hi, 0x0 +; VI: flat_atomic_umin [[RET:v[0-9]+]], v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}} glc{{$}} +; GCN: buffer_store_dword [[RET]] define void @atomic_umin_i32_ret_addr64(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in, i64 %index) { entry: %ptr = getelementptr i32, i32 addrspace(1)* %out, i64 %index @@ -561,7 +648,7 @@ entry: } ; FUNC-LABEL: {{^}}atomic_or_i32_offset: -; SI: buffer_atomic_or v{{[0-9]+}}, s[{{[0-9]+}}:{{[0-9]+}}], 0 offset:16{{$}} +; GCN: buffer_atomic_or v{{[0-9]+}}, s[{{[0-9]+}}:{{[0-9]+}}], 0 offset:16{{$}} define void @atomic_or_i32_offset(i32 addrspace(1)* %out, i32 %in) { entry: %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 @@ -570,8 +657,8 @@ entry: } ; FUNC-LABEL: {{^}}atomic_or_i32_ret_offset: -; SI: buffer_atomic_or [[RET:v[0-9]+]], s[{{[0-9]+}}:{{[0-9]+}}], 0 offset:16 glc {{$}} -; SI: buffer_store_dword [[RET]] +; GCN: buffer_atomic_or [[RET:v[0-9]+]], s[{{[0-9]+}}:{{[0-9]+}}], 0 offset:16 glc {{$}} +; GCN: buffer_store_dword [[RET]] define void @atomic_or_i32_ret_offset(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in) { entry: %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 @@ -582,6 +669,9 @@ entry: ; FUNC-LABEL: {{^}}atomic_or_i32_addr64_offset: ; SI: buffer_atomic_or v{{[0-9]+}}, v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64 offset:16{{$}} +; VI: s_movk_i32 flat_scratch_lo, 0x0 +; VI: s_movk_i32 flat_scratch_hi, 0x0 +; VI: flat_atomic_or v[{{[0-9]+:[0-9]+}}], v{{[0-9]+$}} define void @atomic_or_i32_addr64_offset(i32 addrspace(1)* %out, i32 %in, i64 %index) { entry: %ptr = getelementptr i32, i32 addrspace(1)* %out, i64 %index @@ -592,7 +682,10 @@ entry: ; FUNC-LABEL: {{^}}atomic_or_i32_ret_addr64_offset: ; SI: buffer_atomic_or [[RET:v[0-9]+]], v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64 offset:16 glc{{$}} -; SI: buffer_store_dword [[RET]] +; VI: s_movk_i32 flat_scratch_lo, 0x0 +; VI: s_movk_i32 flat_scratch_hi, 0x0 +; VI: flat_atomic_or [[RET:v[0-9]+]], v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}} glc{{$}} +; GCN: buffer_store_dword [[RET]] define void @atomic_or_i32_ret_addr64_offset(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in, i64 %index) { entry: %ptr = getelementptr i32, i32 addrspace(1)* %out, i64 %index @@ -603,7 +696,7 @@ entry: } ; FUNC-LABEL: {{^}}atomic_or_i32: -; SI: buffer_atomic_or v{{[0-9]+}}, s[{{[0-9]+}}:{{[0-9]+}}], 0{{$}} +; GCN: buffer_atomic_or v{{[0-9]+}}, s[{{[0-9]+}}:{{[0-9]+}}], 0{{$}} define void @atomic_or_i32(i32 addrspace(1)* %out, i32 %in) { entry: %0 = atomicrmw volatile or i32 addrspace(1)* %out, i32 %in seq_cst @@ -611,8 +704,8 @@ entry: } ; FUNC-LABEL: {{^}}atomic_or_i32_ret: -; SI: buffer_atomic_or [[RET:v[0-9]+]], s[{{[0-9]+}}:{{[0-9]+}}], 0 glc -; SI: buffer_store_dword [[RET]] +; GCN: buffer_atomic_or [[RET:v[0-9]+]], s[{{[0-9]+}}:{{[0-9]+}}], 0 glc +; GCN: buffer_store_dword [[RET]] define void @atomic_or_i32_ret(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in) { entry: %0 = atomicrmw volatile or i32 addrspace(1)* %out, i32 %in seq_cst @@ -622,6 +715,9 @@ 
entry: ; FUNC-LABEL: {{^}}atomic_or_i32_addr64: ; SI: buffer_atomic_or v{{[0-9]+}}, v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64{{$}} +; VI: s_movk_i32 flat_scratch_lo, 0x0 +; VI: s_movk_i32 flat_scratch_hi, 0x0 +; VI: flat_atomic_or v[{{[0-9]+:[0-9]+}}], v{{[0-9]+$}} define void @atomic_or_i32_addr64(i32 addrspace(1)* %out, i32 %in, i64 %index) { entry: %ptr = getelementptr i32, i32 addrspace(1)* %out, i64 %index @@ -631,7 +727,10 @@ entry: ; FUNC-LABEL: {{^}}atomic_or_i32_ret_addr64: ; SI: buffer_atomic_or [[RET:v[0-9]+]], v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64 glc{{$}} -; SI: buffer_store_dword [[RET]] +; VI: s_movk_i32 flat_scratch_lo, 0x0 +; VI: s_movk_i32 flat_scratch_hi, 0x0 +; VI: flat_atomic_or [[RET:v[0-9]+]], v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}} glc{{$}} +; GCN: buffer_store_dword [[RET]] define void @atomic_or_i32_ret_addr64(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in, i64 %index) { entry: %ptr = getelementptr i32, i32 addrspace(1)* %out, i64 %index @@ -641,7 +740,7 @@ entry: } ; FUNC-LABEL: {{^}}atomic_xchg_i32_offset: -; SI: buffer_atomic_swap v{{[0-9]+}}, s[{{[0-9]+}}:{{[0-9]+}}], 0 offset:16{{$}} +; GCN: buffer_atomic_swap v{{[0-9]+}}, s[{{[0-9]+}}:{{[0-9]+}}], 0 offset:16{{$}} define void @atomic_xchg_i32_offset(i32 addrspace(1)* %out, i32 %in) { entry: %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 @@ -650,8 +749,8 @@ entry: } ; FUNC-LABEL: {{^}}atomic_xchg_i32_ret_offset: -; SI: buffer_atomic_swap [[RET:v[0-9]+]], s[{{[0-9]+}}:{{[0-9]+}}], 0 offset:16 glc {{$}} -; SI: buffer_store_dword [[RET]] +; GCN: buffer_atomic_swap [[RET:v[0-9]+]], s[{{[0-9]+}}:{{[0-9]+}}], 0 offset:16 glc {{$}} +; GCN: buffer_store_dword [[RET]] define void @atomic_xchg_i32_ret_offset(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in) { entry: %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 @@ -672,7 +771,10 @@ entry: ; FUNC-LABEL: {{^}}atomic_xchg_i32_ret_addr64_offset: ; SI: buffer_atomic_swap [[RET:v[0-9]+]], v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64 offset:16 glc{{$}} -; SI: buffer_store_dword [[RET]] +; VI: s_movk_i32 flat_scratch_lo, 0x0 +; VI: s_movk_i32 flat_scratch_hi, 0x0 +; VI: flat_atomic_swap [[RET:v[0-9]+]], v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}} glc{{$}} +; GCN: buffer_store_dword [[RET]] define void @atomic_xchg_i32_ret_addr64_offset(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in, i64 %index) { entry: %ptr = getelementptr i32, i32 addrspace(1)* %out, i64 %index @@ -683,7 +785,7 @@ entry: } ; FUNC-LABEL: {{^}}atomic_xchg_i32: -; SI: buffer_atomic_swap v{{[0-9]+}}, s[{{[0-9]+}}:{{[0-9]+}}], 0{{$}} +; GCN: buffer_atomic_swap v{{[0-9]+}}, s[{{[0-9]+}}:{{[0-9]+}}], 0{{$}} define void @atomic_xchg_i32(i32 addrspace(1)* %out, i32 %in) { entry: %0 = atomicrmw volatile xchg i32 addrspace(1)* %out, i32 %in seq_cst @@ -691,8 +793,8 @@ entry: } ; FUNC-LABEL: {{^}}atomic_xchg_i32_ret: -; SI: buffer_atomic_swap [[RET:v[0-9]+]], s[{{[0-9]+}}:{{[0-9]+}}], 0 glc -; SI: buffer_store_dword [[RET]] +; GCN: buffer_atomic_swap [[RET:v[0-9]+]], s[{{[0-9]+}}:{{[0-9]+}}], 0 glc +; GCN: buffer_store_dword [[RET]] define void @atomic_xchg_i32_ret(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in) { entry: %0 = atomicrmw volatile xchg i32 addrspace(1)* %out, i32 %in seq_cst @@ -702,6 +804,9 @@ entry: ; FUNC-LABEL: {{^}}atomic_xchg_i32_addr64: ; SI: buffer_atomic_swap v{{[0-9]+}}, v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64{{$}} +; VI: s_movk_i32 flat_scratch_lo, 0x0 +; VI: s_movk_i32 
flat_scratch_hi, 0x0 +; VI: flat_atomic_swap v[{{[0-9]+:[0-9]+}}], v{{[0-9]+$}} define void @atomic_xchg_i32_addr64(i32 addrspace(1)* %out, i32 %in, i64 %index) { entry: %ptr = getelementptr i32, i32 addrspace(1)* %out, i64 %index @@ -711,7 +816,10 @@ entry: ; FUNC-LABEL: {{^}}atomic_xchg_i32_ret_addr64: ; SI: buffer_atomic_swap [[RET:v[0-9]+]], v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64 glc{{$}} -; SI: buffer_store_dword [[RET]] +; VI: s_movk_i32 flat_scratch_lo, 0x0 +; VI: s_movk_i32 flat_scratch_hi, 0x0 +; VI: flat_atomic_swap [[RET:v[0-9]+]], v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}} glc{{$}} +; GCN: buffer_store_dword [[RET]] define void @atomic_xchg_i32_ret_addr64(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in, i64 %index) { entry: %ptr = getelementptr i32, i32 addrspace(1)* %out, i64 %index @@ -721,7 +829,7 @@ entry: } ; FUNC-LABEL: {{^}}atomic_xor_i32_offset: -; SI: buffer_atomic_xor v{{[0-9]+}}, s[{{[0-9]+}}:{{[0-9]+}}], 0 offset:16{{$}} +; GCN: buffer_atomic_xor v{{[0-9]+}}, s[{{[0-9]+}}:{{[0-9]+}}], 0 offset:16{{$}} define void @atomic_xor_i32_offset(i32 addrspace(1)* %out, i32 %in) { entry: %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 @@ -730,8 +838,8 @@ entry: } ; FUNC-LABEL: {{^}}atomic_xor_i32_ret_offset: -; SI: buffer_atomic_xor [[RET:v[0-9]+]], s[{{[0-9]+}}:{{[0-9]+}}], 0 offset:16 glc {{$}} -; SI: buffer_store_dword [[RET]] +; GCN: buffer_atomic_xor [[RET:v[0-9]+]], s[{{[0-9]+}}:{{[0-9]+}}], 0 offset:16 glc {{$}} +; GCN: buffer_store_dword [[RET]] define void @atomic_xor_i32_ret_offset(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in) { entry: %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 @@ -742,6 +850,9 @@ entry: ; FUNC-LABEL: {{^}}atomic_xor_i32_addr64_offset: ; SI: buffer_atomic_xor v{{[0-9]+}}, v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64 offset:16{{$}} +; VI: s_movk_i32 flat_scratch_lo, 0x0 +; VI: s_movk_i32 flat_scratch_hi, 0x0 +; VI: flat_atomic_xor v[{{[0-9]+:[0-9]+}}], v{{[0-9]+$}} define void @atomic_xor_i32_addr64_offset(i32 addrspace(1)* %out, i32 %in, i64 %index) { entry: %ptr = getelementptr i32, i32 addrspace(1)* %out, i64 %index @@ -752,7 +863,10 @@ entry: ; FUNC-LABEL: {{^}}atomic_xor_i32_ret_addr64_offset: ; SI: buffer_atomic_xor [[RET:v[0-9]+]], v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64 offset:16 glc{{$}} -; SI: buffer_store_dword [[RET]] +; VI: s_movk_i32 flat_scratch_lo, 0x0 +; VI: s_movk_i32 flat_scratch_hi, 0x0 +; VI: flat_atomic_xor [[RET:v[0-9]+]], v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}} glc{{$}} +; GCN: buffer_store_dword [[RET]] define void @atomic_xor_i32_ret_addr64_offset(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in, i64 %index) { entry: %ptr = getelementptr i32, i32 addrspace(1)* %out, i64 %index @@ -763,7 +877,7 @@ entry: } ; FUNC-LABEL: {{^}}atomic_xor_i32: -; SI: buffer_atomic_xor v{{[0-9]+}}, s[{{[0-9]+}}:{{[0-9]+}}], 0{{$}} +; GCN: buffer_atomic_xor v{{[0-9]+}}, s[{{[0-9]+}}:{{[0-9]+}}], 0{{$}} define void @atomic_xor_i32(i32 addrspace(1)* %out, i32 %in) { entry: %0 = atomicrmw volatile xor i32 addrspace(1)* %out, i32 %in seq_cst @@ -771,8 +885,8 @@ entry: } ; FUNC-LABEL: {{^}}atomic_xor_i32_ret: -; SI: buffer_atomic_xor [[RET:v[0-9]+]], s[{{[0-9]+}}:{{[0-9]+}}], 0 glc -; SI: buffer_store_dword [[RET]] +; GCN: buffer_atomic_xor [[RET:v[0-9]+]], s[{{[0-9]+}}:{{[0-9]+}}], 0 glc +; GCN: buffer_store_dword [[RET]] define void @atomic_xor_i32_ret(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in) { entry: %0 = atomicrmw volatile xor i32 
addrspace(1)* %out, i32 %in seq_cst @@ -782,6 +896,9 @@ entry: ; FUNC-LABEL: {{^}}atomic_xor_i32_addr64: ; SI: buffer_atomic_xor v{{[0-9]+}}, v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64{{$}} +; VI: s_movk_i32 flat_scratch_lo, 0x0 +; VI: s_movk_i32 flat_scratch_hi, 0x0 +; VI: flat_atomic_xor v[{{[0-9]+:[0-9]+}}], v{{[0-9]+$}} define void @atomic_xor_i32_addr64(i32 addrspace(1)* %out, i32 %in, i64 %index) { entry: %ptr = getelementptr i32, i32 addrspace(1)* %out, i64 %index @@ -791,7 +908,10 @@ entry: ; FUNC-LABEL: {{^}}atomic_xor_i32_ret_addr64: ; SI: buffer_atomic_xor [[RET:v[0-9]+]], v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64 glc{{$}} -; SI: buffer_store_dword [[RET]] +; VI: s_movk_i32 flat_scratch_lo, 0x0 +; VI: s_movk_i32 flat_scratch_hi, 0x0 +; VI: flat_atomic_xor [[RET:v[0-9]+]], v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}} glc{{$}} +; GCN: buffer_store_dword [[RET]] define void @atomic_xor_i32_ret_addr64(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in, i64 %index) { entry: %ptr = getelementptr i32, i32 addrspace(1)* %out, i64 %index diff --git a/test/CodeGen/AMDGPU/gv-const-addrspace.ll b/test/CodeGen/AMDGPU/gv-const-addrspace.ll index 3c1fc6c98f74..d4d13125cfbf 100644 --- a/test/CodeGen/AMDGPU/gv-const-addrspace.ll +++ b/test/CodeGen/AMDGPU/gv-const-addrspace.ll @@ -8,9 +8,7 @@ @float_gv = internal unnamed_addr addrspace(2) constant [5 x float] [float 0.0, float 1.0, float 2.0, float 3.0, float 4.0], align 4 ; FUNC-LABEL: {{^}}float: -; FIXME: We should be using s_load_dword here. -; SI: buffer_load_dword -; VI: s_load_dword +; GCN: s_load_dword ; EG-DAG: MOV {{\** *}}T2.X ; EG-DAG: MOV {{\** *}}T3.X @@ -31,9 +29,7 @@ entry: ; FUNC-LABEL: {{^}}i32: -; FIXME: We should be using s_load_dword here. -; SI: buffer_load_dword -; VI: s_load_dword +; GCN: s_load_dword ; EG-DAG: MOV {{\** *}}T2.X ; EG-DAG: MOV {{\** *}}T3.X @@ -71,9 +67,7 @@ define void @struct_foo_gv_load(i32 addrspace(1)* %out, i32 %index) { <1 x i32> <i32 4> ] ; FUNC-LABEL: {{^}}array_v1_gv_load: -; FIXME: We should be using s_load_dword here. 
-; SI: buffer_load_dword -; VI: s_load_dword +; GCN: s_load_dword define void @array_v1_gv_load(<1 x i32> addrspace(1)* %out, i32 %index) { %gep = getelementptr inbounds [4 x <1 x i32>], [4 x <1 x i32>] addrspace(2)* @array_v1_gv, i32 0, i32 %index %load = load <1 x i32>, <1 x i32> addrspace(2)* %gep, align 4 diff --git a/test/CodeGen/AMDGPU/llvm.AMDGPU.fract.f64.ll b/test/CodeGen/AMDGPU/llvm.AMDGPU.fract.f64.ll index e098dd35d6da..6049dca04012 100644 --- a/test/CodeGen/AMDGPU/llvm.AMDGPU.fract.f64.ll +++ b/test/CodeGen/AMDGPU/llvm.AMDGPU.fract.f64.ll @@ -11,8 +11,8 @@ declare double @llvm.AMDGPU.fract.f64(double) nounwind readnone ; SI: v_mov_b32_e32 v[[UPHI:[0-9]+]], 0x3fefffff ; SI: v_min_f64 v{{\[}}[[MINLO:[0-9]+]]:[[MINHI:[0-9]+]]], v{{\[}}[[UPLO]]:[[UPHI]]], [[FRC]] ; SI: v_cmp_class_f64_e64 [[COND:s\[[0-9]+:[0-9]+\]]], v{{\[}}[[LO]]:[[HI]]], 3 -; SI: v_cndmask_b32_e64 v[[RESLO:[0-9]+]], v[[LO]], v[[MINLO]], [[COND]] -; SI: v_cndmask_b32_e64 v[[RESHI:[0-9]+]], v[[HI]], v[[MINHI]], [[COND]] +; SI: v_cndmask_b32_e64 v[[RESLO:[0-9]+]], v[[MINLO]], v[[LO]], [[COND]] +; SI: v_cndmask_b32_e64 v[[RESHI:[0-9]+]], v[[MINHI]], v[[HI]], [[COND]] ; SI: buffer_store_dwordx2 v{{\[}}[[RESLO]]:[[RESHI]]] ; CI: buffer_store_dwordx2 [[FRC]] define void @fract_f64(double addrspace(1)* %out, double addrspace(1)* %src) nounwind { @@ -28,8 +28,8 @@ define void @fract_f64(double addrspace(1)* %out, double addrspace(1)* %src) nou ; SI: v_mov_b32_e32 v[[UPHI:[0-9]+]], 0x3fefffff ; SI: v_min_f64 v{{\[}}[[MINLO:[0-9]+]]:[[MINHI:[0-9]+]]], v{{\[}}[[UPLO]]:[[UPHI]]], [[FRC]] ; SI: v_cmp_class_f64_e64 [[COND:s\[[0-9]+:[0-9]+\]]], v{{\[}}[[LO]]:[[HI]]], 3 -; SI: v_cndmask_b32_e64 v[[RESLO:[0-9]+]], v[[LO]], v[[MINLO]], [[COND]] -; SI: v_cndmask_b32_e64 v[[RESHI:[0-9]+]], v[[HI]], v[[MINHI]], [[COND]] +; SI: v_cndmask_b32_e64 v[[RESLO:[0-9]+]], v[[MINLO]], v[[LO]], [[COND]] +; SI: v_cndmask_b32_e64 v[[RESHI:[0-9]+]], v[[MINHI]], v[[HI]], [[COND]] ; SI: buffer_store_dwordx2 v{{\[}}[[RESLO]]:[[RESHI]]] ; CI: buffer_store_dwordx2 [[FRC]] define void @fract_f64_neg(double addrspace(1)* %out, double addrspace(1)* %src) nounwind { @@ -46,8 +46,8 @@ define void @fract_f64_neg(double addrspace(1)* %out, double addrspace(1)* %src) ; SI: v_mov_b32_e32 v[[UPHI:[0-9]+]], 0x3fefffff ; SI: v_min_f64 v{{\[}}[[MINLO:[0-9]+]]:[[MINHI:[0-9]+]]], v{{\[}}[[UPLO]]:[[UPHI]]], [[FRC]] ; SI: v_cmp_class_f64_e64 [[COND:s\[[0-9]+:[0-9]+\]]], v{{\[}}[[LO]]:[[HI]]], 3 -; SI: v_cndmask_b32_e64 v[[RESLO:[0-9]+]], v[[LO]], v[[MINLO]], [[COND]] -; SI: v_cndmask_b32_e64 v[[RESHI:[0-9]+]], v[[HI]], v[[MINHI]], [[COND]] +; SI: v_cndmask_b32_e64 v[[RESLO:[0-9]+]], v[[MINLO]], v[[LO]], [[COND]] +; SI: v_cndmask_b32_e64 v[[RESHI:[0-9]+]], v[[MINHI]], v[[HI]], [[COND]] ; SI: buffer_store_dwordx2 v{{\[}}[[RESLO]]:[[RESHI]]] ; CI: buffer_store_dwordx2 [[FRC]] define void @fract_f64_neg_abs(double addrspace(1)* %out, double addrspace(1)* %src) nounwind { diff --git a/test/CodeGen/AMDGPU/private-memory.ll b/test/CodeGen/AMDGPU/private-memory.ll index 1c5629780508..645dc04f4420 100644 --- a/test/CodeGen/AMDGPU/private-memory.ll +++ b/test/CodeGen/AMDGPU/private-memory.ll @@ -298,7 +298,7 @@ entry: ; FUNC-LABEL: ptrtoint: ; SI-NOT: ds_write ; SI: buffer_store_dword v{{[0-9]+}}, v{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}], s{{[0-9]+}} offen -; SI: buffer_load_dword v{{[0-9]+}}, v{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}], s{{[0-9]+}} offen offset:5 +; SI: buffer_load_dword v{{[0-9]+}}, v{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}], s{{[0-9]+}} offen ; define void @ptrtoint(i32 addrspace(1)* 
%out, i32 %a, i32 %b) { %alloca = alloca [16 x i32] %tmp0 = getelementptr [16 x i32], [16 x i32]* %alloca, i32 0, i32 %a diff --git a/test/CodeGen/AMDGPU/scratch-buffer.ll b/test/CodeGen/AMDGPU/scratch-buffer.ll index 56088718ada8..268869daaa32 100644 --- a/test/CodeGen/AMDGPU/scratch-buffer.ll +++ b/test/CodeGen/AMDGPU/scratch-buffer.ll @@ -1,5 +1,7 @@ -; RUN: llc -verify-machineinstrs -march=amdgcn -mcpu=SI < %s | FileCheck %s -; RUN: llc -verify-machineinstrs -march=amdgcn -mcpu=tonga < %s | FileCheck %s +; RUN: llc -verify-machineinstrs -march=amdgcn -mcpu=SI < %s | FileCheck --check-prefix=GCN --check-prefix=DEFAULT-SCRATCH %s +; RUN: llc -verify-machineinstrs -march=amdgcn -mcpu=tonga < %s | FileCheck --check-prefix=GCN --check-prefix=DEFAULT-SCRATCH %s +; RUN: llc -verify-machineinstrs -march=amdgcn -mattr=+huge-scratch-buffer -mcpu=SI < %s | FileCheck --check-prefix=GCN --check-prefix=HUGE-SCRATCH %s +; RUN: llc -verify-machineinstrs -march=amdgcn -mattr=+huge-scratch-buffer -mcpu=tonga < %s | FileCheck --check-prefix=GCN --check-prefix=HUGE-SCRATCH %s ; When a frame index offset is more than 12 bits, make sure we don't store ; it in mubuf's offset field. @@ -8,11 +10,11 @@ ; for both stores. This register is allocated by the register scavenger, so we ; should be able to reuse the same register for each scratch buffer access. -; CHECK-LABEL: {{^}}legal_offset_fi: -; CHECK: v_mov_b32_e32 [[OFFSET:v[0-9]+]], 0{{$}} -; CHECK: buffer_store_dword v{{[0-9]+}}, [[OFFSET]], s[{{[0-9]+}}:{{[0-9]+}}], s{{[0-9]+}} offen -; CHECK: v_mov_b32_e32 [[OFFSET]], 0x8000 -; CHECK: buffer_store_dword v{{[0-9]+}}, [[OFFSET]], s[{{[0-9]+}}:{{[0-9]+}}], s{{[0-9]+}} offen{{$}} +; GCN-LABEL: {{^}}legal_offset_fi: +; GCN: v_mov_b32_e32 [[OFFSET:v[0-9]+]], 0{{$}} +; GCN: buffer_store_dword v{{[0-9]+}}, [[OFFSET]], s[{{[0-9]+}}:{{[0-9]+}}], s{{[0-9]+}} offen +; GCN: v_mov_b32_e32 [[OFFSET]], 0x8000 +; GCN: buffer_store_dword v{{[0-9]+}}, [[OFFSET]], s[{{[0-9]+}}:{{[0-9]+}}], s{{[0-9]+}} offen{{$}} define void @legal_offset_fi(i32 addrspace(1)* %out, i32 %cond, i32 %if_offset, i32 %else_offset) { entry: @@ -47,10 +49,10 @@ done: } -; CHECK-LABEL: {{^}}legal_offset_fi_offset -; CHECK: buffer_store_dword v{{[0-9]+}}, v{{[0-9]+}}, s[{{[0-9]+}}:{{[0-9]+}}], s{{[0-9]+}} offen -; CHECK: v_add_i32_e32 [[OFFSET:v[0-9]+]], 0x8000 -; CHECK: buffer_store_dword v{{[0-9]+}}, [[OFFSET]], s[{{[0-9]+}}:{{[0-9]+}}], s{{[0-9]+}} offen{{$}} +; GCN-LABEL: {{^}}legal_offset_fi_offset +; GCN: buffer_store_dword v{{[0-9]+}}, v{{[0-9]+}}, s[{{[0-9]+}}:{{[0-9]+}}], s{{[0-9]+}} offen +; GCN: v_add_i32_e32 [[OFFSET:v[0-9]+]], 0x8000 +; GCN: buffer_store_dword v{{[0-9]+}}, [[OFFSET]], s[{{[0-9]+}}:{{[0-9]+}}], s{{[0-9]+}} offen{{$}} define void @legal_offset_fi_offset(i32 addrspace(1)* %out, i32 %cond, i32 addrspace(1)* %offsets, i32 %if_offset, i32 %else_offset) { entry: @@ -85,3 +87,30 @@ done: ret void } +; GCN-LABEL: @neg_vaddr_offset +; We can't prove %offset is positive, so we must do the computation with the +; immediate in an add instruction instead of folding offset and the immediate into +; the store instruction.
+; GCN: buffer_store_dword v{{[0-9]+}}, v{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}], s{{[0-9]+}} offen{{$}} +define void @neg_vaddr_offset(i32 %offset) { +entry: + %array = alloca [8192 x i32] + %ptr_offset = add i32 %offset, 4 + %ptr = getelementptr [8192 x i32], [8192 x i32]* %array, i32 0, i32 %ptr_offset + store i32 0, i32* %ptr + ret void +} + +; GCN-LABEL: @pos_vaddr_offset +; DEFAULT-SCRATCH: buffer_store_dword v{{[0-9]+}}, v{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}], s{{[0-9]+}} offen offset:16 +; HUGE-SCRATCH: buffer_store_dword v{{[0-9]+}}, v{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}], s{{[0-9]+}} offen{{$}} +define void @pos_vaddr_offset(i32 addrspace(1)* %out, i32 %offset) { +entry: + %array = alloca [8192 x i32] + %ptr = getelementptr [8192 x i32], [8192 x i32]* %array, i32 0, i32 4 + store i32 0, i32* %ptr + %load_ptr = getelementptr [8192 x i32], [8192 x i32]* %array, i32 0, i32 %offset + %val = load i32, i32* %load_ptr + store i32 %val, i32 addrspace(1)* %out + ret void +} diff --git a/test/CodeGen/AMDGPU/smrd.ll b/test/CodeGen/AMDGPU/smrd.ll index b0c18ca5959c..0598208e1317 100644 --- a/test/CodeGen/AMDGPU/smrd.ll +++ b/test/CodeGen/AMDGPU/smrd.ll @@ -43,13 +43,7 @@ entry: ; GCN-LABEL: {{^}}smrd3: ; FIXME: There are too many copies here because we don't fold immediates ; through REG_SEQUENCE -; SI: s_mov_b32 s[[SLO:[0-9]+]], 0 ; -; SI: s_mov_b32 s[[SHI:[0-9]+]], 4 -; SI: s_mov_b32 s[[SSLO:[0-9]+]], s[[SLO]] -; SI-DAG: v_mov_b32_e32 v[[VLO:[0-9]+]], s[[SSLO]] -; SI-DAG: v_mov_b32_e32 v[[VHI:[0-9]+]], s[[SHI]] -; FIXME: We should be able to use s_load_dword here -; SI: buffer_load_dword v{{[0-9]+}}, v{{\[}}[[VLO]]:[[VHI]]{{\]}}, s[{{[0-9]+:[0-9]+}}], 0 addr64 +; SI: s_load_dwordx2 s[{{[0-9]:[0-9]}}], s[{{[0-9]:[0-9]}}], 0xb ; encoding: [0x0b ; TODO: Add VI checks ; GCN: s_endpgm define void @smrd3(i32 addrspace(1)* %out, i32 addrspace(2)* %ptr) { diff --git a/test/CodeGen/ARM/ldrd.ll b/test/CodeGen/ARM/ldrd.ll index 56cdcaedf900..5411618ed86d 100644 --- a/test/CodeGen/ARM/ldrd.ll +++ b/test/CodeGen/ARM/ldrd.ll @@ -112,10 +112,10 @@ entry: } ; CHECK-LABEL: strd_spill_ldrd_reload: -; A8: strd r1, r0, [sp, #-8]! -; M3: strd r1, r0, [sp, #-8]! -; BASIC: strd r1, r0, [sp, #-8]! -; GREEDY: strd r0, r1, [sp, #-8]!
+; A8: strd r1, r0, [sp] +; M3: strd r1, r0, [sp] +; BASIC: strd r1, r0, [sp] +; GREEDY: strd r0, r1, [sp] ; CHECK: @ InlineAsm Start ; CHECK: @ InlineAsm End ; A8: ldrd r2, r1, [sp] @@ -131,53 +131,5 @@ define void @strd_spill_ldrd_reload(i32 %v0, i32 %v1) { ret void } -declare void @extfunc2(i32*, i32, i32) - -; CHECK-LABEL: ldrd_postupdate_dec: -; CHECK: ldrd r1, r2, [r0], #-8 -; CHECK-NEXT: bl{{x?}} _extfunc -define void @ldrd_postupdate_dec(i32* %p0) { - %p0.1 = getelementptr i32, i32* %p0, i32 1 - %v0 = load i32, i32* %p0 - %v1 = load i32, i32* %p0.1 - %p1 = getelementptr i32, i32* %p0, i32 -2 - call void @extfunc2(i32* %p1, i32 %v0, i32 %v1) - ret void -} - -; CHECK-LABEL: ldrd_postupdate_inc: -; CHECK: ldrd r1, r2, [r0], #8 -; CHECK-NEXT: bl{{x?}} _extfunc -define void @ldrd_postupdate_inc(i32* %p0) { - %p0.1 = getelementptr i32, i32* %p0, i32 1 - %v0 = load i32, i32* %p0 - %v1 = load i32, i32* %p0.1 - %p1 = getelementptr i32, i32* %p0, i32 2 - call void @extfunc2(i32* %p1, i32 %v0, i32 %v1) - ret void -} - -; CHECK-LABEL: strd_postupdate_dec: -; CHECK: strd r1, r2, [r0], #-8 -; CHECK-NEXT: bx lr -define i32* @strd_postupdate_dec(i32* %p0, i32 %v0, i32 %v1) { - %p0.1 = getelementptr i32, i32* %p0, i32 1 - store i32 %v0, i32* %p0 - store i32 %v1, i32* %p0.1 - %p1 = getelementptr i32, i32* %p0, i32 -2 - ret i32* %p1 -} - -; CHECK-LABEL: strd_postupdate_inc: -; CHECK: strd r1, r2, [r0], #8 -; CHECK-NEXT: bx lr -define i32* @strd_postupdate_inc(i32* %p0, i32 %v0, i32 %v1) { - %p0.1 = getelementptr i32, i32* %p0, i32 1 - store i32 %v0, i32* %p0 - store i32 %v1, i32* %p0.1 - %p1 = getelementptr i32, i32* %p0, i32 2 - ret i32* %p1 -} - declare void @llvm.lifetime.start(i64, i8* nocapture) nounwind declare void @llvm.lifetime.end(i64, i8* nocapture) nounwind diff --git a/test/CodeGen/Mips/Fast-ISel/br1.ll b/test/CodeGen/Mips/Fast-ISel/br1.ll index 11842ddc4188..a448e90187cb 100644 --- a/test/CodeGen/Mips/Fast-ISel/br1.ll +++ b/test/CodeGen/Mips/Fast-ISel/br1.ll @@ -1,6 +1,6 @@ -; RUN: llc -march=mipsel -relocation-model=pic -O0 -mips-fast-isel -fast-isel-abort=1 -mcpu=mips32r2 \ +; RUN: llc -march=mipsel -relocation-model=pic -O0 -fast-isel-abort=1 -mcpu=mips32r2 \ ; RUN: < %s | FileCheck %s -; RUN: llc -march=mipsel -relocation-model=pic -O0 -mips-fast-isel -fast-isel-abort=1 -mcpu=mips32 \ +; RUN: llc -march=mipsel -relocation-model=pic -O0 -fast-isel-abort=1 -mcpu=mips32 \ ; RUN: < %s | FileCheck %s @b = global i32 1, align 4 diff --git a/test/CodeGen/Mips/Fast-ISel/bswap1.ll b/test/CodeGen/Mips/Fast-ISel/bswap1.ll index 8ac9753fa463..8f1f703ea078 100644 --- a/test/CodeGen/Mips/Fast-ISel/bswap1.ll +++ b/test/CodeGen/Mips/Fast-ISel/bswap1.ll @@ -1,8 +1,8 @@ ; RUN: llc < %s -march=mipsel -mcpu=mips32 -O0 -relocation-model=pic \ -; RUN: -fast-isel=true -mips-fast-isel -fast-isel-abort=1 | FileCheck %s \ +; RUN: -fast-isel-abort=1 | FileCheck %s \ ; RUN: -check-prefix=ALL -check-prefix=32R1 ; RUN: llc < %s -march=mipsel -mcpu=mips32r2 -O0 -relocation-model=pic \ -; RUN: -fast-isel=true -mips-fast-isel -fast-isel-abort=1 | FileCheck %s \ +; RUN: -fast-isel-abort=1 | FileCheck %s \ ; RUN: -check-prefix=ALL -check-prefix=32R2 @a = global i16 -21829, align 2 diff --git a/test/CodeGen/Mips/Fast-ISel/callabi.ll b/test/CodeGen/Mips/Fast-ISel/callabi.ll index 8f5d68b41f66..34616a50b1a0 100644 --- a/test/CodeGen/Mips/Fast-ISel/callabi.ll +++ b/test/CodeGen/Mips/Fast-ISel/callabi.ll @@ -1,8 +1,8 @@ ; RUN: llc -march=mipsel -mcpu=mips32 -O0 \ -; RUN: -mips-fast-isel -relocation-model=pic 
-fast-isel-abort=1 < %s | \ +; RUN: -relocation-model=pic -fast-isel-abort=1 < %s | \ ; RUN: FileCheck %s -check-prefix=ALL -check-prefix=32R1 ; RUN: llc -march=mipsel -mcpu=mips32r2 -O0 \ -; RUN: -mips-fast-isel -relocation-model=pic -fast-isel-abort=1 < %s | \ +; RUN: -relocation-model=pic -fast-isel-abort=1 < %s | \ ; RUN: FileCheck %s -check-prefix=ALL -check-prefix=32R2 declare void @xb(i8) diff --git a/test/CodeGen/Mips/Fast-ISel/constexpr-address.ll b/test/CodeGen/Mips/Fast-ISel/constexpr-address.ll index df60d8071836..d6d9074c7c19 100644 --- a/test/CodeGen/Mips/Fast-ISel/constexpr-address.ll +++ b/test/CodeGen/Mips/Fast-ISel/constexpr-address.ll @@ -1,7 +1,7 @@ ; RUN: llc -march=mipsel -mcpu=mips32 -relocation-model=pic \ -; RUN: -fast-isel=true -mips-fast-isel -fast-isel-abort=1 < %s | FileCheck %s +; RUN: -fast-isel=true -fast-isel-abort=1 < %s | FileCheck %s ; RUN: llc -march=mipsel -mcpu=mips32r2 -relocation-model=pic \ -; RUN: -fast-isel=true -mips-fast-isel -fast-isel-abort=1 < %s | FileCheck %s +; RUN: -fast-isel=true -fast-isel-abort=1 < %s | FileCheck %s @ARR = external global [10 x i32], align 4 diff --git a/test/CodeGen/Mips/Fast-ISel/div1.ll b/test/CodeGen/Mips/Fast-ISel/div1.ll index 89e7f211251f..89055aa12805 100644 --- a/test/CodeGen/Mips/Fast-ISel/div1.ll +++ b/test/CodeGen/Mips/Fast-ISel/div1.ll @@ -1,7 +1,7 @@ ; RUN: llc < %s -march=mipsel -mcpu=mips32 -O0 -relocation-model=pic \ -; RUN: -fast-isel=true -mips-fast-isel -fast-isel-abort=1 | FileCheck %s +; RUN: -fast-isel-abort=1 | FileCheck %s ; RUN: llc < %s -march=mipsel -mcpu=mips32r2 -O0 -relocation-model=pic \ -; RUN: -fast-isel=true -mips-fast-isel -fast-isel-abort=1 | FileCheck %s +; RUN: -fast-isel-abort=1 | FileCheck %s @sj = global i32 200000, align 4 @sk = global i32 -47, align 4 diff --git a/test/CodeGen/Mips/Fast-ISel/fastalloca.ll b/test/CodeGen/Mips/Fast-ISel/fastalloca.ll index b4a9f1ce7ab0..00bc7f485e08 100644 --- a/test/CodeGen/Mips/Fast-ISel/fastalloca.ll +++ b/test/CodeGen/Mips/Fast-ISel/fastalloca.ll @@ -1,4 +1,4 @@ -; RUN: llc -march=mipsel -relocation-model=pic -O0 -mips-fast-isel -fast-isel-abort=1 -mcpu=mips32r2 \ +; RUN: llc -march=mipsel -relocation-model=pic -O0 -fast-isel-abort=1 -mcpu=mips32r2 \ ; RUN: < %s | FileCheck %s %struct.x = type { i32 } diff --git a/test/CodeGen/Mips/Fast-ISel/fastcc-miss.ll b/test/CodeGen/Mips/Fast-ISel/fastcc-miss.ll new file mode 100644 index 000000000000..d9ce8b3964a4 --- /dev/null +++ b/test/CodeGen/Mips/Fast-ISel/fastcc-miss.ll @@ -0,0 +1,15 @@ +; RUN: llc < %s -march=mipsel -mcpu=mips32r2 -O0 -relocation-model=pic \ +; RUN: -fast-isel-verbose 2>&1 | FileCheck %s + +; CHECK: FastISel missed call: +; CHECK-SAME: %call = call fastcc i32 @foo(i32 signext %a, i32 signext %b) + +define internal i32 @bar(i32 signext %a, i32 signext %b) { + %s = and i32 %a, %b + ret i32 %s +} + +define i32 @foo(i32 signext %a, i32 signext %b) { + %call = call fastcc i32 @foo(i32 signext %a, i32 signext %b) + ret i32 %call +} diff --git a/test/CodeGen/Mips/Fast-ISel/fpcmpa.ll b/test/CodeGen/Mips/Fast-ISel/fpcmpa.ll index 72de888b26e0..e346acfeff13 100644 --- a/test/CodeGen/Mips/Fast-ISel/fpcmpa.ll +++ b/test/CodeGen/Mips/Fast-ISel/fpcmpa.ll @@ -1,6 +1,6 @@ -; RUN: llc -march=mipsel -relocation-model=pic -O0 -mips-fast-isel -fast-isel-abort=1 -mcpu=mips32r2 \ +; RUN: llc -march=mipsel -relocation-model=pic -O0 -fast-isel-abort=1 -mcpu=mips32r2 \ ; RUN: < %s | FileCheck %s -; RUN: llc -march=mipsel -relocation-model=pic -O0 -mips-fast-isel -fast-isel-abort=1 -mcpu=mips32 \ +; 
RUN: llc -march=mipsel -relocation-model=pic -O0 -fast-isel-abort=1 -mcpu=mips32 \ ; RUN: < %s | FileCheck %s @f1 = common global float 0.000000e+00, align 4 diff --git a/test/CodeGen/Mips/Fast-ISel/fpext.ll b/test/CodeGen/Mips/Fast-ISel/fpext.ll index 5ac22490ff02..f78289f40a02 100644 --- a/test/CodeGen/Mips/Fast-ISel/fpext.ll +++ b/test/CodeGen/Mips/Fast-ISel/fpext.ll @@ -1,6 +1,6 @@ -; RUN: llc -march=mipsel -relocation-model=pic -O0 -mips-fast-isel -fast-isel-abort=1 -mcpu=mips32r2 \ +; RUN: llc -march=mipsel -relocation-model=pic -O0 -fast-isel-abort=1 -mcpu=mips32r2 \ ; RUN: < %s | FileCheck %s -; RUN: llc -march=mipsel -relocation-model=pic -O0 -mips-fast-isel -fast-isel-abort=1 -mcpu=mips32 \ +; RUN: llc -march=mipsel -relocation-model=pic -O0 -fast-isel-abort=1 -mcpu=mips32 \ ; RUN: < %s | FileCheck %s @f = global float 0x40147E6B80000000, align 4 diff --git a/test/CodeGen/Mips/Fast-ISel/fpintconv.ll b/test/CodeGen/Mips/Fast-ISel/fpintconv.ll index a94ef5081539..2c022be5b3f7 100644 --- a/test/CodeGen/Mips/Fast-ISel/fpintconv.ll +++ b/test/CodeGen/Mips/Fast-ISel/fpintconv.ll @@ -1,6 +1,6 @@ -; RUN: llc -march=mipsel -relocation-model=pic -O0 -mips-fast-isel -fast-isel-abort=1 -mcpu=mips32r2 \ +; RUN: llc -march=mipsel -relocation-model=pic -O0 -fast-isel-abort=1 -mcpu=mips32r2 \ ; RUN: < %s | FileCheck %s -; RUN: llc -march=mipsel -relocation-model=pic -O0 -mips-fast-isel -fast-isel-abort=1 -mcpu=mips32 \ +; RUN: llc -march=mipsel -relocation-model=pic -O0 -fast-isel-abort=1 -mcpu=mips32 \ ; RUN: < %s | FileCheck %s diff --git a/test/CodeGen/Mips/Fast-ISel/fptrunc.ll b/test/CodeGen/Mips/Fast-ISel/fptrunc.ll index 2eec4c3ef547..89a7bfce5b05 100644 --- a/test/CodeGen/Mips/Fast-ISel/fptrunc.ll +++ b/test/CodeGen/Mips/Fast-ISel/fptrunc.ll @@ -1,6 +1,6 @@ -; RUN: llc -march=mipsel -relocation-model=pic -O0 -mips-fast-isel -fast-isel-abort=1 -mcpu=mips32r2 \ +; RUN: llc -march=mipsel -relocation-model=pic -O0 -fast-isel-abort=1 -mcpu=mips32r2 \ ; RUN: < %s | FileCheck %s -; RUN: llc -march=mipsel -relocation-model=pic -O0 -mips-fast-isel -fast-isel-abort=1 -mcpu=mips32 \ +; RUN: llc -march=mipsel -relocation-model=pic -O0 -fast-isel-abort=1 -mcpu=mips32 \ ; RUN: < %s | FileCheck %s @d = global double 0x40147E6B74DF0446, align 8 diff --git a/test/CodeGen/Mips/Fast-ISel/icmpa.ll b/test/CodeGen/Mips/Fast-ISel/icmpa.ll index 670a8d5cfb4e..fc37e118e755 100644 --- a/test/CodeGen/Mips/Fast-ISel/icmpa.ll +++ b/test/CodeGen/Mips/Fast-ISel/icmpa.ll @@ -1,6 +1,6 @@ -; RUN: llc -march=mipsel -relocation-model=pic -O0 -mips-fast-isel -fast-isel-abort=1 -mcpu=mips32r2 \ +; RUN: llc -march=mipsel -relocation-model=pic -O0 -fast-isel-abort=1 -mcpu=mips32r2 \ ; RUN: < %s | FileCheck %s -; RUN: llc -march=mipsel -relocation-model=pic -O0 -mips-fast-isel -fast-isel-abort=1 -mcpu=mips32 \ +; RUN: llc -march=mipsel -relocation-model=pic -O0 -fast-isel-abort=1 -mcpu=mips32 \ ; RUN: < %s | FileCheck %s @c = global i32 4, align 4 diff --git a/test/CodeGen/Mips/Fast-ISel/loadstore2.ll b/test/CodeGen/Mips/Fast-ISel/loadstore2.ll index 3daf03d681cb..46f7a42a5fef 100644 --- a/test/CodeGen/Mips/Fast-ISel/loadstore2.ll +++ b/test/CodeGen/Mips/Fast-ISel/loadstore2.ll @@ -4,9 +4,9 @@ target triple = "mips--linux-gnu" @c2 = common global i8 0, align 1 @c1 = common global i8 0, align 1 -; RUN: llc -march=mipsel -relocation-model=pic -O0 -mips-fast-isel -fast-isel-abort=1 -mcpu=mips32r2 \ +; RUN: llc -march=mipsel -relocation-model=pic -O0 -fast-isel-abort=1 -mcpu=mips32r2 \ ; RUN: < %s | FileCheck %s -; RUN: llc 
-march=mipsel -relocation-model=pic -O0 -mips-fast-isel -fast-isel-abort=1 -mcpu=mips32 \ +; RUN: llc -march=mipsel -relocation-model=pic -O0 -fast-isel-abort=1 -mcpu=mips32 \ ; RUN: < %s | FileCheck %s @s2 = common global i16 0, align 2 diff --git a/test/CodeGen/Mips/Fast-ISel/loadstoreconv.ll b/test/CodeGen/Mips/Fast-ISel/loadstoreconv.ll index acba132b28e1..09b56d2c87ec 100644 --- a/test/CodeGen/Mips/Fast-ISel/loadstoreconv.ll +++ b/test/CodeGen/Mips/Fast-ISel/loadstoreconv.ll @@ -1,10 +1,10 @@ -; RUN: llc -march=mipsel -relocation-model=pic -O0 -mips-fast-isel -fast-isel-abort=1 -mcpu=mips32r2 \ +; RUN: llc -march=mipsel -relocation-model=pic -O0 -fast-isel-abort=1 -mcpu=mips32r2 \ ; RUN: < %s | FileCheck %s -; RUN: llc -march=mipsel -relocation-model=pic -O0 -mips-fast-isel -fast-isel-abort=1 -mcpu=mips32 \ +; RUN: llc -march=mipsel -relocation-model=pic -O0 -fast-isel-abort=1 -mcpu=mips32 \ ; RUN: < %s | FileCheck %s -; RUN: llc -march=mipsel -relocation-model=pic -O0 -mips-fast-isel -fast-isel-abort=1 -mcpu=mips32r2 \ +; RUN: llc -march=mipsel -relocation-model=pic -O0 -fast-isel-abort=1 -mcpu=mips32r2 \ ; RUN: < %s | FileCheck %s -check-prefix=mips32r2 -; RUN: llc -march=mipsel -relocation-model=pic -O0 -mips-fast-isel -fast-isel-abort=1 -mcpu=mips32 \ +; RUN: llc -march=mipsel -relocation-model=pic -O0 -fast-isel-abort=1 -mcpu=mips32 \ ; RUN: < %s | FileCheck %s -check-prefix=mips32 @b2 = global i8 0, align 1 diff --git a/test/CodeGen/Mips/Fast-ISel/loadstrconst.ll b/test/CodeGen/Mips/Fast-ISel/loadstrconst.ll index 9f644ecd1875..1051b2800e5b 100644 --- a/test/CodeGen/Mips/Fast-ISel/loadstrconst.ll +++ b/test/CodeGen/Mips/Fast-ISel/loadstrconst.ll @@ -1,6 +1,6 @@ -; RUN: llc -march=mipsel -relocation-model=pic -O0 -mips-fast-isel -fast-isel-abort=1 -mcpu=mips32r2 \ +; RUN: llc -march=mipsel -relocation-model=pic -O0 -fast-isel-abort=1 -mcpu=mips32r2 \ ; RUN: < %s | FileCheck %s -; RUN: llc -march=mipsel -relocation-model=pic -O0 -mips-fast-isel -fast-isel-abort=1 -mcpu=mips32 \ +; RUN: llc -march=mipsel -relocation-model=pic -O0 -fast-isel-abort=1 -mcpu=mips32 \ ; RUN: < %s | FileCheck %s @.str = private unnamed_addr constant [6 x i8] c"hello\00", align 1 diff --git a/test/CodeGen/Mips/Fast-ISel/logopm.ll b/test/CodeGen/Mips/Fast-ISel/logopm.ll index 0f0c3bf9e1dc..fec85092fffd 100644 --- a/test/CodeGen/Mips/Fast-ISel/logopm.ll +++ b/test/CodeGen/Mips/Fast-ISel/logopm.ll @@ -1,5 +1,5 @@ -; RUN: llc -march=mipsel -relocation-model=pic -O0 -fast-isel -mips-fast-isel -fast-isel-abort=1 -mcpu=mips32r2 < %s | FileCheck %s -; RUN: llc -march=mipsel -relocation-model=pic -O0 -fast-isel -mips-fast-isel -fast-isel-abort=1 -mcpu=mips32 < %s | FileCheck %s +; RUN: llc -march=mipsel -relocation-model=pic -O0 -fast-isel-abort=1 -mcpu=mips32r2 < %s | FileCheck %s +; RUN: llc -march=mipsel -relocation-model=pic -O0 -fast-isel-abort=1 -mcpu=mips32 < %s | FileCheck %s @ub1 = common global i8 0, align 1 @ub2 = common global i8 0, align 1 @@ -283,8 +283,8 @@ entry: ; CHECK-DAG: lw $[[UC_ADDR:[0-9]+]], %got(uc)($[[REG_GP]]) ; CHECK-DAG: lw $[[UC1_ADDR:[0-9]+]], %got(uc1)($[[REG_GP]]) ; CHECK-DAG: lbu $[[UC1:[0-9]+]], 0($[[UC1_ADDR]]) -; CHECK-DAG: addiu $[[CONST_Neg89:[0-9]+]], $zero, -89 -; CHECK-DAG: and $[[RES:[0-9]+]], $[[UC1]], $[[CONST_Neg89]] +; CHECK-DAG: addiu $[[CONST_167:[0-9]+]], $zero, 167 +; CHECK-DAG: and $[[RES:[0-9]+]], $[[UC1]], $[[CONST_167]] ; CHECK: sb $[[RES]], 0($[[UC_ADDR]]) ; CHECK: .end andUc1 ret void @@ -345,8 +345,8 @@ entry: ; CHECK-DAG: lw $[[UC_ADDR:[0-9]+]], 
%got(uc)($[[REG_GP]]) ; CHECK-DAG: lw $[[UC1_ADDR:[0-9]+]], %got(uc1)($[[REG_GP]]) ; CHECK-DAG: lbu $[[UC1:[0-9]+]], 0($[[UC1_ADDR]]) -; CHECK-DAG: addiu $[[CONST_neg18:[0-9]+]], $zero, -18 -; CHECK-DAG: or $[[RES:[0-9]+]], $[[UC1]], $[[CONST_neg18]] +; CHECK-DAG: addiu $[[CONST_238:[0-9]+]], $zero, 238 +; CHECK-DAG: or $[[RES:[0-9]+]], $[[UC1]], $[[CONST_238]] ; CHECK: sb $[[RES]], 0($[[UC_ADDR]]) ; CHECK: .end orUc1 ret void @@ -469,8 +469,8 @@ entry: ; CHECK-DAG: lw $[[US_ADDR:[0-9]+]], %got(us)($[[REG_GP]]) ; CHECK-DAG: lw $[[US1_ADDR:[0-9]+]], %got(us1)($[[REG_GP]]) ; CHECK-DAG: lhu $[[US1:[0-9]+]], 0($[[US1_ADDR]]) -; CHECK-DAG: addiu $[[CONST_Neg4185:[0-9]+]], $zero, -4185 -; CHECK-DAG: and $[[RES:[0-9]+]], $[[US1]], $[[CONST_Neg4185]] +; CHECK-DAG: ori $[[CONST_61351:[0-9]+]], $zero, 61351 +; CHECK-DAG: and $[[RES:[0-9]+]], $[[US1]], $[[CONST_61351]] ; CHECK: sh $[[RES]], 0($[[US_ADDR]]) ; CHECK: .end andUs1 ret void @@ -520,8 +520,8 @@ entry: ; CHECK-DAG: lw $[[US_ADDR:[0-9]+]], %got(us)($[[REG_GP]]) ; CHECK-DAG: lw $[[US1_ADDR:[0-9]+]], %got(us1)($[[REG_GP]]) ; CHECK-DAG: lhu $[[US1:[0-9]+]], 0($[[US1_ADDR]]) -; CHECK-DAG: addiu $[[CONST_neg4591:[0-9]+]], $zero, -4591 -; CHECK-DAG: or $[[RES:[0-9]+]], $[[US1]], $[[CONST_neg4591]] +; CHECK-DAG: ori $[[CONST_60945:[0-9]+]], $zero, 60945 +; CHECK-DAG: or $[[RES:[0-9]+]], $[[US1]], $[[CONST_60945]] ; CHECK: sh $[[RES]], 0($[[US_ADDR]]) ; CHECK: .end orUs1 ret void @@ -583,8 +583,8 @@ entry: ; CHECK-DAG: lw $[[US_ADDR:[0-9]+]], %got(us)($[[REG_GP]]) ; CHECK-DAG: lw $[[US1_ADDR:[0-9]+]], %got(us1)($[[REG_GP]]) ; CHECK-DAG: lhu $[[US1:[0-9]+]], 0($[[US1_ADDR]]) -; CHECK-DAG: addiu $[[CONST_Neg5512:[0-9]+]], $zero, -5512 -; CHECK-DAG: xor $[[RES:[0-9]+]], $[[US1]], $[[CONST_Neg5512]] +; CHECK-DAG: ori $[[CONST_60024:[0-9]+]], $zero, 60024 +; CHECK-DAG: xor $[[RES:[0-9]+]], $[[US1]], $[[CONST_60024]] ; CHECK: sh $[[RES]], 0($[[US_ADDR]]) ; CHECK: .end xorUs1 ret void diff --git a/test/CodeGen/Mips/Fast-ISel/memtest1.ll b/test/CodeGen/Mips/Fast-ISel/memtest1.ll index a3fc4a32981c..b98200d7456d 100644 --- a/test/CodeGen/Mips/Fast-ISel/memtest1.ll +++ b/test/CodeGen/Mips/Fast-ISel/memtest1.ll @@ -1,8 +1,8 @@ ; RUN: llc < %s -march=mipsel -mcpu=mips32 -O0 -relocation-model=pic \ -; RUN: -fast-isel=true -mips-fast-isel -fast-isel-abort=1 | FileCheck %s \ +; RUN: -fast-isel-abort=1 | FileCheck %s \ ; RUN: -check-prefix=ALL -check-prefix=32R1 ; RUN: llc < %s -march=mipsel -mcpu=mips32r2 -O0 -relocation-model=pic \ -; RUN: -fast-isel=true -mips-fast-isel -fast-isel-abort=1 | FileCheck %s \ +; RUN: -fast-isel-abort=1 | FileCheck %s \ ; RUN: -check-prefix=ALL -check-prefix=32R2 @str = private unnamed_addr constant [12 x i8] c"hello there\00", align 1 diff --git a/test/CodeGen/Mips/Fast-ISel/mul1.ll b/test/CodeGen/Mips/Fast-ISel/mul1.ll index 0ee044bea0a7..8713e7ef1d96 100644 --- a/test/CodeGen/Mips/Fast-ISel/mul1.ll +++ b/test/CodeGen/Mips/Fast-ISel/mul1.ll @@ -1,7 +1,5 @@ -; RUN: llc < %s -march=mipsel -mcpu=mips32 -O0 \ -; RUN: -fast-isel -mips-fast-isel -relocation-model=pic -; RUN: llc < %s -march=mipsel -mcpu=mips32r2 -O0 \ -; RUN: -fast-isel -mips-fast-isel -relocation-model=pic +; RUN: llc < %s -march=mipsel -mcpu=mips32 -O0 -relocation-model=pic +; RUN: llc < %s -march=mipsel -mcpu=mips32r2 -O0 -relocation-model=pic ; The test is just to make sure it is able to allocate ; registers for this example. 
There was an issue with allocating AC0 diff --git a/test/CodeGen/Mips/Fast-ISel/nullvoid.ll b/test/CodeGen/Mips/Fast-ISel/nullvoid.ll index 5fa3f13ace4c..106015e30c35 100644 --- a/test/CodeGen/Mips/Fast-ISel/nullvoid.ll +++ b/test/CodeGen/Mips/Fast-ISel/nullvoid.ll @@ -1,6 +1,6 @@ -; RUN: llc -march=mipsel -relocation-model=pic -O0 -mips-fast-isel -fast-isel-abort=1 -mcpu=mips32r2 \ +; RUN: llc -march=mipsel -relocation-model=pic -O0 -fast-isel-abort=1 -mcpu=mips32r2 \ ; RUN: < %s | FileCheck %s -; RUN: llc -march=mipsel -relocation-model=pic -O0 -mips-fast-isel -fast-isel-abort=1 -mcpu=mips32 \ +; RUN: llc -march=mipsel -relocation-model=pic -O0 -fast-isel-abort=1 -mcpu=mips32 \ ; RUN: < %s | FileCheck %s ; Function Attrs: nounwind diff --git a/test/CodeGen/Mips/Fast-ISel/overflt.ll b/test/CodeGen/Mips/Fast-ISel/overflt.ll index 57f991e23d95..37e87b29c58e 100644 --- a/test/CodeGen/Mips/Fast-ISel/overflt.ll +++ b/test/CodeGen/Mips/Fast-ISel/overflt.ll @@ -1,6 +1,6 @@ -; RUN: llc -march=mipsel -relocation-model=pic -O0 -mips-fast-isel -fast-isel-abort=1 -mcpu=mips32r2 \ +; RUN: llc -march=mipsel -relocation-model=pic -O0 -fast-isel-abort=1 -mcpu=mips32r2 \ ; RUN: < %s | FileCheck %s -; RUN: llc -march=mipsel -relocation-model=pic -O0 -mips-fast-isel -fast-isel-abort=1 -mcpu=mips32 \ +; RUN: llc -march=mipsel -relocation-model=pic -O0 -fast-isel-abort=1 -mcpu=mips32 \ ; RUN: < %s | FileCheck %s @x = common global [128000 x float] zeroinitializer, align 4 diff --git a/test/CodeGen/Mips/Fast-ISel/rem1.ll b/test/CodeGen/Mips/Fast-ISel/rem1.ll index 9b5e440d0eaa..cf709e7e4954 100644 --- a/test/CodeGen/Mips/Fast-ISel/rem1.ll +++ b/test/CodeGen/Mips/Fast-ISel/rem1.ll @@ -1,7 +1,7 @@ ; RUN: llc < %s -march=mipsel -mcpu=mips32 -O0 -relocation-model=pic \ -; RUN: -fast-isel=true -mips-fast-isel -fast-isel-abort=1 | FileCheck %s +; RUN: -fast-isel-abort=1 | FileCheck %s ; RUN: llc < %s -march=mipsel -mcpu=mips32r2 -O0 -relocation-model=pic \ -; RUN: -fast-isel=true -mips-fast-isel -fast-isel-abort=1 | FileCheck %s +; RUN: -fast-isel-abort=1 | FileCheck %s @sj = global i32 200, align 4 @sk = global i32 -47, align 4 diff --git a/test/CodeGen/Mips/Fast-ISel/retabi.ll b/test/CodeGen/Mips/Fast-ISel/retabi.ll index 03119b827eb6..20747c4ed206 100644 --- a/test/CodeGen/Mips/Fast-ISel/retabi.ll +++ b/test/CodeGen/Mips/Fast-ISel/retabi.ll @@ -1,4 +1,4 @@ -; RUN: llc -march=mipsel -relocation-model=pic -O0 -mips-fast-isel -fast-isel-abort=1 -mcpu=mips32r2 \ +; RUN: llc -march=mipsel -relocation-model=pic -O0 -fast-isel-abort=1 -mcpu=mips32r2 \ ; RUN: < %s | FileCheck %s @i = global i32 75, align 4 diff --git a/test/CodeGen/Mips/Fast-ISel/sel1.ll b/test/CodeGen/Mips/Fast-ISel/sel1.ll index 47b6a895cde8..8f762b0ed088 100644 --- a/test/CodeGen/Mips/Fast-ISel/sel1.ll +++ b/test/CodeGen/Mips/Fast-ISel/sel1.ll @@ -1,5 +1,5 @@ ; RUN: llc < %s -march=mipsel -mcpu=mips32r2 -O2 -relocation-model=pic \ -; RUN: -fast-isel -mips-fast-isel -fast-isel-abort=1 | FileCheck %s +; RUN: -fast-isel -fast-isel-abort=1 | FileCheck %s define i1 @sel_i1(i1 %j, i1 %k, i1 %l) { entry: @@ -8,7 +8,8 @@ entry: ; FIXME: The following instruction is redundant. 
; CHECK: xor $[[T0:[0-9]+]], $4, $zero ; CHECK-NEXT: sltu $[[T1:[0-9]+]], $zero, $[[T0]] - ; CHECK-NEXT: movn $6, $5, $[[T1]] + ; CHECK-NEXT: andi $[[T2:[0-9]+]], $[[T1]], 1 + ; CHECK-NEXT: movn $6, $5, $[[T2]] ; CHECK: move $2, $6 %cond = icmp ne i1 %j, 0 %res = select i1 %cond, i1 %k, i1 %l @@ -24,7 +25,8 @@ entry: ; CHECK-DAG: seb $[[T1:[0-9]+]], $zero ; CHECK: xor $[[T2:[0-9]+]], $[[T0]], $[[T1]] ; CHECK-NEXT: sltu $[[T3:[0-9]+]], $zero, $[[T2]] - ; CHECK-NEXT: movn $6, $5, $[[T3]] + ; CHECK-NEXT: andi $[[T4:[0-9]+]], $[[T3]], 1 + ; CHECK-NEXT: movn $6, $5, $[[T4]] ; CHECK: move $2, $6 %cond = icmp ne i8 %j, 0 %res = select i1 %cond, i8 %k, i8 %l @@ -40,7 +42,8 @@ entry: ; CHECK-DAG: seh $[[T1:[0-9]+]], $zero ; CHECK: xor $[[T2:[0-9]+]], $[[T0]], $[[T1]] ; CHECK-NEXT: sltu $[[T3:[0-9]+]], $zero, $[[T2]] - ; CHECK-NEXT: movn $6, $5, $[[T3]] + ; CHECK-NEXT: andi $[[T4:[0-9]+]], $[[T3]], 1 + ; CHECK-NEXT: movn $6, $5, $[[T4]] ; CHECK: move $2, $6 %cond = icmp ne i16 %j, 0 %res = select i1 %cond, i16 %k, i16 %l @@ -54,7 +57,8 @@ entry: ; FIXME: The following instruction is redundant. ; CHECK: xor $[[T0:[0-9]+]], $4, $zero ; CHECK-NEXT: sltu $[[T1:[0-9]+]], $zero, $[[T0]] - ; CHECK-NEXT: movn $6, $5, $[[T1]] + ; CHECK-NEXT: andi $[[T2:[0-9]+]], $[[T1]], 1 + ; CHECK-NEXT: movn $6, $5, $[[T2]] ; CHECK: move $2, $6 %cond = icmp ne i32 %j, 0 %res = select i1 %cond, i32 %k, i32 %l @@ -69,7 +73,8 @@ entry: ; CHECK-DAG: mtc1 $5, $f1 ; CHECK-DAG: xor $[[T0:[0-9]+]], $4, $zero ; CHECK: sltu $[[T1:[0-9]+]], $zero, $[[T0]] - ; CHECK: movn.s $f0, $f1, $[[T1]] + ; CHECK-NEXT: andi $[[T2:[0-9]+]], $[[T1]], 1 + ; CHECK: movn.s $f0, $f1, $[[T2]] %cond = icmp ne i32 %j, 0 %res = select i1 %cond, float %k, float %l ret float %res @@ -84,7 +89,8 @@ entry: ; CHECK-DAG: ldc1 $f0, 16($sp) ; CHECK-DAG: xor $[[T0:[0-9]+]], $4, $zero ; CHECK: sltu $[[T1:[0-9]+]], $zero, $[[T0]] - ; CHECK: movn.d $f0, $f2, $[[T1]] + ; CHECK-NEXT: andi $[[T2:[0-9]+]], $[[T1]], 1 + ; CHECK: movn.d $f0, $f2, $[[T2]] %cond = icmp ne i32 %j, 0 %res = select i1 %cond, double %k, double %l ret double %res diff --git a/test/CodeGen/Mips/Fast-ISel/shftopm.ll b/test/CodeGen/Mips/Fast-ISel/shftopm.ll index 90ddd190be13..bbea9c5566c5 100644 --- a/test/CodeGen/Mips/Fast-ISel/shftopm.ll +++ b/test/CodeGen/Mips/Fast-ISel/shftopm.ll @@ -1,6 +1,6 @@ -; RUN: llc -march=mipsel -relocation-model=pic -O0 -mips-fast-isel \ +; RUN: llc -march=mipsel -relocation-model=pic -O0 \ ; RUN: -fast-isel-abort=1 -mcpu=mips32r2 < %s | FileCheck %s -; RUN: llc -march=mipsel -relocation-model=pic -O0 -mips-fast-isel \ +; RUN: llc -march=mipsel -relocation-model=pic -O0 \ ; RUN: -fast-isel-abort=1 -mcpu=mips32 < %s | FileCheck %s @s1 = global i16 -89, align 2 diff --git a/test/CodeGen/Mips/Fast-ISel/shift.ll b/test/CodeGen/Mips/Fast-ISel/shift.ll index df1c82700d59..9fe694bb5827 100644 --- a/test/CodeGen/Mips/Fast-ISel/shift.ll +++ b/test/CodeGen/Mips/Fast-ISel/shift.ll @@ -1,4 +1,4 @@ -; RUN: llc -march=mipsel -mcpu=mips32r2 -O1 -fast-isel=true -mips-fast-isel -filetype=obj %s -o - \ +; RUN: llc -march=mipsel -mcpu=mips32r2 -O0 -fast-isel=true -filetype=obj %s -o - \ ; RUN: | llvm-objdump -arch mipsel -mcpu=mips32r2 -d - | FileCheck %s ; This test checks that encoding for srl is correct when fast-isel for mips32r2 is used. 
diff --git a/test/CodeGen/Mips/Fast-ISel/simplestore.ll b/test/CodeGen/Mips/Fast-ISel/simplestore.ll index bcb198b1a823..627a383f597c 100644 --- a/test/CodeGen/Mips/Fast-ISel/simplestore.ll +++ b/test/CodeGen/Mips/Fast-ISel/simplestore.ll @@ -1,6 +1,6 @@ -; RUN: llc -march=mipsel -relocation-model=pic -O0 -mips-fast-isel -fast-isel-abort=1 -mcpu=mips32r2 \ +; RUN: llc -march=mipsel -relocation-model=pic -O0 -fast-isel-abort=1 -mcpu=mips32r2 \ ; RUN: < %s | FileCheck %s -; RUN: llc -march=mipsel -relocation-model=pic -O0 -mips-fast-isel -fast-isel-abort=1 -mcpu=mips32 \ +; RUN: llc -march=mipsel -relocation-model=pic -O0 -fast-isel-abort=1 -mcpu=mips32 \ ; RUN: < %s | FileCheck %s @abcd = external global i32 diff --git a/test/CodeGen/Mips/Fast-ISel/simplestorefp1.ll b/test/CodeGen/Mips/Fast-ISel/simplestorefp1.ll index f4b91d850255..62101d8ef7eb 100644 --- a/test/CodeGen/Mips/Fast-ISel/simplestorefp1.ll +++ b/test/CodeGen/Mips/Fast-ISel/simplestorefp1.ll @@ -1,10 +1,10 @@ -; RUN: llc -march=mipsel -relocation-model=pic -O0 -mips-fast-isel -fast-isel-abort=1 -mcpu=mips32r2 \ +; RUN: llc -march=mipsel -relocation-model=pic -O0 -fast-isel-abort=1 -mcpu=mips32r2 \ ; RUN: < %s | FileCheck %s -; RUN: llc -march=mipsel -relocation-model=pic -O0 -mips-fast-isel -fast-isel-abort=1 -mcpu=mips32 \ +; RUN: llc -march=mipsel -relocation-model=pic -O0 -fast-isel-abort=1 -mcpu=mips32 \ ; RUN: < %s | FileCheck %s -; RUN: llc -march=mipsel -relocation-model=pic -O0 -mips-fast-isel -fast-isel-abort=1 -mcpu=mips32r2 \ +; RUN: llc -march=mipsel -relocation-model=pic -O0 -fast-isel-abort=1 -mcpu=mips32r2 \ ; RUN: < %s | FileCheck %s -check-prefix=mips32r2 -; RUN: llc -march=mipsel -relocation-model=pic -O0 -mips-fast-isel -fast-isel-abort=1 -mcpu=mips32 \ +; RUN: llc -march=mipsel -relocation-model=pic -O0 -fast-isel-abort=1 -mcpu=mips32 \ ; RUN: < %s | FileCheck %s -check-prefix=mips32 @f = common global float 0.000000e+00, align 4 diff --git a/test/CodeGen/Mips/Fast-ISel/simplestorei.ll b/test/CodeGen/Mips/Fast-ISel/simplestorei.ll index 83ccae0b1de5..67541b54bae7 100644 --- a/test/CodeGen/Mips/Fast-ISel/simplestorei.ll +++ b/test/CodeGen/Mips/Fast-ISel/simplestorei.ll @@ -1,6 +1,6 @@ -; RUN: llc -march=mipsel -relocation-model=pic -O0 -mips-fast-isel -fast-isel-abort=1 -mcpu=mips32r2 \ +; RUN: llc -march=mipsel -relocation-model=pic -O0 -fast-isel-abort=1 -mcpu=mips32r2 \ ; RUN: < %s | FileCheck %s -; RUN: llc -march=mipsel -relocation-model=pic -O0 -mips-fast-isel -fast-isel-abort=1 -mcpu=mips32 \ +; RUN: llc -march=mipsel -relocation-model=pic -O0 -fast-isel-abort=1 -mcpu=mips32 \ ; RUN: < %s | FileCheck %s @ijk = external global i32 @@ -22,9 +22,10 @@ define void @si2_2() #0 { entry: store i32 -32768, i32* @ijk, align 4 ; CHECK: .ent si2_2 -; CHECK: addiu $[[REG1:[0-9]+]], $zero, -32768 -; CHECK: lw $[[REG2:[0-9]+]], %got(ijk)(${{[0-9]+}}) -; CHECK: sw $[[REG1]], 0($[[REG2]]) +; CHECK: lui $[[REG1:[0-9]+]], 65535 +; CHECK: ori $[[REG2:[0-9]+]], $[[REG1]], 32768 +; CHECK: lw $[[REG3:[0-9]+]], %got(ijk)(${{[0-9]+}}) +; CHECK: sw $[[REG2]], 0($[[REG3]]) ret void } diff --git a/test/CodeGen/Mips/delay-slot-kill.ll b/test/CodeGen/Mips/delay-slot-kill.ll index 57b630303c26..5e301441fd26 100644 --- a/test/CodeGen/Mips/delay-slot-kill.ll +++ b/test/CodeGen/Mips/delay-slot-kill.ll @@ -1,4 +1,6 @@ ; RUN: llc < %s -march=mips64 -mcpu=mips3 | FileCheck %s +; We have to XFAIL this temporarily because of the reversion of r229675. 
+; XFAIL: * ; Currently, the following IR assembly generates a KILL instruction between ; the bitwise-and instruction and the return instruction. We verify that the diff --git a/test/CodeGen/Mips/emergency-spill-slot-near-fp.ll b/test/CodeGen/Mips/emergency-spill-slot-near-fp.ll index 779620e10128..58dd16c9f9c8 100644 --- a/test/CodeGen/Mips/emergency-spill-slot-near-fp.ll +++ b/test/CodeGen/Mips/emergency-spill-slot-near-fp.ll @@ -1,10 +1,10 @@ ; Check that register scavenging spill slot is close to $fp. -; RUN: llc -march=mipsel -O0 < %s | FileCheck %s +; RUN: llc -march=mipsel -O0 -fast-isel=false < %s | FileCheck %s -; CHECK: sw ${{.*}}, 4($sp) -; CHECK: lw ${{.*}}, 4($sp) +; CHECK: sw ${{.*}}, 8($sp) +; CHECK: lw ${{.*}}, 8($sp) -define i32 @main(i32 signext %argc, i8** %argv) "no-frame-pointer-elim"="true" { +define i32 @main(i32 signext %argc, i8** %argv) #0 { entry: %retval = alloca i32, align 4 %argc.addr = alloca i32, align 4 @@ -30,3 +30,5 @@ entry: store <16 x i8> %mul, <16 x i8>* %result, align 16 ret i32 0 } + +attributes #0 = { noinline optnone "no-frame-pointer-elim"="true" } diff --git a/test/CodeGen/Mips/llvm-ir/and.ll b/test/CodeGen/Mips/llvm-ir/and.ll index 8ebcfe4a3f64..c4121701ec15 100644 --- a/test/CodeGen/Mips/llvm-ir/and.ll +++ b/test/CodeGen/Mips/llvm-ir/and.ll @@ -59,7 +59,10 @@ define signext i32 @and_i32(i32 signext %a, i32 signext %b) { entry: ; ALL-LABEL: and_i32: - ; ALL: and $2, $4, $5 + ; GP32: and $2, $4, $5 + + ; GP64: and $[[T0:[0-9]+]], $4, $5 + ; GP64: sll $2, $[[T0]], 0 %r = and i32 %a, %b ret i32 %r diff --git a/test/CodeGen/Mips/llvm-ir/or.ll b/test/CodeGen/Mips/llvm-ir/or.ll index 6215e4036325..8509d6ce93f3 100644 --- a/test/CodeGen/Mips/llvm-ir/or.ll +++ b/test/CodeGen/Mips/llvm-ir/or.ll @@ -59,7 +59,11 @@ define signext i32 @or_i32(i32 signext %a, i32 signext %b) { entry: ; ALL-LABEL: or_i32: - ; ALL: or $2, $4, $5 + ; GP32: or $2, $4, $5 + + ; GP64: or $[[T0:[0-9]+]], $4, $5 + ; FIXME: The sll instruction below is redundant. + ; GP64: sll $2, $[[T0]], 0 %r = or i32 %a, %b ret i32 %r diff --git a/test/CodeGen/Mips/llvm-ir/xor.ll b/test/CodeGen/Mips/llvm-ir/xor.ll index 89af99981a3c..d3cc57484895 100644 --- a/test/CodeGen/Mips/llvm-ir/xor.ll +++ b/test/CodeGen/Mips/llvm-ir/xor.ll @@ -59,7 +59,10 @@ define signext i32 @xor_i32(i32 signext %a, i32 signext %b) { entry: ; ALL-LABEL: xor_i32: - ; ALL: xor $2, $4, $5 + ; GP32: xor $2, $4, $5 + + ; GP64: xor $[[T0:[0-9]+]], $4, $5 + ; GP64: sll $2, $[[T0]], 0 %r = xor i32 %a, %b ret i32 %r diff --git a/test/CodeGen/PowerPC/fp2int2fp-ppcfp128.ll b/test/CodeGen/PowerPC/fp2int2fp-ppcfp128.ll new file mode 100644 index 000000000000..7742ffe33150 --- /dev/null +++ b/test/CodeGen/PowerPC/fp2int2fp-ppcfp128.ll @@ -0,0 +1,16 @@ +; RUN: llc -mcpu=a2 < %s | FileCheck %s +target datalayout = "E-m:e-i64:64-n32:64" +target triple = "powerpc64-bgq-linux" + +define linkonce_odr double @test1() { +entry: + %conv6.i.i = fptosi ppc_fp128 undef to i64 + %conv.i = sitofp i64 %conv6.i.i to double + ret double %conv.i + +; CHECK-LABEL: @test1 +; CHECK: bl __fixtfdi +; CHECK: fcfid +; CHECK: blr +} + diff --git a/test/CodeGen/PowerPC/ppc64-patchpoint.ll b/test/CodeGen/PowerPC/ppc64-patchpoint.ll index 53b737ae9a0b..d10ea98cd1a7 100644 --- a/test/CodeGen/PowerPC/ppc64-patchpoint.ll +++ b/test/CodeGen/PowerPC/ppc64-patchpoint.ll @@ -103,6 +103,21 @@ entry: ret void } +; Trivial symbolic patchpoint codegen. 
+ +declare i64 @foo(i64 %p1, i64 %p2) +define i64 @trivial_symbolic_patchpoint_codegen(i64 %p1, i64 %p2) { +entry: +; CHECK-LABEL: trivial_symbolic_patchpoint_codegen: +; CHECK: bl foo +; CHECK-NEXT: nop +; CHECK-NEXT: nop +; CHECK-NOT: nop +; CHECK: blr + %result = tail call i64 (i64, i32, i8*, i32, ...) @llvm.experimental.patchpoint.i64(i64 9, i32 12, i8* bitcast (i64 (i64, i64)* @foo to i8*), i32 2, i64 %p1, i64 %p2) + ret i64 %result +} + declare void @llvm.experimental.stackmap(i64, i32, ...) declare void @llvm.experimental.patchpoint.void(i64, i32, i8*, i32, ...) declare i64 @llvm.experimental.patchpoint.i64(i64, i32, i8*, i32, ...) diff --git a/test/CodeGen/PowerPC/pr24216.ll b/test/CodeGen/PowerPC/pr24216.ll new file mode 100644 index 000000000000..4ab41985f5b1 --- /dev/null +++ b/test/CodeGen/PowerPC/pr24216.ll @@ -0,0 +1,14 @@ +; RUN: llc -mcpu=pwr8 -mtriple=powerpc64le-unknown-linux-gnu < %s | FileCheck %s + +; Test case adapted from PR24216. + +define void @foo(<16 x i8>* nocapture readonly %in, <16 x i8>* nocapture %out) { +entry: + %0 = load <16 x i8>, <16 x i8>* %in, align 16 + %1 = shufflevector <16 x i8> %0, <16 x i8> undef, <16 x i32> <i32 2, i32 3, i32 4, i32 5, i32 2, i32 3, i32 4, i32 5, i32 2, i32 3, i32 4, i32 5, i32 2, i32 3, i32 4, i32 5> + store <16 x i8> %1, <16 x i8>* %out, align 16 + ret void +} + +; CHECK: vperm +; CHECK-NOT: vspltw diff --git a/test/CodeGen/PowerPC/vec_shuffle_le.ll b/test/CodeGen/PowerPC/vec_shuffle_le.ll index 46d451ff1573..65c47ada8750 100644 --- a/test/CodeGen/PowerPC/vec_shuffle_le.ll +++ b/test/CodeGen/PowerPC/vec_shuffle_le.ll @@ -202,7 +202,7 @@ entry: ; CHECK: VSLDOI_xx: %tmp = load <16 x i8>, <16 x i8>* %A %tmp2 = shufflevector <16 x i8> %tmp, <16 x i8> %tmp, <16 x i32> <i32 12, i32 13, i32 14, i32 15, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11> -; CHECK: vsldoi +; CHECK: vsldoi {{[0-9]+}}, [[REG1:[0-9]+]], [[REG1]], 4 store <16 x i8> %tmp2, <16 x i8>* %A ret void } diff --git a/test/CodeGen/PowerPC/vsx.ll b/test/CodeGen/PowerPC/vsx.ll index f85acebeea67..dceb2516c696 100644 --- a/test/CodeGen/PowerPC/vsx.ll +++ b/test/CodeGen/PowerPC/vsx.ll @@ -70,10 +70,10 @@ entry: ; CHECK-REG: blr ; CHECK-FISL-LABEL: @test5 -; CHECK-FISL: vor 4, 2, 2 -; CHECK-FISL: vor 5, 3, 3 -; CHECK-FISL: xxlxor 36, 36, 37 -; CHECK-FISL: vor 2, 4, 4 +; CHECK-FISL: vor +; CHECK-FISL: vor +; CHECK-FISL: xxlxor +; CHECK-FISL: vor 2 ; CHECK-FISL: blr ; CHECK-LE-LABEL: @test5 @@ -133,10 +133,10 @@ entry: ; CHECK-REG: blr ; CHECK-FISL-LABEL: @test8 -; CHECK-FISL: vor 4, 2, 2 -; CHECK-FISL: vor 5, 3, 3 -; CHECK-FISL: xxlor 36, 36, 37 -; CHECK-FISL: vor 2, 4, 4 +; CHECK-FISL: vor +; CHECK-FISL: vor +; CHECK-FISL: xxlor +; CHECK-FISL: vor 2 ; CHECK-FISL: blr ; CHECK-LE-LABEL: @test8 @@ -196,10 +196,10 @@ entry: ; CHECK-REG: blr ; CHECK-FISL-LABEL: @test11 -; CHECK-FISL: vor 4, 2, 2 -; CHECK-FISL: vor 5, 3, 3 -; CHECK-FISL: xxland 36, 36, 37 -; CHECK-FISL: vor 2, 4, 4 +; CHECK-FISL: vor +; CHECK-FISL: vor +; CHECK-FISL: xxland +; CHECK-FISL: vor 2 ; CHECK-FISL: blr ; CHECK-LE-LABEL: @test11 @@ -260,17 +260,14 @@ entry: ; CHECK-REG: blr ; CHECK-FISL-LABEL: @test14 -; CHECK-FISL: vor 4, 2, 2 -; CHECK-FISL: vor 5, 3, 3 -; CHECK-FISL: xxlor 36, 36, 37 -; CHECK-FISL: vor 0, 4, 4 -; CHECK-FISL: vor 4, 2, 2 -; CHECK-FISL: vor 5, 3, 3 -; CHECK-FISL: xxlnor 36, 36, 37 +; CHECK-FISL: vor 4, 3, 3 +; CHECK-FISL: vor 5, 2, 2 +; CHECK-FISL: xxlor 0, 37, 36 +; CHECK-FISL: xxlnor 36, 37, 36 ; CHECK-FISL: vor 2, 4, 4 ; CHECK-FISL: lis 0, -1 ; 
CHECK-FISL: ori 0, 0, 65520 -; CHECK-FISL: stvx 0, 1, 0 +; CHECK-FISL: stxvd2x 0, 1, 0 ; CHECK-FISL: blr ; CHECK-LE-LABEL: @test14 @@ -347,15 +344,13 @@ entry: ; CHECK-REG: blr ; CHECK-FISL-LABEL: @test17 -; CHECK-FISL: vspltisb 4, -1 -; CHECK-FISL: vor 5, 3, 3 -; CHECK-FISL: vor 0, 4, 4 -; CHECK-FISL: xxlxor 37, 37, 32 -; CHECK-FISL: vor 3, 5, 5 +; CHECK-FISL: vor 4, 3, 3 ; CHECK-FISL: vor 5, 2, 2 -; CHECK-FISL: vor 0, 3, 3 -; CHECK-FISL: xxland 37, 37, 32 -; CHECK-FISL: vor 2, 5, 5 +; CHECK-FISL: vspltisb 2, -1 +; CHECK-FISL: vor 0, 2, 2 +; CHECK-FISL: xxlxor 36, 36, 32 +; CHECK-FISL: xxland 36, 37, 36 +; CHECK-FISL: vor 2, 4, 4 ; CHECK-FISL: blr ; CHECK-LE-LABEL: @test17 @@ -434,12 +429,18 @@ entry: ; CHECK-REG: xxsel 34, 35, 34, {{[0-9]+}} ; CHECK-REG: blr +; FIXME: The fast-isel code is pretty miserable for this one. + ; CHECK-FISL-LABEL: @test20 -; CHECK-FISL: vcmpequw 4, 4, 5 -; CHECK-FISL: vor 0, 3, 3 -; CHECK-FISL: vor 1, 2, 2 -; CHECK-FISL: vor 6, 4, 4 -; CHECK-FISL: xxsel 32, 32, 33, 38 +; CHECK-FISL: vor 0, 5, 5 +; CHECK-FISL: vor 1, 4, 4 +; CHECK-FISL: vor 6, 3, 3 +; CHECK-FISL: vor 7, 2, 2 +; CHECK-FISL: vor 2, 1, 1 +; CHECK-FISL: vor 3, 0, 0 +; CHECK-FISL: vcmpequw 2, 2, 3 +; CHECK-FISL: vor 0, 2, 2 +; CHECK-FISL: xxsel 32, 38, 39, 32 ; CHECK-FISL: vor 2, 0, 0 ; CHECK-FISL: blr @@ -794,8 +795,6 @@ define <4 x i32> @test34(<4 x i32>* %a) { ; CHECK-FISL-LABEL: @test34 ; CHECK-FISL: lxvw4x 0, 0, 3 ; CHECK-FISL: xxlor 34, 0, 0 -; CHECK-FISL: vor 3, 2, 2 -; CHECK-FISL: vor 2, 3, 3 ; CHECK-FISL: blr ; CHECK-LE-LABEL: @test34 diff --git a/test/CodeGen/PowerPC/vsx_insert_extract_le.ll b/test/CodeGen/PowerPC/vsx_insert_extract_le.ll index 84bbdd75b0f7..6c89b1092bdf 100644 --- a/test/CodeGen/PowerPC/vsx_insert_extract_le.ll +++ b/test/CodeGen/PowerPC/vsx_insert_extract_le.ll @@ -8,9 +8,9 @@ define <2 x double> @testi0(<2 x double>* %p1, double* %p2) { ; CHECK-LABEL: testi0 ; CHECK: lxvd2x 0, 0, 3 -; CHECK: lxsdx 34, 0, 4 +; CHECK: lxsdx 1, 0, 4 ; CHECK: xxswapd 0, 0 -; CHECK: xxspltd 1, 34, 0 +; CHECK: xxspltd 1, 1, 0 ; CHECK: xxpermdi 34, 0, 1, 1 } @@ -22,9 +22,9 @@ define <2 x double> @testi1(<2 x double>* %p1, double* %p2) { ; CHECK-LABEL: testi1 ; CHECK: lxvd2x 0, 0, 3 -; CHECK: lxsdx 34, 0, 4 +; CHECK: lxsdx 1, 0, 4 ; CHECK: xxswapd 0, 0 -; CHECK: xxspltd 1, 34, 0 +; CHECK: xxspltd 1, 1, 0 ; CHECK: xxmrgld 34, 1, 0 } diff --git a/test/CodeGen/PowerPC/xvcmpeqdp-v2f64.ll b/test/CodeGen/PowerPC/xvcmpeqdp-v2f64.ll new file mode 100644 index 000000000000..ef63233e746b --- /dev/null +++ b/test/CodeGen/PowerPC/xvcmpeqdp-v2f64.ll @@ -0,0 +1,38 @@ +; RUN: llc < %s | FileCheck %s +target datalayout = "e-m:e-i64:64-n32:64" +target triple = "powerpc64le-unknown-linux-gnu" + +; Function Attrs: nounwind +define void @__fmax_double3_3D_exec() #0 { +entry: + br i1 undef, label %if.then.i, label %fmax_double3.exit + +if.then.i: ; preds = %entry + %cmp24.i.i = fcmp ord <3 x double> undef, zeroinitializer + %sext25.i.i = sext <3 x i1> %cmp24.i.i to <3 x i64> + %neg.i.i = xor <3 x i64> %sext25.i.i, <i64 -1, i64 -1, i64 -1> + %or.i.i = or <3 x i64> undef, %neg.i.i + %neg.i.i.i = select <3 x i1> undef, <3 x i64> zeroinitializer, <3 x i64> %sext25.i.i + %and.i.i.i = and <3 x i64> undef, %neg.i.i.i + %and26.i.i.i = and <3 x i64> undef, %or.i.i + %or.i.i.i = or <3 x i64> %and.i.i.i, %and26.i.i.i + %astype32.i.i.i = bitcast <3 x i64> %or.i.i.i to <3 x double> + %extractVec33.i.i.i = shufflevector <3 x double> %astype32.i.i.i, <3 x double> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 undef> + store <4 x 
double> %extractVec33.i.i.i, <4 x double>* undef, align 32 + br label %fmax_double3.exit + +; CHECK-LABEL: @__fmax_double3_3D_exec +; CHECK: xvcmpeqdp + +fmax_double3.exit: ; preds = %if.then.i, %entry + br i1 undef, label %if.then, label %do.end + +if.then: ; preds = %fmax_double3.exit + unreachable + +do.end: ; preds = %fmax_double3.exit + ret void +} + +attributes #0 = { nounwind } + diff --git a/test/CodeGen/SystemZ/args-04.ll b/test/CodeGen/SystemZ/args-04.ll index 1178bb4dafdf..48a2cf491049 100644 --- a/test/CodeGen/SystemZ/args-04.ll +++ b/test/CodeGen/SystemZ/args-04.ll @@ -124,3 +124,17 @@ define void @f13(fp128 *%r2, i16 %r3, i32 %r4, i64 %r5, float %f0, double %f2, store fp128 %y, fp128 *%r2 ret void } + +; Explicit fp128 return values are likewise passed indirectly. +define fp128 @f14(fp128 %r3) { +; CHECK-LABEL: f14: +; CHECK: ld %f0, 0(%r3) +; CHECK: ld %f2, 8(%r3) +; CHECK: axbr %f0, %f0 +; CHECK: std %f0, 0(%r2) +; CHECK: std %f2, 8(%r2) +; CHECK: br %r14 + %y = fadd fp128 %r3, %r3 + ret fp128 %y +} + diff --git a/test/CodeGen/SystemZ/args-07.ll b/test/CodeGen/SystemZ/args-07.ll new file mode 100644 index 000000000000..29d9b319ffc0 --- /dev/null +++ b/test/CodeGen/SystemZ/args-07.ll @@ -0,0 +1,60 @@ +; Test multiple return values (LLVM ABI extension) +; +; RUN: llc < %s -mtriple=s390x-linux-gnu | FileCheck %s + +; Up to four integer return values fit into GPRs. +define { i64, i64, i64, i64 } @f1() { +; CHECK-LABEL: f1: +; CHECK: lghi %r2, 0 +; CHECK: lghi %r3, 1 +; CHECK: lghi %r4, 2 +; CHECK: lghi %r5, 3 +; CHECK: br %r14 + ret { i64, i64, i64, i64 } { i64 0, i64 1, i64 2, i64 3 } +} + +; More than four integer return values use sret. +define { i64, i64, i64, i64, i64 } @f2() { +; CHECK-LABEL: f2: +; CHECK: mvghi 32(%r2), 4 +; CHECK: mvghi 24(%r2), 3 +; CHECK: mvghi 16(%r2), 2 +; CHECK: mvghi 8(%r2), 1 +; CHECK: mvghi 0(%r2), 0 +; CHECK: br %r14 + ret { i64, i64, i64, i64, i64 } { i64 0, i64 1, i64 2, i64 3, i64 4 } +} + +; Up to four floating-point return values fit into FPRs. +define { double, double, double, double } @f3() { +; CHECK-LABEL: f3: +; CHECK: larl [[TMP:%r[0-5]]], .LCPI +; CHECK: ldeb %f0, 0([[TMP]]) +; CHECK: larl [[TMP:%r[0-5]]], .LCPI +; CHECK: ldeb %f2, 0([[TMP]]) +; CHECK: larl [[TMP:%r[0-5]]], .LCPI +; CHECK: ldeb %f4, 0([[TMP]]) +; CHECK: larl [[TMP:%r[0-5]]], .LCPI +; CHECK: ldeb %f6, 0([[TMP]]) +; CHECK: br %r14 + ret { double, double, double, double } + { double 1.0, double 2.0, double 3.0, double 4.0 } +} + +; More than four floating-point return values use sret. +define { double, double, double, double, double } @f4() { +; CHECK-LABEL: f4: +; CHECK: llihh [[TMP:%r[0-5]]], 16404 +; CHECK: stg [[TMP]], 32(%r2) +; CHECK: llihh [[TMP:%r[0-5]]], 16400 +; CHECK: stg [[TMP]], 24(%r2) +; CHECK: llihh [[TMP:%r[0-5]]], 16392 +; CHECK: stg [[TMP]], 16(%r2) +; CHECK: llihh [[TMP:%r[0-5]]], 16384 +; CHECK: stg [[TMP]], 8(%r2) +; CHECK: llihh [[TMP:%r[0-5]]], 16368 +; CHECK: stg [[TMP]], 0(%r2) +; CHECK: br %r14 + ret { double, double, double, double, double } + { double 1.0, double 2.0, double 3.0, double 4.0, double 5.0 } +} diff --git a/test/CodeGen/SystemZ/args-08.ll b/test/CodeGen/SystemZ/args-08.ll new file mode 100644 index 000000000000..0bad5a8989dc --- /dev/null +++ b/test/CodeGen/SystemZ/args-08.ll @@ -0,0 +1,57 @@ +; Test calling functions with multiple return values (LLVM ABI extension) +; +; RUN: llc < %s -mtriple=s390x-linux-gnu | FileCheck %s + +; Up to four integer return values fit into GPRs. 
+declare { i64, i64, i64, i64 } @bar1()
+
+define i64 @f1() {
+; CHECK-LABEL: f1:
+; CHECK: brasl %r14, bar1
+; CHECK: lgr %r2, %r5
+; CHECK: br %r14
+  %mret = call { i64, i64, i64, i64 } @bar1()
+  %ret = extractvalue { i64, i64, i64, i64 } %mret, 3
+  ret i64 %ret
+}
+
+; More than four integer return values use sret.
+declare { i64, i64, i64, i64, i64 } @bar2()
+
+define i64 @f2() {
+; CHECK-LABEL: f2:
+; CHECK: la %r2, 160(%r15)
+; CHECK: brasl %r14, bar2
+; CHECK: lg %r2, 192(%r15)
+; CHECK: br %r14
+  %mret = call { i64, i64, i64, i64, i64 } @bar2()
+  %ret = extractvalue { i64, i64, i64, i64, i64 } %mret, 4
+  ret i64 %ret
+}
+
+; Up to four floating-point return values fit into FPRs.
+declare { double, double, double, double } @bar3()
+
+define double @f3() {
+; CHECK-LABEL: f3:
+; CHECK: brasl %r14, bar3
+; CHECK: ldr %f0, %f6
+; CHECK: br %r14
+  %mret = call { double, double, double, double } @bar3()
+  %ret = extractvalue { double, double, double, double } %mret, 3
+  ret double %ret
+}
+
+; More than four floating-point return values use sret.
+declare { double, double, double, double, double } @bar4()
+
+define double @f4() {
+; CHECK-LABEL: f4:
+; CHECK: la %r2, 160(%r15)
+; CHECK: brasl %r14, bar4
+; CHECK: ld %f0, 192(%r15)
+; CHECK: br %r14
+  %mret = call { double, double, double, double, double } @bar4()
+  %ret = extractvalue { double, double, double, double, double } %mret, 4
+  ret double %ret
+}
diff --git a/test/CodeGen/SystemZ/vec-args-06.ll b/test/CodeGen/SystemZ/vec-args-06.ll
new file mode 100644
index 000000000000..b26131ca1d4e
--- /dev/null
+++ b/test/CodeGen/SystemZ/vec-args-06.ll
@@ -0,0 +1,83 @@
+; Test multiple return values (LLVM ABI extension)
+;
+; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z13 | FileCheck %s
+
+; Up to eight vector return values fit into VRs.
+define { <2 x double>, <2 x double>, <2 x double>, <2 x double>,
+         <2 x double>, <2 x double>, <2 x double>, <2 x double> } @f1() {
+; CHECK-LABEL: f1:
+; CHECK: larl [[TMP:%r[0-5]]], .LCPI
+; CHECK: vl %v24, 0([[TMP]])
+; CHECK: larl [[TMP:%r[0-5]]], .LCPI
+; CHECK: vl %v26, 0([[TMP]])
+; CHECK: larl [[TMP:%r[0-5]]], .LCPI
+; CHECK: vl %v28, 0([[TMP]])
+; CHECK: larl [[TMP:%r[0-5]]], .LCPI
+; CHECK: vl %v30, 0([[TMP]])
+; CHECK: larl [[TMP:%r[0-5]]], .LCPI
+; CHECK: vl %v25, 0([[TMP]])
+; CHECK: larl [[TMP:%r[0-5]]], .LCPI
+; CHECK: vl %v27, 0([[TMP]])
+; CHECK: larl [[TMP:%r[0-5]]], .LCPI
+; CHECK: vl %v29, 0([[TMP]])
+; CHECK: larl [[TMP:%r[0-5]]], .LCPI
+; CHECK: vl %v31, 0([[TMP]])
+; CHECK: br %r14
+  ret { <2 x double>, <2 x double>, <2 x double>, <2 x double>,
+        <2 x double>, <2 x double>, <2 x double>, <2 x double> }
+      { <2 x double> <double 1.0, double 1.1>,
+        <2 x double> <double 2.0, double 2.1>,
+        <2 x double> <double 3.0, double 3.1>,
+        <2 x double> <double 4.0, double 4.1>,
+        <2 x double> <double 5.0, double 5.1>,
+        <2 x double> <double 6.0, double 6.1>,
+        <2 x double> <double 7.0, double 7.1>,
+        <2 x double> <double 8.0, double 8.1> }
+}
+
+; More than eight vector return values use sret.
+define { <2 x double>, <2 x double>, <2 x double>, <2 x double>, + <2 x double>, <2 x double>, <2 x double>, <2 x double>, + <2 x double> } @f2() { +; CHECK-LABEL: f2: +; CHECK: larl [[TMP:%r[0-5]]], .LCPI +; CHECK: vl [[VTMP:%v[0-9]+]], 0([[TMP]]) +; CHECK: vst [[VTMP]], 128(%r2) +; CHECK: larl [[TMP:%r[0-5]]], .LCPI +; CHECK: vl [[VTMP:%v[0-9]+]], 0([[TMP]]) +; CHECK: vst [[VTMP]], 112(%r2) +; CHECK: larl [[TMP:%r[0-5]]], .LCPI +; CHECK: vl [[VTMP:%v[0-9]+]], 0([[TMP]]) +; CHECK: vst [[VTMP]], 96(%r2) +; CHECK: larl [[TMP:%r[0-5]]], .LCPI +; CHECK: vl [[VTMP:%v[0-9]+]], 0([[TMP]]) +; CHECK: vst [[VTMP]], 80(%r2) +; CHECK: larl [[TMP:%r[0-5]]], .LCPI +; CHECK: vl [[VTMP:%v[0-9]+]], 0([[TMP]]) +; CHECK: vst [[VTMP]], 64(%r2) +; CHECK: larl [[TMP:%r[0-5]]], .LCPI +; CHECK: vl [[VTMP:%v[0-9]+]], 0([[TMP]]) +; CHECK: vst [[VTMP]], 48(%r2) +; CHECK: larl [[TMP:%r[0-5]]], .LCPI +; CHECK: vl [[VTMP:%v[0-9]+]], 0([[TMP]]) +; CHECK: vst [[VTMP]], 32(%r2) +; CHECK: larl [[TMP:%r[0-5]]], .LCPI +; CHECK: vl [[VTMP:%v[0-9]+]], 0([[TMP]]) +; CHECK: vst [[VTMP]], 16(%r2) +; CHECK: larl [[TMP:%r[0-5]]], .LCPI +; CHECK: vl [[VTMP:%v[0-9]+]], 0([[TMP]]) +; CHECK: vst [[VTMP]], 0(%r2) +; CHECK: br %r14 + ret { <2 x double>, <2 x double>, <2 x double>, <2 x double>, + <2 x double>, <2 x double>, <2 x double>, <2 x double>, + <2 x double> } + { <2 x double> <double 1.0, double 1.1>, + <2 x double> <double 2.0, double 2.1>, + <2 x double> <double 3.0, double 3.1>, + <2 x double> <double 4.0, double 4.1>, + <2 x double> <double 5.0, double 5.1>, + <2 x double> <double 6.0, double 6.1>, + <2 x double> <double 7.0, double 7.1>, + <2 x double> <double 8.0, double 8.1>, + <2 x double> <double 9.0, double 9.1> } +} diff --git a/test/CodeGen/SystemZ/vec-args-07.ll b/test/CodeGen/SystemZ/vec-args-07.ll new file mode 100644 index 000000000000..f0b5e6835cfe --- /dev/null +++ b/test/CodeGen/SystemZ/vec-args-07.ll @@ -0,0 +1,47 @@ +; Test calling functions with multiple return values (LLVM ABI extension) +; +; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z13 | FileCheck %s + +; Up to eight vector return values fit into VRs. +declare { <2 x double>, <2 x double>, <2 x double>, <2 x double>, + <2 x double>, <2 x double>, <2 x double>, <2 x double> } @bar1() + +define <2 x double> @f1() { +; CHECK-LABEL: f1: +; CHECK: brasl %r14, bar1 +; CHECK: vlr %v24, %v31 +; CHECK: br %r14 + %mret = call { <2 x double>, <2 x double>, + <2 x double>, <2 x double>, + <2 x double>, <2 x double>, + <2 x double>, <2 x double> } @bar1() + %ret = extractvalue { <2 x double>, <2 x double>, + <2 x double>, <2 x double>, + <2 x double>, <2 x double>, + <2 x double>, <2 x double> } %mret, 7 + ret <2 x double> %ret +} + +; More than eight vector return values use sret. 
+declare { <2 x double>, <2 x double>, <2 x double>, <2 x double>, + <2 x double>, <2 x double>, <2 x double>, <2 x double>, + <2 x double> } @bar2() + +define <2 x double> @f2() { +; CHECK-LABEL: f2: +; CHECK: la %r2, 160(%r15) +; CHECK: brasl %r14, bar2 +; CHECK: vl %v24, 288(%r15) +; CHECK: br %r14 + %mret = call { <2 x double>, <2 x double>, + <2 x double>, <2 x double>, + <2 x double>, <2 x double>, + <2 x double>, <2 x double>, + <2 x double> } @bar2() + %ret = extractvalue { <2 x double>, <2 x double>, + <2 x double>, <2 x double>, + <2 x double>, <2 x double>, + <2 x double>, <2 x double>, + <2 x double> } %mret, 8 + ret <2 x double> %ret +} diff --git a/test/CodeGen/X86/fdiv-combine.ll b/test/CodeGen/X86/fdiv-combine.ll index 34eac62e3673..b65e9d01ab8b 100644 --- a/test/CodeGen/X86/fdiv-combine.ll +++ b/test/CodeGen/X86/fdiv-combine.ll @@ -44,5 +44,24 @@ define double @div3_arcp(double %x, double %y, double %z) #0 { ret double %ret } +define void @PR24141() #0 { +; CHECK-LABEL: PR24141: +; CHECK: callq +; CHECK-NEXT: divsd +; CHECK-NEXT: jmp +entry: + br label %while.body + +while.body: + %x.0 = phi double [ undef, %entry ], [ %div, %while.body ] + %call = call { double, double } @g(double %x.0) + %xv0 = extractvalue { double, double } %call, 0 + %xv1 = extractvalue { double, double } %call, 1 + %div = fdiv double %xv0, %xv1 + br label %while.body +} + +declare { double, double } @g(double) + ; FIXME: If the backend understands 'arcp', then this attribute is unnecessary. attributes #0 = { "unsafe-fp-math"="true" } diff --git a/test/CodeGen/X86/machine-trace-metrics-crash.ll b/test/CodeGen/X86/machine-trace-metrics-crash.ll new file mode 100644 index 000000000000..1d0ee79f04a9 --- /dev/null +++ b/test/CodeGen/X86/machine-trace-metrics-crash.ll @@ -0,0 +1,62 @@ +; RUN: llc -mtriple=x86_64-unknown-unknown -mcpu=x86-64 -mattr=sse -enable-unsafe-fp-math < %s | FileCheck %s + +; The debug info in this test case was causing a crash because machine trace metrics +; did not correctly ignore debug instructions. The check lines ensure that the +; machine-combiner pass has run, reassociated the add operands, and therefore +; used machine trace metrics. 
+ +define void @PR24199() { +; CHECK-LABEL: PR24199: +; CHECK: addss %xmm1, %xmm0 +; CHECK: addss %xmm2, %xmm0 + +entry: + %i = alloca %struct.A, align 8 + %tobool = icmp ne i32 undef, 0 + br i1 undef, label %if.end, label %if.then + +if.then: + br label %if.end + +if.end: + %h = phi float [ 0.0, %if.then ], [ 4.0, %entry ] + call void @foo(%struct.A* nonnull undef) + tail call void @llvm.dbg.value(metadata %struct.A* undef, i64 0, metadata !5, metadata !4), !dbg !6 + tail call void @llvm.dbg.value(metadata float %h, i64 0, metadata !5, metadata !4), !dbg !6 + %n0 = load float, float* undef, align 4 + %mul = fmul fast float %n0, %h + %add = fadd fast float %mul, 1.0 + tail call void @llvm.dbg.value(metadata %struct.A* undef, i64 0, metadata !5, metadata !4), !dbg !6 + tail call void @llvm.dbg.value(metadata float %add, i64 0, metadata !5, metadata !4), !dbg !6 + %add.i = fadd fast float %add, %n0 + store float %add.i, float* undef, align 4 + %n1 = bitcast %struct.A* %i to i8* + call void @llvm.lifetime.start(i64 16, i8* %n1) + %n2 = load <2 x float>, <2 x float>* undef, align 8 + %conv = uitofp i1 %tobool to float + %bitcast = extractelement <2 x float> %n2, i32 0 + %factor = fmul fast float %bitcast, 2.0 + %add3 = fadd fast float %factor, %conv + call void @bar(float %add3) + ret void +} + +%struct.A = type { float, float } + +declare void @bar(float) +declare void @foo(%struct.A*) +declare void @llvm.lifetime.start(i64, i8* nocapture) +declare void @llvm.dbg.value(metadata, i64, metadata, metadata) + +!llvm.dbg.cu = !{!0} +!llvm.module.flags = !{!2} + +!0 = distinct !DICompileUnit(language: DW_LANG_C_plus_plus, file: !1, isOptimized: true, runtimeVersion: 0, emissionKind: 1) +!1 = !DIFile(filename: "24199.cpp", directory: "/bin") +!2 = !{i32 2, !"Debug Info Version", i32 3} +!3 = !DISubprogram(linkageName: "foo", file: !1, line: 18, isLocal: false, isDefinition: true, scopeLine: 18, function: void (%struct.A*)* @foo) +!4 = !DIExpression() +!5 = !DILocalVariable(tag: DW_TAG_arg_variable, name: "this", arg: 1, scope: !3, flags: DIFlagArtificial | DIFlagObjectPointer) +!6 = !DILocation(line: 0, scope: !3) + + diff --git a/test/CodeGen/X86/pr2656.ll b/test/CodeGen/X86/pr2656.ll index 9a162d77ef48..095ab831d48d 100644 --- a/test/CodeGen/X86/pr2656.ll +++ b/test/CodeGen/X86/pr2656.ll @@ -1,15 +1,24 @@ ; RUN: llc < %s -march=x86 -mattr=+sse2 | FileCheck %s ; PR2656 -; CHECK: {{xorps.*sp}} -; CHECK-NOT: {{xorps.*sp}} - target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f32:32:32-f64:32:64-v64:64:64-v128:128:128-a0:0:64-f80:128:128" target triple = "i686-apple-darwin9.4.0" %struct.anon = type <{ float, float }> @.str = internal constant [17 x i8] c"pt: %.0f, %.0f\0A\00\00" ; <[17 x i8]*> [#uses=1] +; We can not fold either stack load into an 'xor' instruction because that +; would change what should be a 4-byte load into a 16-byte load. +; We can fold the 16-byte constant load into either 'xor' instruction, +; but we do not. It has more than one use, so it gets loaded into a register. 
+ define void @foo(%struct.anon* byval %p) nounwind { +; CHECK-LABEL: foo: +; CHECK: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero +; CHECK-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero +; CHECK-NEXT: movaps {{.*#+}} xmm2 = [2147483648,2147483648,2147483648,2147483648] +; CHECK-NEXT: xorps %xmm2, %xmm0 +; CHECK-NEXT: cvtss2sd %xmm0, %xmm0 +; CHECK-NEXT: xorps %xmm2, %xmm1 entry: %tmp = getelementptr %struct.anon, %struct.anon* %p, i32 0, i32 0 ; <float*> [#uses=1] %tmp1 = load float, float* %tmp ; <float> [#uses=1] @@ -24,3 +33,20 @@ entry: } declare i32 @printf(...) + +; We can not fold the load from the stack into the 'and' instruction because +; that changes an 8-byte load into a 16-byte load (illegal memory access). +; We can fold the load of the constant because it is a 16-byte vector constant. + +define double @PR22371(double %x) { +; CHECK-LABEL: PR22371: +; CHECK: movsd 16(%esp), %xmm0 +; CHECK-NEXT: andpd LCPI1_0, %xmm0 +; CHECK-NEXT: movlpd %xmm0, (%esp) + %call = tail call double @fabs(double %x) #0 + ret double %call +} + +declare double @fabs(double) #0 +attributes #0 = { readnone } + diff --git a/test/CodeGen/X86/sse-fcopysign.ll b/test/CodeGen/X86/sse-fcopysign.ll index 25634b5472aa..8a5462bea82d 100644 --- a/test/CodeGen/X86/sse-fcopysign.ll +++ b/test/CodeGen/X86/sse-fcopysign.ll @@ -55,12 +55,12 @@ declare double @copysign(double, double) define float @int1(float %a, float %b) { ; X32-LABEL: @int1 -; X32: movss 12(%esp), %xmm0 {{.*#+}} xmm0 = mem[0],zero,zero,zero -; X32-NEXT: movss 8(%esp), %xmm1 {{.*#+}} xmm1 = mem[0],zero,zero,zero -; X32-NEXT: andps .LCPI2_0, %xmm1 -; X32-NEXT: andps .LCPI2_1, %xmm0 -; X32-NEXT: orps %xmm1, %xmm0 -; X32-NEXT: movss %xmm0, (%esp) +; X32: movss 8(%esp), %xmm0 {{.*#+}} xmm0 = mem[0],zero,zero,zero +; X32-NEXT: andps .LCPI2_0, %xmm0 +; X32-NEXT: movss 12(%esp), %xmm1 {{.*#+}} xmm1 = mem[0],zero,zero,zero +; X32-NEXT: andps .LCPI2_1, %xmm1 +; X32-NEXT: orps %xmm0, %xmm1 +; X32-NEXT: movss %xmm1, (%esp) ; X32-NEXT: flds (%esp) ; X32-NEXT: popl %eax ; X32-NEXT: retl @@ -76,14 +76,14 @@ define float @int1(float %a, float %b) { define double @int2(double %a, float %b, float %c) { ; X32-LABEL: @int2 -; X32: movsd 8(%ebp), %xmm0 {{.*#+}} xmm0 = mem[0],zero -; X32-NEXT: movss 16(%ebp), %xmm1 {{.*#+}} xmm1 = mem[0],zero,zero,zero -; X32-NEXT: addss 20(%ebp), %xmm1 -; X32-NEXT: andpd .LCPI3_0, %xmm0 -; X32-NEXT: cvtss2sd %xmm1, %xmm1 -; X32-NEXT: andpd .LCPI3_1, %xmm1 -; X32-NEXT: orpd %xmm0, %xmm1 -; X32-NEXT: movsd %xmm1, (%esp) +; X32: movss 16(%ebp), %xmm0 {{.*#+}} xmm0 = mem[0],zero,zero,zero +; X32-NEXT: addss 20(%ebp), %xmm0 +; X32-NEXT: movsd 8(%ebp), %xmm1 {{.*#+}} xmm1 = mem[0],zero +; X32-NEXT: andpd .LCPI3_0, %xmm1 +; X32-NEXT: cvtss2sd %xmm0, %xmm0 +; X32-NEXT: andpd .LCPI3_1, %xmm0 +; X32-NEXT: orpd %xmm1, %xmm0 +; X32-NEXT: movlpd %xmm0, (%esp) ; X32-NEXT: fldl (%esp) ; X32-NEXT: movl %ebp, %esp ; X32-NEXT: popl %ebp @@ -91,9 +91,9 @@ define double @int2(double %a, float %b, float %c) { ; ; X64-LABEL: @int2 ; X64: addss %xmm2, %xmm1 -; X64-NEXT: andpd .LCPI3_0(%rip), %xmm0 ; X64-NEXT: cvtss2sd %xmm1, %xmm1 -; X64-NEXT: andpd .LCPI3_1(%rip), %xmm1 +; X64-NEXT: andpd .LCPI3_0(%rip), %xmm1 +; X64-NEXT: andpd .LCPI3_1(%rip), %xmm0 ; X64-NEXT: orpd %xmm1, %xmm0 ; X64-NEXT: retq %tmp1 = fadd float %b, %c diff --git a/test/CodeGen/X86/vec_fabs.ll b/test/CodeGen/X86/vec_fabs.ll index bfefbcf5ebd3..960b5f27cf53 100644 --- a/test/CodeGen/X86/vec_fabs.ll +++ b/test/CodeGen/X86/vec_fabs.ll @@ -4,7 +4,7 @@ define <2 x double> @fabs_v2f64(<2 x 
double> %p) { ; CHECK-LABEL: fabs_v2f64 - ; CHECK: vandps + ; CHECK: vandpd %t = call <2 x double> @llvm.fabs.v2f64(<2 x double> %p) ret <2 x double> %t } @@ -22,7 +22,7 @@ declare <4 x float> @llvm.fabs.v4f32(<4 x float> %p) define <4 x double> @fabs_v4f64(<4 x double> %p) { ; CHECK-LABEL: fabs_v4f64 - ; CHECK: vandps + ; CHECK: vandpd %t = call <4 x double> @llvm.fabs.v4f64(<4 x double> %p) ret <4 x double> %t } diff --git a/test/DebugInfo/Mips/delay-slot.ll b/test/DebugInfo/Mips/delay-slot.ll index bbf749c82ab9..df01775a12e6 100644 --- a/test/DebugInfo/Mips/delay-slot.ll +++ b/test/DebugInfo/Mips/delay-slot.ll @@ -13,12 +13,14 @@ ; CHECK: Address Line Column File ISA Discriminator Flags ; CHECK: ------------------ ------ ------ ------ --- ------------- ------------- ; CHECK: 0x0000000000000000 1 0 1 0 0 is_stmt -; CHECK: 0x0000000000000000 1 0 1 0 0 is_stmt prologue_end -; CHECK: 0x0000000000000008 2 0 1 0 0 is_stmt -; CHECK: 0x0000000000000020 3 0 1 0 0 is_stmt -; CHECK: 0x0000000000000030 4 0 1 0 0 is_stmt -; CHECK: 0x0000000000000040 5 0 1 0 0 is_stmt -; CHECK: 0x0000000000000050 5 0 1 0 0 is_stmt end_sequence +; FIXME: The next address probably ought to be 0x0000000000000004 but there's +; a constant initialization before the prologue's end. +; CHECK: 0x0000000000000008 2 0 1 0 0 is_stmt prologue_end +; CHECK: 0x0000000000000028 3 0 1 0 0 is_stmt +; CHECK: 0x0000000000000038 4 0 1 0 0 is_stmt +; CHECK: 0x0000000000000048 5 0 1 0 0 is_stmt +; CHECK: 0x0000000000000058 5 0 1 0 0 is_stmt end_sequence + target datalayout = "E-m:m-p:32:32-i8:8:32-i16:16:32-i64:64-n32-S64" target triple = "mips--linux-gnu" diff --git a/test/MC/AMDGPU/vopc.s b/test/MC/AMDGPU/vopc.s index f44919a4f1e0..2d8547c5f953 100644 --- a/test/MC/AMDGPU/vopc.s +++ b/test/MC/AMDGPU/vopc.s @@ -1,5 +1,6 @@ -// RUN: llvm-mc -arch=amdgcn -show-encoding %s | FileCheck %s -// RUN: llvm-mc -arch=amdgcn -mcpu=SI -show-encoding %s | FileCheck %s +// RUN: llvm-mc -arch=amdgcn -show-encoding %s | FileCheck %s --check-prefix=SICI +// RUN: llvm-mc -arch=amdgcn -mcpu=SI -show-encoding %s | FileCheck %s --check-prefix=SICI +// RUN: llvm-mc -arch=amdgcn -mcpu=tonga -show-encoding %s | FileCheck %s --check-prefix=VI //===----------------------------------------------------------------------===// // Generic Checks @@ -7,23 +8,28 @@ // src0 sgpr v_cmp_lt_f32 vcc, s2, v4 -// CHECK: v_cmp_lt_f32_e32 vcc, s2, v4 ; encoding: [0x02,0x08,0x02,0x7c] +// SICI: v_cmp_lt_f32_e32 vcc, s2, v4 ; encoding: [0x02,0x08,0x02,0x7c] +// VI: v_cmp_lt_f32_e32 vcc, s2, v4 ; encoding: [0x02,0x08,0x82,0x7c] // src0 inline immediate v_cmp_lt_f32 vcc, 0, v4 -// CHECK: v_cmp_lt_f32_e32 vcc, 0, v4 ; encoding: [0x80,0x08,0x02,0x7c] +// SICI: v_cmp_lt_f32_e32 vcc, 0, v4 ; encoding: [0x80,0x08,0x02,0x7c] +// VI: v_cmp_lt_f32_e32 vcc, 0, v4 ; encoding: [0x80,0x08,0x82,0x7c] // src0 literal v_cmp_lt_f32 vcc, 10.0, v4 -// CHECK: v_cmp_lt_f32_e32 vcc, 0x41200000, v4 ; encoding: [0xff,0x08,0x02,0x7c,0x00,0x00,0x20,0x41] +// SICI: v_cmp_lt_f32_e32 vcc, 0x41200000, v4 ; encoding: [0xff,0x08,0x02,0x7c,0x00,0x00,0x20,0x41] +// VI: v_cmp_lt_f32_e32 vcc, 0x41200000, v4 ; encoding: [0xff,0x08,0x82,0x7c,0x00,0x00,0x20,0x41] // src0, src1 max vgpr v_cmp_lt_f32 vcc, v255, v255 -// CHECK: v_cmp_lt_f32_e32 vcc, v255, v255 ; encoding: [0xff,0xff,0x03,0x7c] +// SICI: v_cmp_lt_f32_e32 vcc, v255, v255 ; encoding: [0xff,0xff,0x03,0x7c] +// VI: v_cmp_lt_f32_e32 vcc, v255, v255 ; encoding: [0xff,0xff,0x83,0x7c] // force 32-bit encoding v_cmp_lt_f32_e32 vcc, v2, v4 -// CHECK: v_cmp_lt_f32_e32 
vcc, v2, v4 ; encoding: [0x02,0x09,0x02,0x7c] +// SICI: v_cmp_lt_f32_e32 vcc, v2, v4 ; encoding: [0x02,0x09,0x02,0x7c] +// VI: v_cmp_lt_f32_e32 vcc, v2, v4 ; encoding: [0x02,0x09,0x82,0x7c] //===----------------------------------------------------------------------===// @@ -31,10 +37,12 @@ v_cmp_lt_f32_e32 vcc, v2, v4 //===----------------------------------------------------------------------===// v_cmp_f_f32 vcc, v2, v4 -// CHECK: v_cmp_f_f32_e32 vcc, v2, v4 ; encoding: [0x02,0x09,0x00,0x7c] +// SICI: v_cmp_f_f32_e32 vcc, v2, v4 ; encoding: [0x02,0x09,0x00,0x7c] +// VI: v_cmp_f_f32_e32 vcc, v2, v4 ; encoding: [0x02,0x09,0x80,0x7c] v_cmp_lt_f32 vcc, v2, v4 -// CHECK: v_cmp_lt_f32_e32 vcc, v2, v4 ; encoding: [0x02,0x09,0x02,0x7c] +// SICI: v_cmp_lt_f32_e32 vcc, v2, v4 ; encoding: [0x02,0x09,0x02,0x7c] +// VI: v_cmp_lt_f32_e32 vcc, v2, v4 ; encoding: [0x02,0x09,0x82,0x7c] // TODO: Add tests for the rest of the instructions. diff --git a/test/MC/Disassembler/PowerPC/ppc64le-encoding.txt b/test/MC/Disassembler/PowerPC/ppc64le-encoding.txt new file mode 100644 index 000000000000..f154e00ff51c --- /dev/null +++ b/test/MC/Disassembler/PowerPC/ppc64le-encoding.txt @@ -0,0 +1,664 @@ +# RUN: llvm-mc --disassemble %s -triple powerpc64le-unknown-unknown -mcpu=pwr7 | FileCheck %s + +# FIXME: test b target + +# FIXME: test ba target + +# FIXME: test bl target + +# FIXME: test bla target + +# FIXME: test bc 4, 10, target + +# FIXME: test bca 4, 10, target + +# FIXME: test bcl 4, 10, target + +# FIXME: test bcla 4, 10, target + +# CHECK: bclr 4, 10, 3 +0x20 0x18 0x8a 0x4c + +# CHECK: bclr 4, 10 +0x20 0x00 0x8a 0x4c + +# CHECK: bclrl 4, 10, 3 +0x21 0x18 0x8a 0x4c + +# CHECK: bclrl 4, 10 +0x21 0x00 0x8a 0x4c + +# CHECK: bcctr 4, 10, 3 +0x20 0x1c 0x8a 0x4c + +# CHECK: bcctr 4, 10 +0x20 0x04 0x8a 0x4c + +# CHECK: bcctrl 4, 10, 3 +0x21 0x1c 0x8a 0x4c + +# CHECK: bcctrl 4, 10 +0x21 0x04 0x8a 0x4c + +# CHECK: crand 2, 3, 4 +0x02 0x22 0x43 0x4c + +# CHECK: crnand 2, 3, 4 +0xc2 0x21 0x43 0x4c + +# CHECK: cror 2, 3, 4 +0x82 0x23 0x43 0x4c + +# CHECK: crxor 2, 3, 4 +0x82 0x21 0x43 0x4c + +# CHECK: crnor 2, 3, 4 +0x42 0x20 0x43 0x4c + +# CHECK: creqv 2, 3, 4 +0x42 0x22 0x43 0x4c + +# CHECK: crandc 2, 3, 4 +0x02 0x21 0x43 0x4c + +# CHECK: crorc 2, 3, 4 +0x42 0x23 0x43 0x4c + +# CHECK: mcrf 2, 3 +0x00 0x00 0x0c 0x4d + +# CHECK: sc 1 +0x22 0x00 0x00 0x44 + +# CHECK: sc +0x02 0x00 0x00 0x44 + +# CHECK: clrbhrb +0x5c 0x03 0x00 0x7c + +# CHECK: mfbhrbe 9, 983 +0x5c 0xba 0x3e 0x7d + +# CHECK: rfebb 1 +0x24 0x09 0x00 0x4c + +# CHECK: lbz 2, 128(4) +0x80 0x00 0x44 0x88 + +# CHECK: lbzx 2, 3, 4 +0xae 0x20 0x43 0x7c + +# CHECK: lbzu 2, 128(4) +0x80 0x00 0x44 0x8c + +# CHECK: lbzux 2, 3, 4 +0xee 0x20 0x43 0x7c + +# CHECK: lhz 2, 128(4) +0x80 0x00 0x44 0xa0 + +# CHECK: lhzx 2, 3, 4 +0x2e 0x22 0x43 0x7c + +# CHECK: lhzu 2, 128(4) +0x80 0x00 0x44 0xa4 + +# CHECK: lhzux 2, 3, 4 +0x6e 0x22 0x43 0x7c + +# CHECK: lha 2, 128(4) +0x80 0x00 0x44 0xa8 + +# CHECK: lhax 2, 3, 4 +0xae 0x22 0x43 0x7c + +# CHECK: lhau 2, 128(4) +0x80 0x00 0x44 0xac + +# CHECK: lhaux 2, 3, 4 +0xee 0x22 0x43 0x7c + +# CHECK: lwz 2, 128(4) +0x80 0x00 0x44 0x80 + +# CHECK: lwzx 2, 3, 4 +0x2e 0x20 0x43 0x7c + +# CHECK: lwzu 2, 128(4) +0x80 0x00 0x44 0x84 + +# CHECK: lwzux 2, 3, 4 +0x6e 0x20 0x43 0x7c + +# CHECK: lwa 2, 128(4) +0x82 0x00 0x44 0xe8 + +# CHECK: lwax 2, 3, 4 +0xaa 0x22 0x43 0x7c + +# CHECK: lwaux 2, 3, 4 +0xea 0x22 0x43 0x7c + +# CHECK: ld 2, 128(4) +0x80 0x00 0x44 0xe8 + +# CHECK: ldx 2, 3, 4 +0x2a 0x20 0x43 0x7c + +# CHECK: ldu 2, 128(4) +0x81 0x00 
0x44 0xe8 + +# CHECK: ldux 2, 3, 4 +0x6a 0x20 0x43 0x7c + +# CHECK: stb 2, 128(4) +0x80 0x00 0x44 0x98 + +# CHECK: stbx 2, 3, 4 +0xae 0x21 0x43 0x7c + +# CHECK: stbu 2, 128(4) +0x80 0x00 0x44 0x9c + +# CHECK: stbux 2, 3, 4 +0xee 0x21 0x43 0x7c + +# CHECK: sth 2, 128(4) +0x80 0x00 0x44 0xb0 + +# CHECK: sthx 2, 3, 4 +0x2e 0x23 0x43 0x7c + +# CHECK: sthu 2, 128(4) +0x80 0x00 0x44 0xb4 + +# CHECK: sthux 2, 3, 4 +0x6e 0x23 0x43 0x7c + +# CHECK: stw 2, 128(4) +0x80 0x00 0x44 0x90 + +# CHECK: stwx 2, 3, 4 +0x2e 0x21 0x43 0x7c + +# CHECK: stwu 2, 128(4) +0x80 0x00 0x44 0x94 + +# CHECK: stwux 2, 3, 4 +0x6e 0x21 0x43 0x7c + +# CHECK: std 2, 128(4) +0x80 0x00 0x44 0xf8 + +# CHECK: stdx 2, 3, 4 +0x2a 0x21 0x43 0x7c + +# CHECK: stdu 2, 128(4) +0x81 0x00 0x44 0xf8 + +# CHECK: stdux 2, 3, 4 +0x6a 0x21 0x43 0x7c + +# CHECK: lhbrx 2, 3, 4 +0x2c 0x26 0x43 0x7c + +# CHECK: sthbrx 2, 3, 4 +0x2c 0x27 0x43 0x7c + +# CHECK: lwbrx 2, 3, 4 +0x2c 0x24 0x43 0x7c + +# CHECK: stwbrx 2, 3, 4 +0x2c 0x25 0x43 0x7c + +# CHECK: ldbrx 2, 3, 4 +0x28 0x24 0x43 0x7c + +# CHECK: stdbrx 2, 3, 4 +0x28 0x25 0x43 0x7c + +# CHECK: lmw 2, 128(1) +0x80 0x00 0x41 0xb8 + +# CHECK: stmw 2, 128(1) +0x80 0x00 0x41 0xbc + +# CHECK: addi 2, 3, 128 +0x80 0x00 0x43 0x38 + +# CHECK: addis 2, 3, 128 +0x80 0x00 0x43 0x3c + +# CHECK: add 2, 3, 4 +0x14 0x22 0x43 0x7c + +# CHECK: add. 2, 3, 4 +0x15 0x22 0x43 0x7c + +# CHECK: subf 2, 3, 4 +0x50 0x20 0x43 0x7c + +# CHECK: subf. 2, 3, 4 +0x51 0x20 0x43 0x7c + +# CHECK: addic 2, 3, 128 +0x80 0x00 0x43 0x30 + +# CHECK: addic. 2, 3, 128 +0x80 0x00 0x43 0x34 + +# CHECK: subfic 2, 3, 4 +0x04 0x00 0x43 0x20 + +# CHECK: addc 2, 3, 4 +0x14 0x20 0x43 0x7c + +# CHECK: addc. 2, 3, 4 +0x15 0x20 0x43 0x7c + +# CHECK: subfc 2, 3, 4 +0x10 0x20 0x43 0x7c + +# CHECK: subfc 2, 3, 4 +0x10 0x20 0x43 0x7c + +# CHECK: adde 2, 3, 4 +0x14 0x21 0x43 0x7c + +# CHECK: adde. 2, 3, 4 +0x15 0x21 0x43 0x7c + +# CHECK: subfe 2, 3, 4 +0x10 0x21 0x43 0x7c + +# CHECK: subfe. 2, 3, 4 +0x11 0x21 0x43 0x7c + +# CHECK: addme 2, 3 +0xd4 0x01 0x43 0x7c + +# CHECK: addme. 2, 3 +0xd5 0x01 0x43 0x7c + +# CHECK: subfme 2, 3 +0xd0 0x01 0x43 0x7c + +# CHECK: subfme. 2, 3 +0xd1 0x01 0x43 0x7c + +# CHECK: addze 2, 3 +0x94 0x01 0x43 0x7c + +# CHECK: addze. 2, 3 +0x95 0x01 0x43 0x7c + +# CHECK: subfze 2, 3 +0x90 0x01 0x43 0x7c + +# CHECK: subfze. 2, 3 +0x91 0x01 0x43 0x7c + +# CHECK: neg 2, 3 +0xd0 0x00 0x43 0x7c + +# CHECK: neg. 2, 3 +0xd1 0x00 0x43 0x7c + +# CHECK: mulli 2, 3, 128 +0x80 0x00 0x43 0x1c + +# CHECK: mulhw 2, 3, 4 +0x96 0x20 0x43 0x7c + +# CHECK: mulhw. 2, 3, 4 +0x97 0x20 0x43 0x7c + +# CHECK: mullw 2, 3, 4 +0xd6 0x21 0x43 0x7c + +# CHECK: mullw. 2, 3, 4 +0xd7 0x21 0x43 0x7c + +# CHECK: mulhwu 2, 3, 4 +0x16 0x20 0x43 0x7c + +# CHECK: mulhwu. 2, 3, 4 +0x17 0x20 0x43 0x7c + +# CHECK: divw 2, 3, 4 +0xd6 0x23 0x43 0x7c + +# CHECK: divw. 2, 3, 4 +0xd7 0x23 0x43 0x7c + +# CHECK: divwu 2, 3, 4 +0x96 0x23 0x43 0x7c + +# CHECK: divwu. 2, 3, 4 +0x97 0x23 0x43 0x7c + +# CHECK: divwe 2, 3, 4 +0x56 0x23 0x43 0x7c + +# CHECK: divwe. 2, 3, 4 +0x57 0x23 0x43 0x7c + +# CHECK: divweu 2, 3, 4 +0x16 0x23 0x43 0x7c + +# CHECK: divweu. 2, 3, 4 +0x17 0x23 0x43 0x7c + +# CHECK: mulld 2, 3, 4 +0xd2 0x21 0x43 0x7c + +# CHECK: mulld. 2, 3, 4 +0xd3 0x21 0x43 0x7c + +# CHECK: mulhd 2, 3, 4 +0x92 0x20 0x43 0x7c + +# CHECK: mulhd. 2, 3, 4 +0x93 0x20 0x43 0x7c + +# CHECK: mulhdu 2, 3, 4 +0x12 0x20 0x43 0x7c + +# CHECK: mulhdu. 2, 3, 4 +0x13 0x20 0x43 0x7c + +# CHECK: divd 2, 3, 4 +0xd2 0x23 0x43 0x7c + +# CHECK: divd. 
2, 3, 4 +0xd3 0x23 0x43 0x7c + +# CHECK: divdu 2, 3, 4 +0x92 0x23 0x43 0x7c + +# CHECK: divdu. 2, 3, 4 +0x93 0x23 0x43 0x7c + +# CHECK: divde 2, 3, 4 +0x52 0x23 0x43 0x7c + +# CHECK: divde. 2, 3, 4 +0x53 0x23 0x43 0x7c + +# CHECK: divdeu 2, 3, 4 +0x12 0x23 0x43 0x7c + +# CHECK: divdeu. 2, 3, 4 +0x13 0x23 0x43 0x7c + +# CHECK: cmpdi 2, 3, 128 +0x80 0x00 0x23 0x2d + +# CHECK: cmpd 2, 3, 4 +0x00 0x20 0x23 0x7d + +# CHECK: cmpldi 2, 3, 128 +0x80 0x00 0x23 0x29 + +# CHECK: cmpld 2, 3, 4 +0x40 0x20 0x23 0x7d + +# CHECK: cmpwi 2, 3, 128 +0x80 0x00 0x03 0x2d + +# CHECK: cmpw 2, 3, 4 +0x00 0x20 0x03 0x7d + +# CHECK: cmplwi 2, 3, 128 +0x80 0x00 0x03 0x29 + +# CHECK: cmplw 2, 3, 4 +0x40 0x20 0x03 0x7d + +# CHECK: twllti 3, 4 +0x04 0x00 0x43 0x0c + +# CHECK: twllt 3, 4 +0x08 0x20 0x43 0x7c + +# CHECK: tdllti 3, 4 +0x04 0x00 0x43 0x08 + +# CHECK: tdllt 3, 4 +0x88 0x20 0x43 0x7c + +# CHECK: isel 2, 3, 4, 5 +0x5e 0x21 0x43 0x7c + +# CHECK: andi. 2, 3, 128 +0x80 0x00 0x62 0x70 + +# CHECK: andis. 2, 3, 128 +0x80 0x00 0x62 0x74 + +# CHECK: ori 2, 3, 128 +0x80 0x00 0x62 0x60 + +# CHECK: oris 2, 3, 128 +0x80 0x00 0x62 0x64 + +# CHECK: xori 2, 3, 128 +0x80 0x00 0x62 0x68 + +# CHECK: xoris 2, 3, 128 +0x80 0x00 0x62 0x6c + +# CHECK: and 2, 3, 4 +0x38 0x20 0x62 0x7c + +# CHECK: and. 2, 3, 4 +0x39 0x20 0x62 0x7c + +# CHECK: xor 2, 3, 4 +0x78 0x22 0x62 0x7c + +# CHECK: xor. 2, 3, 4 +0x79 0x22 0x62 0x7c + +# CHECK: nand 2, 3, 4 +0xb8 0x23 0x62 0x7c + +# CHECK: nand. 2, 3, 4 +0xb9 0x23 0x62 0x7c + +# CHECK: or 2, 3, 4 +0x78 0x23 0x62 0x7c + +# CHECK: or. 2, 3, 4 +0x79 0x23 0x62 0x7c + +# CHECK: nor 2, 3, 4 +0xf8 0x20 0x62 0x7c + +# CHECK: nor. 2, 3, 4 +0xf9 0x20 0x62 0x7c + +# CHECK: eqv 2, 3, 4 +0x38 0x22 0x62 0x7c + +# CHECK: eqv. 2, 3, 4 +0x39 0x22 0x62 0x7c + +# CHECK: andc 2, 3, 4 +0x78 0x20 0x62 0x7c + +# CHECK: andc. 2, 3, 4 +0x79 0x20 0x62 0x7c + +# CHECK: orc 2, 3, 4 +0x38 0x23 0x62 0x7c + +# CHECK: orc. 2, 3, 4 +0x39 0x23 0x62 0x7c + +# CHECK: extsb 2, 3 +0x74 0x07 0x62 0x7c + +# CHECK: extsb. 2, 3 +0x75 0x07 0x62 0x7c + +# CHECK: extsh 2, 3 +0x34 0x07 0x62 0x7c + +# CHECK: extsh. 2, 3 +0x35 0x07 0x62 0x7c + +# CHECK: cntlz 2, 3 +0x34 0x00 0x62 0x7c + +# CHECK: cntlz. 2, 3 +0x35 0x00 0x62 0x7c + +# CHECK: popcntw 2, 3 +0xf4 0x02 0x62 0x7c + +# CHECK: extsw 2, 3 +0xb4 0x07 0x62 0x7c + +# CHECK: extsw. 2, 3 +0xb5 0x07 0x62 0x7c + +# CHECK: cntlzd 2, 3 +0x74 0x00 0x62 0x7c + +# CHECK: cntlzd. 2, 3 +0x75 0x00 0x62 0x7c + +# CHECK: popcntd 2, 3 +0xf4 0x03 0x62 0x7c + +# CHECK: bpermd 2, 3, 4 +0xf8 0x21 0x62 0x7c + +# CHECK: cmpb 7, 21, 4 +0xf8 0x23 0xa7 0x7e + +# CHECK: rlwinm 2, 3, 4, 5, 6 +0x4c 0x21 0x62 0x54 + +# CHECK: rlwinm. 2, 3, 4, 5, 6 +0x4d 0x21 0x62 0x54 + +# CHECK: rlwnm 2, 3, 4, 5, 6 +0x4c 0x21 0x62 0x5c + +# CHECK: rlwnm. 2, 3, 4, 5, 6 +0x4d 0x21 0x62 0x5c + +# CHECK: rlwimi 2, 3, 4, 5, 6 +0x4c 0x21 0x62 0x50 + +# CHECK: rlwimi. 2, 3, 4, 5, 6 +0x4d 0x21 0x62 0x50 + +# CHECK: rldicl 2, 3, 4, 5 +0x40 0x21 0x62 0x78 + +# CHECK: rldicl. 2, 3, 4, 5 +0x41 0x21 0x62 0x78 + +# CHECK: rldicr 2, 3, 4, 5 +0x44 0x21 0x62 0x78 + +# CHECK: rldicr. 2, 3, 4, 5 +0x45 0x21 0x62 0x78 + +# CHECK: rldic 2, 3, 4, 5 +0x48 0x21 0x62 0x78 + +# CHECK: rldic. 2, 3, 4, 5 +0x49 0x21 0x62 0x78 + +# CHECK: rldcl 2, 3, 4, 5 +0x50 0x21 0x62 0x78 + +# CHECK: rldcl. 2, 3, 4, 5 +0x51 0x21 0x62 0x78 + +# CHECK: rldcr 2, 3, 4, 5 +0x52 0x21 0x62 0x78 + +# CHECK: rldcr. 2, 3, 4, 5 +0x53 0x21 0x62 0x78 + +# CHECK: rldimi 2, 3, 4, 5 +0x4c 0x21 0x62 0x78 + +# CHECK: rldimi. 
2, 3, 4, 5 +0x4d 0x21 0x62 0x78 + +# CHECK: slw 2, 3, 4 +0x30 0x20 0x62 0x7c + +# CHECK: slw. 2, 3, 4 +0x31 0x20 0x62 0x7c + +# CHECK: srw 2, 3, 4 +0x30 0x24 0x62 0x7c + +# CHECK: srw. 2, 3, 4 +0x31 0x24 0x62 0x7c + +# CHECK: srawi 2, 3, 4 +0x70 0x26 0x62 0x7c + +# CHECK: srawi. 2, 3, 4 +0x71 0x26 0x62 0x7c + +# CHECK: sraw 2, 3, 4 +0x30 0x26 0x62 0x7c + +# CHECK: sraw. 2, 3, 4 +0x31 0x26 0x62 0x7c + +# CHECK: sld 2, 3, 4 +0x36 0x20 0x62 0x7c + +# CHECK: sld. 2, 3, 4 +0x37 0x20 0x62 0x7c + +# CHECK: srd 2, 3, 4 +0x36 0x24 0x62 0x7c + +# CHECK: srd. 2, 3, 4 +0x37 0x24 0x62 0x7c + +# CHECK: sradi 2, 3, 4 +0x74 0x26 0x62 0x7c + +# CHECK: sradi. 2, 3, 4 +0x75 0x26 0x62 0x7c + +# CHECK: srad 2, 3, 4 +0x34 0x26 0x62 0x7c + +# CHECK: srad. 2, 3, 4 +0x35 0x26 0x62 0x7c + +# CHECK: mtspr 600, 2 +0xa6 0x93 0x58 0x7c + +# CHECK: mfspr 2, 600 +0xa6 0x92 0x58 0x7c + +# CHECK: mtcrf 123, 2 +0x20 0xb1 0x47 0x7c + +# CHECK: mfcr 2 +0x26 0x00 0x40 0x7c + +# CHECK: mtocrf 16, 2 +0x20 0x01 0x51 0x7c + +# CHECK: mfocrf 16, 8 +0x26 0x80 0x10 0x7e + +# CHECK: mtsrin 10, 12 +0xe4 0x61 0x40 0x7d +# CHECK: mfsrin 10, 12 +0x26 0x65 0x40 0x7d diff --git a/test/MC/X86/intel-syntax.s b/test/MC/X86/intel-syntax.s index 30fe6c8b9b15..6fde42bd898d 100644 --- a/test/MC/X86/intel-syntax.s +++ b/test/MC/X86/intel-syntax.s @@ -665,3 +665,17 @@ frstor dword ptr [eax] // CHECK: cmpnless %xmm1, %xmm0 cmpnless xmm0, xmm1 + +insb +insw +insd +// CHECK: insb %dx, %es:(%rdi) +// CHECK: insw %dx, %es:(%rdi) +// CHECK: insl %dx, %es:(%rdi) + +outsb +outsw +outsd +// CHECK: outsb (%rsi), %dx +// CHECK: outsw (%rsi), %dx +// CHECK: outsl (%rsi), %dx diff --git a/test/Object/archive-extract.test b/test/Object/archive-extract.test index a77adf2cabbd..50372d530d88 100644 --- a/test/Object/archive-extract.test +++ b/test/Object/archive-extract.test @@ -53,4 +53,4 @@ RUN: llvm-ar p %p/Inputs/thin.a evenlen | FileCheck %s --check-prefix=EVENLEN EVENLEN: evenlen RUN: not llvm-ar p %p/Inputs/thin-path.a t/test2.o | FileCheck %s --check-prefix=MISSING -MISSING: No such file or directory. +MISSING: {{N|n}}o such file or directory. diff --git a/test/Transforms/GVN/pr24397.ll b/test/Transforms/GVN/pr24397.ll new file mode 100644 index 000000000000..db43964e2e4c --- /dev/null +++ b/test/Transforms/GVN/pr24397.ll @@ -0,0 +1,18 @@ +; RUN: opt -basicaa -gvn -disable-output < %s + +target triple = "x86_64-unknown-linux-gnu" + +define i64 @foo(i64** %arrayidx) { +entry: + %p = load i64*, i64** %arrayidx, align 8 + %cmpnull = icmp eq i64* %p, null + br label %BB2 + +entry2: ; No predecessors! 
+ br label %BB2 + +BB2: ; preds = %entry2, %entry + %bc = bitcast i64** %arrayidx to i64* + %load = load i64, i64* %bc, align 8 + ret i64 %load +} diff --git a/test/Transforms/InstCombine/pr24354.ll b/test/Transforms/InstCombine/pr24354.ll new file mode 100644 index 000000000000..3b36fd1b74e3 --- /dev/null +++ b/test/Transforms/InstCombine/pr24354.ll @@ -0,0 +1,33 @@ +; RUN: opt -instcombine -S < %s | FileCheck %s + +; This used to crash opt + +@c = common global i32 0, align 4 +@b = common global i32 0, align 4 +@a = common global i16 0, align 2 +@d = common global i32 0, align 4 + +define void @fn3() { +; CHECK: @fn3 +bb: + %tmp = load i32, i32* @c, align 4 + %tmp1 = icmp eq i32 %tmp, 0 + br i1 %tmp1, label %bb2, label %bb6 + +bb2: ; preds = %bb + %tmp3 = load i32, i32* @b, align 4 + %tmp.i = add nsw i32 255, %tmp3 + %tmp5 = icmp ugt i32 %tmp.i, 254 + br label %bb6 + +bb6: ; preds = %bb, %bb2 + %tmp7 = phi i1 [ true, %bb ], [ %tmp5, %bb2 ] + %tmp8 = zext i1 %tmp7 to i32 + %tmp10 = icmp eq i32 %tmp8, 0 + %tmp12 = load i16, i16* @a, align 2 + %tmp14 = icmp ne i16 %tmp12, 0 + %tmp16 = select i1 %tmp10, i1 false, i1 %tmp14 + %tmp17 = zext i1 %tmp16 to i32 + store i32 %tmp17, i32* @d, align 4 + ret void +} diff --git a/test/Transforms/InstCombine/vector-casts.ll b/test/Transforms/InstCombine/vector-casts.ll index 727eb4ebb4c8..af18b4cfbdd1 100644 --- a/test/Transforms/InstCombine/vector-casts.ll +++ b/test/Transforms/InstCombine/vector-casts.ll @@ -150,3 +150,14 @@ entry: ret <4 x float> undef } +define <8 x i32> @pr24458(<8 x float> %n) { +; CHECK-LABEL: @pr24458 + %notequal_b_load_.i = fcmp une <8 x float> %n, zeroinitializer + %equal_a_load72_.i = fcmp ueq <8 x float> %n, zeroinitializer + %notequal_b_load__to_boolvec.i = sext <8 x i1> %notequal_b_load_.i to <8 x i32> + %equal_a_load72__to_boolvec.i = sext <8 x i1> %equal_a_load72_.i to <8 x i32> + %wrong = or <8 x i32> %notequal_b_load__to_boolvec.i, %equal_a_load72__to_boolvec.i + ret <8 x i32> %wrong +; CHECK-NEXT: ret <8 x i32> <i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1> +} + diff --git a/test/Transforms/InstSimplify/2011-09-05-InsertExtractValue.ll b/test/Transforms/InstSimplify/2011-09-05-InsertExtractValue.ll index 7e391aba3045..441bc1adca7e 100644 --- a/test/Transforms/InstSimplify/2011-09-05-InsertExtractValue.ll +++ b/test/Transforms/InstSimplify/2011-09-05-InsertExtractValue.ll @@ -36,3 +36,13 @@ define i32 @test3(i32 %a, float %b) { ; CHECK-LABEL: @test3( ; CHECK: ret i32 %a } + +define i8 @test4(<8 x i8> %V) { + %add = add <8 x i8> %V, bitcast (double 0x319BEB8FD172E36 to <8 x i8>) + %extract = extractelement <8 x i8> %add, i32 6 + ret i8 %extract +; CHECK-LABEL: @test4( +; CHECK: %[[add:.*]] = add <8 x i8> %V, bitcast (<1 x double> <double 0x319BEB8FD172E36> to <8 x i8>) +; CHECK-NEXT: %[[extract:.*]] = extractelement <8 x i8> %[[add]], i32 6 +; CHECK-NEXT: ret i8 %[[extract]] +} diff --git a/test/Transforms/SROA/basictest.ll b/test/Transforms/SROA/basictest.ll index 7c8955b28fa2..ad2794167a5e 100644 --- a/test/Transforms/SROA/basictest.ll +++ b/test/Transforms/SROA/basictest.ll @@ -1195,20 +1195,24 @@ entry: %a = alloca <{ i1 }>, align 8 %b = alloca <{ i1 }>, align 8 ; CHECK: %[[a:.*]] = alloca i8, align 8 +; CHECK-NEXT: %[[b:.*]] = alloca i8, align 8 %b.i1 = bitcast <{ i1 }>* %b to i1* store i1 %x, i1* %b.i1, align 8 %b.i8 = bitcast <{ i1 }>* %b to i8* %foo = load i8, i8* %b.i8, align 1 -; CHECK-NEXT: %[[ext:.*]] = zext i1 %x to i8 -; CHECK-NEXT: store i8 %[[ext]], i8* %[[a]], align 8 -; CHECK-NEXT: 
{{.*}} = load i8, i8* %[[a]], align 8 +; CHECK-NEXT: %[[b_cast:.*]] = bitcast i8* %[[b]] to i1* +; CHECK-NEXT: store i1 %x, i1* %[[b_cast]], align 8 +; CHECK-NEXT: {{.*}} = load i8, i8* %[[b]], align 8 %a.i8 = bitcast <{ i1 }>* %a to i8* call void @llvm.memcpy.p0i8.p0i8.i32(i8* %a.i8, i8* %b.i8, i32 1, i32 1, i1 false) nounwind %bar = load i8, i8* %a.i8, align 1 %a.i1 = getelementptr inbounds <{ i1 }>, <{ i1 }>* %a, i32 0, i32 0 %baz = load i1, i1* %a.i1, align 1 +; CHECK-NEXT: %[[copy:.*]] = load i8, i8* %[[b]], align 8 +; CHECK-NEXT: store i8 %[[copy]], i8* %[[a]], align 8 +; CHECK-NEXT: {{.*}} = load i8, i8* %[[a]], align 8 ; CHECK-NEXT: %[[a_cast:.*]] = bitcast i8* %[[a]] to i1* ; CHECK-NEXT: {{.*}} = load i1, i1* %[[a_cast]], align 8 diff --git a/test/Transforms/SROA/big-endian.ll b/test/Transforms/SROA/big-endian.ll index b5a04ca8e64a..4de7bfcb898d 100644 --- a/test/Transforms/SROA/big-endian.ll +++ b/test/Transforms/SROA/big-endian.ll @@ -112,3 +112,126 @@ entry: ; CHECK-NEXT: %[[ret:.*]] = zext i56 %[[insert4]] to i64 ; CHECK-NEXT: ret i64 %[[ret]] } + +define i64 @PR14132(i1 %flag) { +; CHECK-LABEL: @PR14132( +; Here we form a PHI-node by promoting the pointer alloca first, and then in +; order to promote the other two allocas, we speculate the load of the +; now-phi-node-pointer. In doing so we end up loading a 64-bit value from an i8 +; alloca. While this is a bit dubious, we were asserting on trying to +; rewrite it. The trick is that the code using the value may carefully take +; steps to only use the not-undef bits, and so we need to at least loosely +; support this. This test is particularly interesting because how we handle +; a load of an i64 from an i8 alloca is dependent on endianness. +entry: + %a = alloca i64, align 8 + %b = alloca i8, align 8 + %ptr = alloca i64*, align 8 +; CHECK-NOT: alloca + + %ptr.cast = bitcast i64** %ptr to i8** + store i64 0, i64* %a + store i8 1, i8* %b + store i64* %a, i64** %ptr + br i1 %flag, label %if.then, label %if.end + +if.then: + store i8* %b, i8** %ptr.cast + br label %if.end +; CHECK-NOT: store +; CHECK: %[[ext:.*]] = zext i8 1 to i64 +; CHECK: %[[shift:.*]] = shl i64 %[[ext]], 56 + +if.end: + %tmp = load i64*, i64** %ptr + %result = load i64, i64* %tmp +; CHECK-NOT: load +; CHECK: %[[result:.*]] = phi i64 [ %[[shift]], %if.then ], [ 0, %entry ] + + ret i64 %result +; CHECK-NEXT: ret i64 %[[result]] +} + +declare void @f(i64 %x, i32 %y) + +define void @test3() { +; CHECK-LABEL: @test3( +; +; This is a test that specifically exercises the big-endian lowering because it +; ends up splitting a 64-bit integer into two smaller integers and has a number +; of tricky aspects (the i24 type) that make that hard. Historically, SROA +; would miscompile this by either dropping a most significant byte or least +; significant byte due to shrinking the [4,8) slice to an i24, or by failing to +; move the bytes around correctly. +; +; The magical number 34494054408 is used because it has bits set in various +; bytes so that it is clear if those bytes fail to be propagated. +; +; If you're debugging this, rather than using the direct magical numbers, run +; the IR through '-sroa -instcombine'. With '-instcombine' these will be +; constant folded, and if the i64 doesn't round-trip correctly, you've found +; a bug! 
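The round-trip can also be checked by hand without running the IR through
-sroa -instcombine; a sketch, assuming a POSIX shell whose printf builtin
handles 64-bit values:

  $ printf '%x\n' 34494054408
  808018008
  # Big-endian i64 memory image: 00 00 00 08  08 01 80 08
  # bytes [0,4) as i32: 0x00000008 = 8
  # bytes [4,8) as i32: 0x08018008 = 134316040
  # i.e. exactly the zext'd constants in the CHECK lines of the body below.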
+; +entry: + %a = alloca { i32, i24 }, align 4 +; CHECK-NOT: alloca + + %tmp0 = bitcast { i32, i24 }* %a to i64* + store i64 34494054408, i64* %tmp0 + %tmp1 = load i64, i64* %tmp0, align 4 + %tmp2 = bitcast { i32, i24 }* %a to i32* + %tmp3 = load i32, i32* %tmp2, align 4 +; CHECK: %[[HI_EXT:.*]] = zext i32 134316040 to i64 +; CHECK: %[[HI_INPUT:.*]] = and i64 undef, -4294967296 +; CHECK: %[[HI_MERGE:.*]] = or i64 %[[HI_INPUT]], %[[HI_EXT]] +; CHECK: %[[LO_EXT:.*]] = zext i32 8 to i64 +; CHECK: %[[LO_SHL:.*]] = shl i64 %[[LO_EXT]], 32 +; CHECK: %[[LO_INPUT:.*]] = and i64 %[[HI_MERGE]], 4294967295 +; CHECK: %[[LO_MERGE:.*]] = or i64 %[[LO_INPUT]], %[[LO_SHL]] + + call void @f(i64 %tmp1, i32 %tmp3) +; CHECK: call void @f(i64 %[[LO_MERGE]], i32 8) + ret void +; CHECK: ret void +} + +define void @test4() { +; CHECK-LABEL: @test4 +; +; Much like @test3, this is specifically testing big-endian management of data. +; Also similarly, it uses constants with particular bits set to help track +; whether values are corrupted, and can be easily evaluated by running through +; -instcombine to see that the i64 round-trips. +; +entry: + %a = alloca { i32, i24 }, align 4 + %a2 = alloca i64, align 4 +; CHECK-NOT: alloca + + store i64 34494054408, i64* %a2 + %tmp0 = bitcast { i32, i24 }* %a to i8* + %tmp1 = bitcast i64* %a2 to i8* + call void @llvm.memcpy.p0i8.p0i8.i64(i8* %tmp0, i8* %tmp1, i64 8, i32 4, i1 false) +; CHECK: %[[LO_SHR:.*]] = lshr i64 34494054408, 32 +; CHECK: %[[LO_START:.*]] = trunc i64 %[[LO_SHR]] to i32 +; CHECK: %[[HI_START:.*]] = trunc i64 34494054408 to i32 + + %tmp2 = bitcast { i32, i24 }* %a to i64* + %tmp3 = load i64, i64* %tmp2, align 4 + %tmp4 = bitcast { i32, i24 }* %a to i32* + %tmp5 = load i32, i32* %tmp4, align 4 +; CHECK: %[[HI_EXT:.*]] = zext i32 %[[HI_START]] to i64 +; CHECK: %[[HI_INPUT:.*]] = and i64 undef, -4294967296 +; CHECK: %[[HI_MERGE:.*]] = or i64 %[[HI_INPUT]], %[[HI_EXT]] +; CHECK: %[[LO_EXT:.*]] = zext i32 %[[LO_START]] to i64 +; CHECK: %[[LO_SHL:.*]] = shl i64 %[[LO_EXT]], 32 +; CHECK: %[[LO_INPUT:.*]] = and i64 %[[HI_MERGE]], 4294967295 +; CHECK: %[[LO_MERGE:.*]] = or i64 %[[LO_INPUT]], %[[LO_SHL]] + + call void @f(i64 %tmp3, i32 %tmp5) +; CHECK: call void @f(i64 %[[LO_MERGE]], i32 %[[LO_START]]) + ret void +; CHECK: ret void +} + +declare void @llvm.memcpy.p0i8.p0i8.i64(i8*, i8*, i64, i32, i1) diff --git a/test/Transforms/SROA/phi-and-select.ll b/test/Transforms/SROA/phi-and-select.ll index e97bd66d052a..fb76548b1d18 100644 --- a/test/Transforms/SROA/phi-and-select.ll +++ b/test/Transforms/SROA/phi-and-select.ll @@ -438,26 +438,26 @@ define i64 @PR14132(i1 %flag) { ; steps to only use the not-undef bits, and so we need to at least loosely ; support this.. 
entry: - %a = alloca i64 - %b = alloca i8 - %ptr = alloca i64* + %a = alloca i64, align 8 + %b = alloca i8, align 8 + %ptr = alloca i64*, align 8 ; CHECK-NOT: alloca %ptr.cast = bitcast i64** %ptr to i8** - store i64 0, i64* %a - store i8 1, i8* %b - store i64* %a, i64** %ptr + store i64 0, i64* %a, align 8 + store i8 1, i8* %b, align 8 + store i64* %a, i64** %ptr, align 8 br i1 %flag, label %if.then, label %if.end if.then: - store i8* %b, i8** %ptr.cast + store i8* %b, i8** %ptr.cast, align 8 br label %if.end ; CHECK-NOT: store ; CHECK: %[[ext:.*]] = zext i8 1 to i64 if.end: - %tmp = load i64*, i64** %ptr - %result = load i64, i64* %tmp + %tmp = load i64*, i64** %ptr, align 8 + %result = load i64, i64* %tmp, align 8 ; CHECK-NOT: load ; CHECK: %[[result:.*]] = phi i64 [ %[[ext]], %if.then ], [ 0, %entry ] diff --git a/test/Transforms/Scalarizer/cache-bug.ll b/test/Transforms/Scalarizer/cache-bug.ll new file mode 100644 index 000000000000..f8c2d100d59a --- /dev/null +++ b/test/Transforms/Scalarizer/cache-bug.ll @@ -0,0 +1,30 @@ +; RUN: opt -scalarizer -S < %s | FileCheck %s +target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128" + + +; Check that vector element 1 is scalarized correctly from a chain of +; insertelement instructions +define void @func(i32 %x) { +; CHECK-LABEL: @func( +; CHECK-NOT: phi i32 [ %x, %entry ], [ %inc.pos.y, %loop ] +; CHECK: phi i32 [ %inc, %entry ], [ %inc.pos.y, %loop ] +; CHECK: ret void +entry: + %vecinit = insertelement <2 x i32> <i32 0, i32 0>, i32 %x, i32 1 + %inc = add i32 %x, 1 + %0 = insertelement <2 x i32> %vecinit, i32 %inc, i32 1 + br label %loop + +loop: + %pos = phi <2 x i32> [ %0, %entry ], [ %new.pos.y, %loop ] + %i = phi i32 [ 0, %entry ], [ %new.i, %loop ] + %pos.y = extractelement <2 x i32> %pos, i32 1 + %inc.pos.y = add i32 %pos.y, 1 + %new.pos.y = insertelement <2 x i32> %pos, i32 %inc.pos.y, i32 1 + %new.i = add i32 %i, 1 + %cmp2 = icmp slt i32 %new.i, 1 + br i1 %cmp2, label %loop, label %exit + +exit: + ret void +} diff --git a/tools/llvm-config/CMakeLists.txt b/tools/llvm-config/CMakeLists.txt index d2ef45a16815..edbd8c950d7b 100644 --- a/tools/llvm-config/CMakeLists.txt +++ b/tools/llvm-config/CMakeLists.txt @@ -3,6 +3,11 @@ set(LLVM_LINK_COMPONENTS support) set(BUILDVARIABLES_SRCPATH ${CMAKE_CURRENT_SOURCE_DIR}/BuildVariables.inc.in) set(BUILDVARIABLES_OBJPATH ${CMAKE_CURRENT_BINARY_DIR}/BuildVariables.inc) +# Add the llvm-config tool. +add_llvm_tool(llvm-config + llvm-config.cpp + ) + # Compute the substitution values for various items. get_property(LLVM_SYSTEM_LIBS_LIST TARGET LLVMSupport PROPERTY LLVM_SYSTEM_LIBS) foreach(l ${LLVM_SYSTEM_LIBS_LIST}) @@ -10,12 +15,15 @@ foreach(l ${LLVM_SYSTEM_LIBS_LIST}) endforeach() string(REPLACE ";" " " SYSTEM_LIBS "${SYSTEM_LIBS}") +# Fetch target specific compile options, e.g. RTTI option +get_property(COMPILE_FLAGS TARGET llvm-config PROPERTY COMPILE_FLAGS) + # Use configure_file to create BuildVariables.inc. 
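The reordering above matters because of the LLVM_CXXFLAGS line just below:
once add_llvm_tool has created the llvm-config target, its COMPILE_FLAGS (for
example a no-RTTI flag) can be folded into the flags llvm-config reports to
clients. A sketch of the observable effect; the include path and the exact
flag set shown here are illustrative, not taken from this patch:

  $ bin/llvm-config --cxxflags
  -I/usr/local/include -std=c++11 -fno-rtti -O3 -DNDEBUG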
set(LLVM_SRC_ROOT ${LLVM_MAIN_SRC_DIR}) set(LLVM_OBJ_ROOT ${LLVM_BINARY_DIR}) set(LLVM_CPPFLAGS "${CMAKE_CPP_FLAGS} ${CMAKE_CPP_FLAGS_${uppercase_CMAKE_BUILD_TYPE}} ${LLVM_DEFINITIONS}") set(LLVM_CFLAGS "${CMAKE_C_FLAGS} ${CMAKE_C_FLAGS_${uppercase_CMAKE_BUILD_TYPE}} ${LLVM_DEFINITIONS}") -set(LLVM_CXXFLAGS "${CMAKE_CXX_FLAGS} ${CMAKE_CXX_FLAGS_${uppercase_CMAKE_BUILD_TYPE}} ${LLVM_DEFINITIONS}") +set(LLVM_CXXFLAGS "${CMAKE_CXX_FLAGS} ${CMAKE_CXX_FLAGS_${uppercase_CMAKE_BUILD_TYPE}} ${COMPILE_FLAGS} ${LLVM_DEFINITIONS}") # Use the C++ link flags, since they should be a superset of C link flags. set(LLVM_LDFLAGS "${CMAKE_CXX_LINK_FLAGS}") set(LLVM_BUILDMODE ${CMAKE_BUILD_TYPE}) @@ -26,11 +34,6 @@ configure_file(${BUILDVARIABLES_SRCPATH} ${BUILDVARIABLES_OBJPATH} @ONLY) # Set build-time environment(s). add_definitions(-DCMAKE_CFG_INTDIR="${CMAKE_CFG_INTDIR}") -# Add the llvm-config tool. -add_llvm_tool(llvm-config - llvm-config.cpp - ) - # Add the dependency on the generation step. add_file_dependencies(${CMAKE_CURRENT_SOURCE_DIR}/llvm-config.cpp ${BUILDVARIABLES_OBJPATH}) diff --git a/unittests/ExecutionEngine/MCJIT/MCJITCAPITest.cpp b/unittests/ExecutionEngine/MCJIT/MCJITCAPITest.cpp index c7d4dd757cf8..85cabdbd41a4 100644 --- a/unittests/ExecutionEngine/MCJIT/MCJITCAPITest.cpp +++ b/unittests/ExecutionEngine/MCJIT/MCJITCAPITest.cpp @@ -339,14 +339,11 @@ TEST_F(MCJITCAPITest, simple_function) { buildMCJITOptions(); buildMCJITEngine(); buildAndRunPasses(); - - union { - void *raw; - int (*usable)(); - } functionPointer; - functionPointer.raw = LLVMGetPointerToGlobal(Engine, Function); - - EXPECT_EQ(42, functionPointer.usable()); + + auto *functionPointer = reinterpret_cast<int (*)()>( + reinterpret_cast<uintptr_t>(LLVMGetPointerToGlobal(Engine, Function))); + + EXPECT_EQ(42, functionPointer()); } TEST_F(MCJITCAPITest, gva) { @@ -389,14 +386,11 @@ TEST_F(MCJITCAPITest, custom_memory_manager) { useRoundTripSectionMemoryManager(); buildMCJITEngine(); buildAndRunPasses(); - - union { - void *raw; - int (*usable)(); - } functionPointer; - functionPointer.raw = LLVMGetPointerToGlobal(Engine, Function); - - EXPECT_EQ(42, functionPointer.usable()); + + auto *functionPointer = reinterpret_cast<int (*)()>( + reinterpret_cast<uintptr_t>(LLVMGetPointerToGlobal(Engine, Function))); + + EXPECT_EQ(42, functionPointer()); EXPECT_TRUE(didCallAllocateCodeSection); } @@ -412,14 +406,11 @@ TEST_F(MCJITCAPITest, stackmap_creates_compact_unwind_on_darwin) { useRoundTripSectionMemoryManager(); buildMCJITEngine(); buildAndRunOptPasses(); - - union { - void *raw; - int (*usable)(); - } functionPointer; - functionPointer.raw = LLVMGetPointerToGlobal(Engine, Function); - - EXPECT_EQ(42, functionPointer.usable()); + + auto *functionPointer = reinterpret_cast<int (*)()>( + reinterpret_cast<uintptr_t>(LLVMGetPointerToGlobal(Engine, Function))); + + EXPECT_EQ(42, functionPointer()); EXPECT_TRUE(didCallAllocateCodeSection); // Up to this point, the test is specific only to X86-64. 
But this next @@ -446,21 +437,15 @@ TEST_F(MCJITCAPITest, reserve_allocation_space) { Options.MCJMM = wrap(MM); buildMCJITEngine(); buildAndRunPasses(); - - union { - void *raw; - int (*usable)(); - } GetGlobalFct; - GetGlobalFct.raw = LLVMGetPointerToGlobal(Engine, Function); - - union { - void *raw; - void (*usable)(int); - } SetGlobalFct; - SetGlobalFct.raw = LLVMGetPointerToGlobal(Engine, Function2); - - SetGlobalFct.usable(789); - EXPECT_EQ(789, GetGlobalFct.usable()); + + auto GetGlobalFct = reinterpret_cast<int (*)()>( + reinterpret_cast<uintptr_t>(LLVMGetPointerToGlobal(Engine, Function))); + + auto SetGlobalFct = reinterpret_cast<void (*)(int)>( + reinterpret_cast<uintptr_t>(LLVMGetPointerToGlobal(Engine, Function2))); + + SetGlobalFct(789); + EXPECT_EQ(789, GetGlobalFct()); EXPECT_LE(MM->UsedCodeSize, MM->ReservedCodeSize); EXPECT_LE(MM->UsedDataSizeRO, MM->ReservedDataSizeRO); EXPECT_LE(MM->UsedDataSizeRW, MM->ReservedDataSizeRW); @@ -478,13 +463,47 @@ TEST_F(MCJITCAPITest, yield) { LLVMContextSetYieldCallback(C, yield, nullptr); buildAndRunPasses(); - union { - void *raw; - int (*usable)(); - } functionPointer; - functionPointer.raw = LLVMGetPointerToGlobal(Engine, Function); + auto *functionPointer = reinterpret_cast<int (*)()>( + reinterpret_cast<uintptr_t>(LLVMGetPointerToGlobal(Engine, Function))); - EXPECT_EQ(42, functionPointer.usable()); + EXPECT_EQ(42, functionPointer()); EXPECT_TRUE(didCallYield); } +static int localTestFunc() { + return 42; +} + +TEST_F(MCJITCAPITest, addGlobalMapping) { + SKIP_UNSUPPORTED_PLATFORM; + + Module = LLVMModuleCreateWithName("testModule"); + LLVMSetTarget(Module, HostTriple.c_str()); + LLVMTypeRef FunctionType = LLVMFunctionType(LLVMInt32Type(), NULL, 0, 0); + LLVMValueRef MappedFn = LLVMAddFunction(Module, "mapped_fn", FunctionType); + + Function = LLVMAddFunction(Module, "test_fn", FunctionType); + LLVMBasicBlockRef Entry = LLVMAppendBasicBlock(Function, ""); + LLVMBuilderRef Builder = LLVMCreateBuilder(); + LLVMPositionBuilderAtEnd(Builder, Entry); + LLVMValueRef RetVal = LLVMBuildCall(Builder, MappedFn, NULL, 0, ""); + LLVMBuildRet(Builder, RetVal); + LLVMDisposeBuilder(Builder); + + LLVMVerifyModule(Module, LLVMAbortProcessAction, &Error); + LLVMDisposeMessage(Error); + + buildMCJITOptions(); + buildMCJITEngine(); + + LLVMAddGlobalMapping( + Engine, MappedFn, + reinterpret_cast<void *>(reinterpret_cast<uintptr_t>(&localTestFunc))); + + buildAndRunPasses(); + + uint64_t raw = LLVMGetFunctionAddress(Engine, "test_fn"); + int (*usable)() = (int (*)()) raw; + + EXPECT_EQ(42, usable()); +} diff --git a/utils/emacs/README b/utils/emacs/README index e83eeae4b070..d0f2bfbbb3e5 100644 --- a/utils/emacs/README +++ b/utils/emacs/README @@ -23,5 +23,5 @@ are: Note: If you notice missing or incorrect syntax highlighting, please contact -<llvmbugs [at] cs.uiuc.edu>; if you wish to provide a patch to improve the +<llvm-bugs [at] lists.llvm.org>; if you wish to provide a patch to improve the functionality, it will be most appreciated. Thank you. diff --git a/utils/jedit/README b/utils/jedit/README index 6a6c8c76cc32..00db9ecb16a5 100644 --- a/utils/jedit/README +++ b/utils/jedit/README @@ -10,5 +10,5 @@ These are syntax highlighting files for the jEdit editor. 
Included are: <MODE NAME="tablegen" FILE="tablegen.xml" FILE_NAME_GLOB="*.td" /> Note: If you notice missing or incorrect syntax highlighting, please contact -<llvmbugs [at] cs.uiuc.edu>; if you wish to provide a patch to improve the +<llvm-bugs [at] lists.llvm.org>; if you wish to provide a patch to improve the functionality, it will be most appreciated. Thank you. diff --git a/utils/kate/README b/utils/kate/README index efe53b7e237e..46657e5d676a 100644 --- a/utils/kate/README +++ b/utils/kate/README @@ -8,5 +8,5 @@ These are syntax highlighting files for the Kate editor. Included are: this file to ~/.kde/share/apps/katepart/syntax (or better yet, symlink it). Note: If you notice missing or incorrect syntax highlighting, please contact -<llvmbugs [at] cs.uiuc.edu>; if you wish to provide a patch to improve the +<llvm-bugs [at] lists.llvm.org>; if you wish to provide a patch to improve the functionality, it will be most appreciated. Thank you. diff --git a/utils/release/export.sh b/utils/release/export.sh index 9aee306f9664..2fd4206b7406 100755 --- a/utils/release/export.sh +++ b/utils/release/export.sh @@ -14,7 +14,7 @@ set -e -projects="llvm cfe test-suite compiler-rt libcxx libcxxabi clang-tools-extra polly lldb lld openmp" +projects="llvm cfe test-suite compiler-rt libcxx libcxxabi clang-tools-extra polly lldb lld openmp libunwind" base_url="https://llvm.org/svn/llvm-project" release="" diff --git a/utils/release/tag.sh b/utils/release/tag.sh index 8d6cd5f2f632..caefc7f7b71b 100755 --- a/utils/release/tag.sh +++ b/utils/release/tag.sh @@ -17,7 +17,7 @@ set -e release="" rc="" rebranch="no" -projects="llvm cfe test-suite compiler-rt libcxx libcxxabi clang-tools-extra polly lldb lld openmp" +projects="llvm cfe test-suite compiler-rt libcxx libcxxabi clang-tools-extra polly lldb lld openmp libunwind" dryrun="" revision="HEAD" diff --git a/utils/release/test-release.sh b/utils/release/test-release.sh index ee87b3f1987f..568c097badd3 100755 --- a/utils/release/test-release.sh +++ b/utils/release/test-release.sh @@ -18,8 +18,6 @@ else MAKE=make fi -projects="llvm cfe compiler-rt libcxx libcxxabi test-suite clang-tools-extra" - # Base SVN URL for the sources. Base_url="http://llvm.org/svn/llvm-project" @@ -29,12 +27,17 @@ RC="" Triple="" use_gzip="no" do_checkout="yes" -do_64bit="yes" do_debug="no" do_asserts="no" do_compare="yes" +do_rt="yes" +do_libs="yes" +do_libunwind="yes" +do_test_suite="yes" +do_openmp="no" BuildDir="`pwd`" -BuildTriple="" +use_autoconf="no" +ExtraConfigureFlags="" function usage() { echo "usage: `basename $0` -release X.Y.Z -rc NUM [OPTIONS]" @@ -46,15 +49,24 @@ function usage() { echo " -j NUM Number of compile jobs to run. [default: 3]" echo " -build-dir DIR Directory to perform testing in. [default: pwd]" echo " -no-checkout Don't checkout the sources from SVN." - echo " -no-64bit Don't test the 64-bit version. [default: yes]" echo " -test-debug Test the debug build. [default: no]" echo " -test-asserts Test with asserts on. [default: no]" echo " -no-compare-files Don't test that phase 2 and 3 files are identical." echo " -use-gzip Use gzip instead of xz." - echo " -build-triple TRIPLE The build triple for this machine" - echo " [default: use config.guess]" + echo " -configure-flags FLAGS Extra flags to pass to the configure step." 
+ echo " -use-autoconf Use autoconf instead of cmake" + echo " -no-rt Disable check-out & build Compiler-RT" + echo " -no-libs Disable check-out & build libcxx/libcxxabi/libunwind" + echo " -no-libunwind Disable check-out & build libunwind" + echo " -no-test-suite Disable check-out & build test-suite" + echo " -openmp Check out and build the OpenMP run-time (experimental)" } +if [ `uname -s` = "Darwin" ]; then + # compiler-rt doesn't yet build with CMake on Darwin. + use_autoconf="yes" +fi + while [ $# -gt 0 ]; do case $1 in -release | --release ) @@ -73,9 +85,9 @@ while [ $# -gt 0 ]; do shift Triple="$1" ;; - -build-triple | --build-triple ) + -configure-flags | --configure-flags ) shift - BuildTriple="$1" + ExtraConfigureFlags="$1" ;; -j* ) NumJobs="`echo $1 | sed -e 's,-j\([0-9]*\),\1,g'`" @@ -91,9 +103,6 @@ while [ $# -gt 0 ]; do -no-checkout | --no-checkout ) do_checkout="no" ;; - -no-64bit | --no-64bit ) - do_64bit="no" - ;; -test-debug | --test-debug ) do_debug="yes" ;; @@ -106,6 +115,24 @@ while [ $# -gt 0 ]; do -use-gzip | --use-gzip ) use_gzip="yes" ;; + -use-autoconf | --use-autoconf ) + use_autoconf="yes" + ;; + -no-rt ) + do_rt="no" + ;; + -no-libs ) + do_libs="no" + ;; + -no-libunwind ) + do_libunwind="no" + ;; + -no-test-suite ) + do_test_suite="no" + ;; + -openmp ) + do_openmp="yes" + ;; -help | --help | -h | --h | -\? ) usage exit 0 @@ -147,6 +174,24 @@ if [ -z "$NumJobs" ]; then NumJobs=3 fi +# Projects list +projects="llvm cfe clang-tools-extra" +if [ $do_rt = "yes" ]; then + projects="$projects compiler-rt" +fi +if [ $do_libs = "yes" ]; then + projects="$projects libcxx libcxxabi" + if [ $do_libunwind = "yes" ]; then + projects="$projects libunwind" + fi +fi +if [ $do_test_suite = "yes" ]; then + projects="$projects test-suite" +fi +if [ $do_openmp = "yes" ]; then + projects="$projects openmp" +fi + # Go to the build directory (may be different from CWD) BuildDir=$BuildDir/$RC mkdir -p $BuildDir @@ -163,6 +208,16 @@ if [ $RC != "final" ]; then fi Package=$Package-$Triple +# Errors to be highlighted at the end are written to this file. +echo -n > $LogDir/deferred_errors.log + +function deferred_error() { + Phase="$1" + Flavor="$2" + Msg="$3" + echo "[${Flavor} Phase${Phase}] ${Msg}" | tee -a $LogDir/deferred_errors.log +} + # Make sure that a required program is available function check_program_exists() { local program="$1" @@ -195,6 +250,10 @@ function export_sources() { check_valid_urls for proj in $projects ; do + if [ -d $proj.src ]; then + echo "# Reusing $proj $Release-$RC sources" + continue + fi echo "# Exporting $proj $Release-$RC sources" if ! svn export -q $Base_url/$proj/tags/RELEASE_$Release_no_dot/$RC $proj.src ; then echo "error: failed to export $proj project" @@ -208,22 +267,26 @@ function export_sources() { ln -s ../../cfe.src clang fi cd $BuildDir/llvm.src/tools/clang/tools - if [ ! -h clang-tools-extra ]; then + if [ ! -h extra ]; then ln -s ../../../../clang-tools-extra.src extra fi cd $BuildDir/llvm.src/projects - if [ ! -h test-suite ]; then + if [ -d $BuildDir/test-suite.src ] && [ ! -h test-suite ]; then ln -s ../../test-suite.src test-suite fi - if [ ! -h compiler-rt ]; then + if [ -d $BuildDir/compiler-rt.src ] && [ ! -h compiler-rt ]; then ln -s ../../compiler-rt.src compiler-rt fi - if [ ! -h libcxx ]; then + if [ -d $BuildDir/libcxx.src ] && [ ! -h libcxx ]; then ln -s ../../libcxx.src libcxx fi - if [ ! -h libcxxabi ]; then + if [ -d $BuildDir/libcxxabi.src ] && [ ! 
-h libcxxabi ]; then ln -s ../../libcxxabi.src libcxxabi fi + if [ -d $BuildDir/libunwind.src ] && [ ! -h libunwind ]; then + ln -s ../../libunwind.src libunwind + fi + cd $BuildDir } @@ -233,17 +296,20 @@ function configure_llvmCore() { ObjDir="$3" case $Flavor in - Release | Release-64 ) - Optimized="yes" - Assertions="no" + Release ) + BuildType="Release" + Assertions="OFF" + ConfigureFlags="--enable-optimized --disable-assertions" ;; Release+Asserts ) - Optimized="yes" - Assertions="yes" + BuildType="Release" + Assertions="ON" + ConfigureFlags="--enable-optimized --enable-assertions" ;; Debug ) - Optimized="no" - Assertions="yes" + BuildType="Debug" + Assertions="ON" + ConfigureFlags="--disable-optimized --enable-assertions" ;; * ) echo "# Invalid flavor '$Flavor'" @@ -255,22 +321,33 @@ function configure_llvmCore() { echo "# Using C compiler: $c_compiler" echo "# Using C++ compiler: $cxx_compiler" - build_triple_option="${BuildTriple:+--build=$BuildTriple}" - cd $ObjDir echo "# Configuring llvm $Release-$RC $Flavor" - echo "# $BuildDir/llvm.src/configure \ - --enable-optimized=$Optimized \ - --enable-assertions=$Assertions \ - --disable-timestamps \ - $build_triple_option" - env CC="$c_compiler" CXX="$cxx_compiler" \ - $BuildDir/llvm.src/configure \ - --enable-optimized=$Optimized \ - --enable-assertions=$Assertions \ - --disable-timestamps \ - $build_triple_option \ - 2>&1 | tee $LogDir/llvm.configure-Phase$Phase-$Flavor.log + + if [ "$use_autoconf" = "yes" ]; then + echo "#" env CC="$c_compiler" CXX="$cxx_compiler" \ + $BuildDir/llvm.src/configure \ + $ConfigureFlags --disable-timestamps $ExtraConfigureFlags \ + 2>&1 | tee $LogDir/llvm.configure-Phase$Phase-$Flavor.log + env CC="$c_compiler" CXX="$cxx_compiler" \ + $BuildDir/llvm.src/configure \ + $ConfigureFlags --disable-timestamps $ExtraConfigureFlags \ + 2>&1 | tee $LogDir/llvm.configure-Phase$Phase-$Flavor.log + else + echo "#" env CC="$c_compiler" CXX="$cxx_compiler" \ + cmake -G "Unix Makefiles" \ + -DCMAKE_BUILD_TYPE=$BuildType -DLLVM_ENABLE_ASSERTIONS=$Assertions \ + -DLLVM_ENABLE_TIMESTAMPS=OFF -DLLVM_CONFIGTIME="(timestamp not enabled)" \ + $ExtraConfigureFlags $BuildDir/llvm.src \ + 2>&1 | tee $LogDir/llvm.configure-Phase$Phase-$Flavor.log + env CC="$c_compiler" CXX="$cxx_compiler" \ + cmake -G "Unix Makefiles" \ + -DCMAKE_BUILD_TYPE=$BuildType -DLLVM_ENABLE_ASSERTIONS=$Assertions \ + -DLLVM_ENABLE_TIMESTAMPS=OFF -DLLVM_CONFIGTIME="(timestamp not enabled)" \ + $ExtraConfigureFlags $BuildDir/llvm.src \ + 2>&1 | tee $LogDir/llvm.configure-Phase$Phase-$Flavor.log + fi + cd $BuildDir } @@ -279,16 +356,11 @@ function build_llvmCore() { Flavor="$2" ObjDir="$3" DestDir="$4" - ExtraOpts="" - - if [ "$Flavor" = "Release-64" ]; then - ExtraOpts="EXTRA_OPTIONS=-m64" - fi cd $ObjDir echo "# Compiling llvm $Release-$RC $Flavor" - echo "# ${MAKE} -j $NumJobs VERBOSE=1 $ExtraOpts" - ${MAKE} -j $NumJobs VERBOSE=1 $ExtraOpts \ + echo "# ${MAKE} -j $NumJobs VERBOSE=1" + ${MAKE} -j $NumJobs VERBOSE=1 \ 2>&1 | tee $LogDir/llvm.make-Phase$Phase-$Flavor.log echo "# Installing llvm $Release-$RC $Flavor" @@ -305,10 +377,19 @@ function test_llvmCore() { ObjDir="$3" cd $ObjDir - ${MAKE} -k check-all \ - 2>&1 | tee $LogDir/llvm.check-Phase$Phase-$Flavor.log - ${MAKE} -k unittests \ - 2>&1 | tee $LogDir/llvm.unittests-Phase$Phase-$Flavor.log + if ! 
( ${MAKE} -j $NumJobs -k check-all \ + 2>&1 | tee $LogDir/llvm.check-Phase$Phase-$Flavor.log ) ; then + deferred_error $Phase $Flavor "check-all failed" + fi + + if [ "$use_autoconf" = "yes" ]; then + # In the cmake build, unit tests are run as part of check-all. + if ! ( ${MAKE} -k unittests 2>&1 | \ + tee $LogDir/llvm.unittests-Phase$Phase-$Flavor.log ) ; then + deferred_error $Phase $Flavor "unittests failed" + fi + fi + cd $BuildDir } @@ -321,10 +402,12 @@ function clean_RPATH() { local InstallPath="$1" for Candidate in `find $InstallPath/{bin,lib} -type f`; do if file $Candidate | grep ELF | egrep 'executable|shared object' > /dev/null 2>&1 ; then - rpath=`objdump -x $Candidate | grep 'RPATH' | sed -e's/^ *RPATH *//'` - if [ -n "$rpath" ]; then - newrpath=`echo $rpath | sed -e's/.*\(\$ORIGIN[^:]*\).*/\1/'` - chrpath -r $newrpath $Candidate 2>&1 > /dev/null 2>&1 + if rpath=`objdump -x $Candidate | grep 'RPATH'` ; then + rpath=`echo $rpath | sed -e's/^ *RPATH *//'` + if [ -n "$rpath" ]; then + newrpath=`echo $rpath | sed -e's/.*\(\$ORIGIN[^:]*\).*/\1/'` + chrpath -r $newrpath $Candidate 2>&1 > /dev/null 2>&1 + fi fi fi done @@ -334,13 +417,53 @@ function clean_RPATH() { function package_release() { cwd=`pwd` cd $BuildDir/Phase3/Release - mv llvmCore-$Release-$RC.install $Package + mv llvmCore-$Release-$RC.install/usr/local $Package if [ "$use_gzip" = "yes" ]; then tar cfz $BuildDir/$Package.tar.gz $Package else tar cfJ $BuildDir/$Package.tar.xz $Package fi - mv $Package llvmCore-$Release-$RC.install + mv $Package llvmCore-$Release-$RC.install/usr/local + cd $cwd +} + +# Build and package the OpenMP run-time. This is still experimental and not +# meant for official testing in the release, but as a way for providing +# binaries as a convenience to those who want to try it out. 
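With the new opt-in switch, a release tester who wants these experimental
binaries has to ask for them explicitly. A hypothetical invocation, with
placeholder release, triple and job-count values:

  $ ./test-release.sh -release 3.7.0 -rc 1 -triple x86_64-pc-linux-gnu \
      -j 4 -openmp

The trimming options compose the same way, e.g. appending -no-test-suite or
-no-libunwind to shrink the checkout.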
+function build_OpenMP() { + cwd=`pwd` + + rm -rf $BuildDir/Phase3/openmp + rm -rf $BuildDir/Phase3/openmp.install + mkdir -p $BuildDir/Phase3/openmp + cd $BuildDir/Phase3/openmp + clang=$BuildDir/Phase3/Release/llvmCore-$Release-$RC.install/usr/local/bin/clang + + echo "#" cmake -DCMAKE_C_COMPILER=${clang} -DCMAKE_CXX_COMPILER=${clang}++ \ + -DCMAKE_BUILD_TYPE=Release -DLIBOMP_MICRO_TESTS=on \ + $BuildDir/openmp.src/runtime + cmake -DCMAKE_C_COMPILER=${clang} -DCMAKE_CXX_COMPILER=${clang}++ \ + -DCMAKE_BUILD_TYPE=Release -DLIBOMP_MICRO_TESTS=on \ + $BuildDir/openmp.src/runtime + + echo "# Building OpenMP run-time" + echo "# ${MAKE} -j $NumJobs VERBOSE=1" + ${MAKE} -j $NumJobs VERBOSE=1 + echo "# ${MAKE} libomp-micro-tests VERBOSE=1" + ${MAKE} libomp-micro-tests VERBOSE=1 + echo "# ${MAKE} install DESTDIR=$BuildDir/Phase3/openmp.install" + ${MAKE} install DESTDIR=$BuildDir/Phase3/openmp.install + + OpenMPPackage=OpenMP-$Release + if [ $RC != "final" ]; then + OpenMPPackage=$OpenMPPackage-$RC + fi + OpenMPPackage=$OpenMPPackage-$Triple + + mv $BuildDir/Phase3/openmp.install/usr/local $BuildDir/$OpenMPPackage + cd $BuildDir + tar cvfJ $BuildDir/$OpenMPPackage.tar.xz $OpenMPPackage + mv $OpenMPPackage $BuildDir/Phase3/openmp.install/usr/local cd $cwd } @@ -362,9 +485,6 @@ fi if [ "$do_asserts" = "yes" ]; then Flavors="$Flavors Release+Asserts" fi -if [ "$do_64bit" = "yes" ]; then - Flavors="$Flavors Release-64" -fi for Flavor in $Flavors ; do echo "" @@ -379,7 +499,6 @@ for Flavor in $Flavors ; do c_compiler="$CC" cxx_compiler="$CXX" - llvmCore_phase1_objdir=$BuildDir/Phase1/$Flavor/llvmCore-$Release-$RC.obj llvmCore_phase1_destdir=$BuildDir/Phase1/$Flavor/llvmCore-$Release-$RC.install @@ -446,14 +565,23 @@ for Flavor in $Flavors ; do if [ "$do_compare" = "yes" ]; then echo echo "# Comparing Phase 2 and Phase 3 files" - for o in `find $llvmCore_phase2_objdir -name '*.o'` ; do - p3=`echo $o | sed -e 's,Phase2,Phase3,'` - if ! cmp --ignore-initial=16 $o $p3 > /dev/null 2>&1 ; then - echo "file `basename $o` differs between phase 2 and phase 3" + for p2 in `find $llvmCore_phase2_objdir -name '*.o'` ; do + p3=`echo $p2 | sed -e 's,Phase2,Phase3,'` + # Substitute 'Phase2' for 'Phase3' in the Phase 2 object file in + # case there are build paths in the debug info. On some systems, + # sed adds a newline to the output, so pass $p3 through sed too. + if ! cmp -s <(sed -e 's,Phase2,Phase3,g' $p2) <(sed -e '' $p3) \ + 16 16 ; then + echo "file `basename $p2` differs between phase 2 and phase 3" fi done fi done + +if [ $do_openmp = "yes" ]; then + build_OpenMP +fi + ) 2>&1 | tee $LogDir/testing.$Release-$RC.log package_release @@ -468,4 +596,13 @@ else echo "### Package: $Package.tar.xz" fi echo "### Logs: $LogDir" + +echo "### Errors:" +if [ -s "$LogDir/deferred_errors.log" ]; then + cat "$LogDir/deferred_errors.log" + exit 1 +else + echo "None." +fi + exit 0 diff --git a/utils/unittest/googletest/README.LLVM b/utils/unittest/googletest/README.LLVM index 16bfffd8d5e5..5f7fffe26a43 100644 --- a/utils/unittest/googletest/README.LLVM +++ b/utils/unittest/googletest/README.LLVM @@ -20,4 +20,4 @@ Modified as follows: * To GTestStreamToHelper in include/gtest/internal/gtest-internal.h, added the ability to stream with raw_os_ostream. 
* To refresh Haiku support in include/gtest/internal/gtest-port.h, - see http://lists.cs.uiuc.edu/pipermail/llvm-commits/Week-of-Mon-20100621/102898.html + see http://lists.llvm.org/pipermail/llvm-commits/Week-of-Mon-20100621/102898.html diff --git a/utils/vim/README b/utils/vim/README index 1fe00992b0ab..8aea87924eb2 100644 --- a/utils/vim/README +++ b/utils/vim/README @@ -9,7 +9,7 @@ To install copy all subdirectories to your $HOME/.vim or if you prefer create symlinks to the files here. Do not copy the vimrc file here it is only meant as an inspiration and starting point for those working on llvm c++ code. Note: If you notice missing or incorrect syntax highlighting, please contact -<llvmbugs [at] cs.uiuc.edu>; if you wish to provide a patch to improve the +<llvm-bugs [at] lists.llvm.org>; if you wish to provide a patch to improve the functionality, it will be most appreciated. Thank you. If you find yourself working with LLVM Makefiles often, but you don't get syntax |